diff --git a/prov/psm3/Makefile.am b/prov/psm3/Makefile.am index a6d3fbc68ed..cec9bddede3 100644 --- a/prov/psm3/Makefile.am +++ b/prov/psm3/Makefile.am @@ -30,9 +30,9 @@ ACLOCAL_AMFLAGS = -I config AM_CFLAGS = -Wall if HAVE_LD_VERSION_SCRIPT - libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map + libpsm3_fi_version_script = -Wl,--version-script=$(builddir)/libpsm3-fi.map else !HAVE_LD_VERSION_SCRIPT - libpsm3_fi_version_script = + libpsm3_fi_version_script = endif !HAVE_LD_VERSION_SCRIPT # rdmaincludedir = $(includedir)/rdma @@ -51,6 +51,8 @@ common_srcs = \ shared/hmem_neuron.c \ shared/hmem_synapseai.c \ shared/hmem_ipc_cache.c \ + shared/xpmem.c \ + shared/xpmem_cache.c \ shared/common.c \ shared/enosys.c \ shared/rbtree.c \ @@ -78,13 +80,22 @@ common_srcs = \ util/src/util_ns.c \ util/src/util_pep.c \ util/src/util_poll.c \ + util/src/util_profile.c \ + util/src/util_srx.c \ util/src/util_wait.c \ util/src/rxm_av.c \ util/src/cuda_mem_monitor.c \ util/src/cuda_ipc_monitor.c \ util/src/rocr_mem_monitor.c \ util/src/rocr_ipc_monitor.c \ - util/src/ze_mem_monitor.c + util/src/ze_mem_monitor.c \ + util/src/xpmem_monitor.c \ + shared/fabric.c \ + shared/fi_tostr.c \ + shared/perf.c \ + shared/log.c \ + shared/var.c \ + shared/abi_1_0.c if MACOS common_srcs += shared/osx/osd.c @@ -103,9 +114,7 @@ if LINUX common_srcs += shared/unix/osd.c common_srcs += shared/linux/osd.c if HAVE_LINUX_PERF_RDPMC -if !HAVE_PSM3_SRC -common_srcs += shared/linux/rdpmc.c #seems to be a copy of psm3/psm_perf.c -endif +common_srcs += shared/linux/rdpmc.c endif common_srcs += inc/linux/rdpmc.h common_srcs += inc/linux/osd.h @@ -120,6 +129,8 @@ bin_SCRIPTS = nodist_src_libpsm3_fi_la_SOURCES = src_libpsm3_fi_la_SOURCES = \ inc/ofi_hmem.h \ + inc/ofi_cma.h \ + inc/ofi_xpmem.h \ inc/ofi.h \ inc/ofi_abi.h \ inc/ofi_atom.h \ @@ -137,7 +148,7 @@ src_libpsm3_fi_la_SOURCES = \ inc/ofi_proto.h \ inc/ofi_recvwin.h \ inc/ofi_rbuf.h \ - inc/ofi_shm.h \ + inc/ofi_shm_p2p.h \ inc/ofi_signal.h \ inc/ofi_epoll.h \ inc/ofi_tree.h \ @@ -148,10 +159,12 @@ src_libpsm3_fi_la_SOURCES = \ inc/ofi_net.h \ inc/ofi_perf.h \ inc/ofi_coll.h \ + inc/ofi_mb.h \ inc/fasthash.h \ inc/rbtree.h \ inc/uthash.h \ inc/ofi_prov.h \ + inc/ofi_profile.h \ inc/rdma/providers/fi_log.h \ inc/rdma/providers/fi_prov.h \ inc/rdma/providers/fi_peer.h \ @@ -167,6 +180,7 @@ src_libpsm3_fi_la_SOURCES = \ inc/rdma/fi_errno.h \ inc/rdma/fi_tagged.h \ inc/rdma/fi_trigger.h \ + inc/rdma/fi_profile.h \ src/psmx3.h \ src/psmx3_am.c \ src/psmx3_atomic.c \ @@ -216,7 +230,7 @@ src_libpsm3_fi_la_LDFLAGS += -lpsm2 endif !HAVE_PSM3_SRC if !EMBEDDED -src_libpsm3_fi_la_LDFLAGS += -version-info 22:0:21 +src_libpsm3_fi_la_LDFLAGS += -version-info 24:0:23 endif prov_install_man_pages = man/man7/fi_psm3.7 @@ -249,8 +263,8 @@ src/psm3_src_chksum.h: Makefile $(chksum_srcs) nroff: @for file in $(prov_install_man_pages); do \ - source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ - perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ + source=`echo $$file | sed -e 's@/man[0-9]@@'`; \ + perl $(top_srcdir)/config/md2nroff.pl --source=$(top_srcdir)/$$source.md; \ done dist-hook: libpsm3-fi.spec diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 4716706b0e0..47424fc2caf 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -220,6 +220,8 @@ prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm_mq_recv.c \ prov/psm3/psm3/psm_mq_utils.c \ prov/psm3/psm3/psm_netutils.h \ + 
prov/psm3/psm3/psm_nic_select.c \ + prov/psm3/psm3/psm_nic_select.h \ prov/psm3/psm3/psm_oneapi_ze.c \ prov/psm3/psm3/psm_perf.c \ prov/psm3/psm3/psm_perf.h \ diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index ef63cfba3ce..144229f3d51 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_5_1_1 +3_6_0_1 diff --git a/prov/psm3/configure.ac b/prov/psm3/configure.ac index b680fddcdb9..a985fc05b85 100644 --- a/prov/psm3/configure.ac +++ b/prov/psm3/configure.ac @@ -58,7 +58,7 @@ AC_DEFINE_UNQUOTED([BUILD_ID],["$with_build_id"], # Override autoconf default CFLAG settings (e.g. "-g -O2") while still # allowing the user to explicitly set CFLAGS="" -: ${CFLAGS="-fvisibility=hidden ${base_c_warn_flags}"} +: ${CFLAGS="${base_c_warn_flags}"} # AM_PROG_AS would set CFLAGS="-g -O2" by default if not set already so it # should not be called earlier @@ -242,6 +242,35 @@ AS_IF([test x"$enable_debug" != x"no"], AC_DEFINE_UNQUOTED([ENABLE_DEBUG],[$dbg], [defined to 1 if libfabric was configured with --enable-debug, 0 otherwise]) +AC_ARG_ENABLE([profile], + [AS_HELP_STRING([--enable-profile], + [Enable profiling @<:@default=no@:>@])], + [], + [enable_profile=no]) + +AS_IF([test x"$enable_profile" != x"no"], + [AC_DEFINE([HAVE_FABRIC_PROFILE], [1], + [defined to 1 if libfabric was configured with --enable-profile, 0 otherwise]) +]) + +AC_DEFUN([FI_ARG_ENABLE_SANITIZER],[ + AC_ARG_ENABLE([$1], + [AS_HELP_STRING([--enable-$1], + [Enable $3Sanitizer @<:@default=no@:>@]) + ], + [], + [enable_$1=no]) + AS_IF([test x"$enable_$1" != x"no"], + [CFLAGS="-fsanitize=$2 $CFLAGS"]) +]) + +m4_map([FI_ARG_ENABLE_SANITIZER],[ + [asan, address, Address], + [lsan, leak, Leak], + [tsan, thread, Thread], + [ubsan, undefined, UndefinedBehavior] +]) + dnl Checks for header files. 
AC_HEADER_STDC @@ -463,7 +492,9 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[__asm__(".symver main_, main@ABIVER_1.0");]], ]) dnl AS_IF icc_symver_hack -AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], +dnl Disable in psm3 to include all symbols without symver +dnl AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [$ac_asm_symver_support], +AC_DEFINE_UNQUOTED([HAVE_SYMVER_SUPPORT], [0], [Define to 1 if compiler/linker support symbol versioning.]) AC_MSG_CHECKING(for __alias__ attribute support) @@ -478,8 +509,9 @@ AC_LINK_IFELSE([AC_LANG_PROGRAM([[ AC_MSG_RESULT(no) ac_prog_cc_alias_symbols=0 ]) - -AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], +dnl Disable in psm3 to include all symbols without symver +dnl AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [$ac_prog_cc_alias_symbols], +AC_DEFINE_UNQUOTED([HAVE_ALIAS_ATTRIBUTE], [0], [Define to 1 if the linker supports alias attribute.]) AC_CHECK_FUNCS([getifaddrs]) @@ -772,6 +804,37 @@ AS_IF([test "x$enable_psm3_umr_cache" != "xno"], ]) ]) +dnl ------------- hwloc +AC_ARG_ENABLE([psm3-hwloc], + [AS_HELP_STRING([--enable-psm3-hwloc], + [Enable PSM3 use of hwloc for NIC affinity selections @<:@default=check@:>@])], + [], [enable_psm3_hwloc=check]) +psm3_hwloc_happy=0 +AS_IF([test "x$enable_psm3_hwloc" != "xno"], + [ + FI_CHECK_PACKAGE([psm3_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_hwloc_found=1], + [psm3_hwloc_found=0]) + AS_IF([test $psm3_hwloc_found -ne 1 && test "x$enable_psm3_hwloc" = "xyes"], + [ + psm3_happy=0 + AC_MSG_ERROR([hwloc support requested but hwloc headers and/or library not found.]) + ]) + AS_IF([test "$psm3_hwloc_found" -eq 1], + [ + psm3_hwloc_happy=1 + CPPFLAGS="$CPPFLAGS $psm3_hwloc_CPPFLAGS -DPSM_USE_HWLOC" + LDFLAGS="$LDFLAGS $psm3_hwloc_LDFLAGS" + LIBS="$LIBS $psm3_hwloc_LIBS" + ]) + ]) + dnl ------------- Driver Modules psm3_rv_happy=0 AC_ARG_WITH([psm3-rv], @@ -852,6 +915,9 @@ AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction s AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], []) AS_IF([test !
-z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], []) +dnl Workaround for including fabric.c +AC_DEFINE([HOOK_NOOP_INIT], NULL, [Ignore HOOK_NOOP_INIT]) +AC_DEFINE([COLL_INIT], NULL, [Ignore COLL_INIT]) dnl Defines not used in PSM3 provider AC_DEFINE([HAVE_DMABUF_PEER_MEM], 0, [Ignore HAVE_DMABUF_PEER_MEM]) AC_DEFINE([HAVE_GDRCOPY], 0, [Ignore HAVE_GDRCOPY]) @@ -862,10 +928,16 @@ AC_DEFINE([HAVE_NEURON], 0, [Ignore HAVE_NEURON]) AC_DEFINE([HAVE_ROCR], 0, [Ignore HAVE_ROCR]) AC_DEFINE([HAVE_SYNAPSEAI], 0, [Ignore HAVE_SYNAPSEAI]) AC_DEFINE([HAVE_UFFD_MONITOR], 0, [Ignore HAVE_UFFD_MONITOR]) +AC_DEFINE([HAVE_XPMEM], 0, [Ignore HAVE_XPMEM]) + dnl Provider-specific checks dnl FI_PROVIDER_INIT +AC_DEFINE([HAVE_BGQ], 0, [Ignore HAVE_BGQ]) +AC_DEFINE([HAVE_BGQ_DL], 0, [Ignore HAVE_BGQ_DL]) AC_DEFINE([HAVE_EFA], 0, [Ignore HAVE_EFA]) AC_DEFINE([HAVE_EFA_DL], 0, [Ignore HAVE_EFA_DL]) +AC_DEFINE([HAVE_GNI], 0, [Ignore HAVE_GNI]) +AC_DEFINE([HAVE_GNI_DL], 0, [Ignore HAVE_GNI_DL]) AC_DEFINE([HAVE_MRAIL], 0, [Ignore HAVE_MRAIL]) AC_DEFINE([HAVE_MRAIL_DL], 0, [Ignore HAVE_MRAIL_DL]) AC_DEFINE([HAVE_NET], 0, [Ignore HAVE_NET]) @@ -878,6 +950,8 @@ AC_DEFINE([HAVE_PSM2_DL], 0, [Ignore HAVE_PSM2_DL]) dnl FI_PROVIDER_SETUP([psm3]) AC_DEFINE([HAVE_OPX], 0, [Ignore HAVE_OPX]) AC_DEFINE([HAVE_OPX_DL], 0, [Ignore HAVE_OPX_DL]) +AC_DEFINE([HAVE_RSTREAM], 0, [Ignore HAVE_RSTREAM]) +AC_DEFINE([HAVE_RSTREAM_DL], 0, [Ignore HAVE_RSTREAM_DL]) AC_DEFINE([HAVE_RXD], 0, [Ignore HAVE_RXD]) AC_DEFINE([HAVE_RXD_DL], 0, [Ignore HAVE_RXD_DL]) AC_DEFINE([HAVE_RXM], 0, [Ignore HAVE_RXM]) @@ -980,6 +1054,9 @@ fi if test $psm3_dsa_happy -eq 1; then afeatures="$afeatures, Intel DSA" fi +if test $psm3_hwloc_happy -eq 1; then + afeatures="$afeatures, hwloc" +fi if test "x$enable_psm3_udp" = "xyes"; then afeatures="$afeatures, UDP" fi diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 6ae917558e8..25aea136db6 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -371,6 +371,28 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ ]) ]) + AS_IF([test "x$enable_psm3_hwloc" != "xno"], + [ + FI_CHECK_PACKAGE([psm3_hwloc], + [hwloc.h], + [hwloc], + [hwloc_topology_init], + [], + [$psm3_PREFIX], + [$psm3_LIBDIR], + [psm3_hwloc_found=1], + [psm3_hwloc_found=0]) + AS_IF([test $psm3_hwloc_found -ne 1 && test "x$enable_psm3_hwloc" == "xyes"], + [ + psm3_happy=0 + AC_MSG_ERROR([hwloc Support requested but hwloc headers and/or library not found.]) + ]) + AS_IF([test "$psm3_hwloc_found" -eq 1], + [ + psm3_CPPFLAGS="$psm3_CPPFLAGS -DPSM_USE_HWLOC" + ]) + ]) + AS_IF([test $psm3_happy -eq 1], [ AC_CONFIG_FILES([prov/psm3/psm3/psm2_hal_inlines_i.h \ prov/psm3/psm3/psm2_hal_inlines_d.h \ @@ -381,9 +403,9 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test $psm3_happy -eq 1], [$1], [$2]) psm3_ARCH_CFLAGS="$PSM3_ARCH_CFLAGS" - psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS" - psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS" - psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS" + psm3_CPPFLAGS="$psm3_CPPFLAGS $psm3_rt_CPPFLAGS $psm3_dl_CPPFLAGS $psm3_numa_CPPFLAGS $psm3_ibv_CPPFLAGS $psm3_uuid_CPPFLAGS $psm3_hwloc_CPPFLAGS" + psm3_LDFLAGS="$psm3_LDFLAGS $psm3_rt_LDFLAGS $psm3_dl_LDFLAGS $psm3_numa_LDFLAGS $psm3_ibv_LDFLAGS $psm3_uuid_LDFLAGS $psm3_hwloc_LDFLAGS" + psm3_LIBS="$psm3_LIBS $psm3_rt_LIBS $psm3_dl_LIBS $psm3_numa_LIBS $psm3_ibv_LIBS $psm3_uuid_LIBS 
$psm3_hwloc_LIBS" AC_SUBST(psm3_CFLAGS) AC_SUBST(psm3_ARCH_CFLAGS) AC_SUBST(psm3_CPPFLAGS) @@ -448,4 +470,9 @@ AC_ARG_ENABLE([psm3-umr-cache], [Enable support for Userspace Memory Region (UMR) Caching @<:@default=check@:>@])], [], [enable_psm3_umr_cache=check]) +AC_ARG_ENABLE([psm3-hwloc], + [AS_HELP_STRING([--enable-psm3-hwloc], + [Enable PSM3 use of hwloc for NIC affinity selections @<:@default=check@:>@])], + [], + [enable_psm3_hwloc=check]) dnl vim: set ts=4 sw=4 tw=0 et : diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 7eaab218a3a..0b1b356686f 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.5.1.1-1) unstable; urgency=medium +libpsm3-fi (11.6.0.0-231) unstable; urgency=medium * Initial release diff --git a/prov/psm3/debian/control b/prov/psm3/debian/control index 40dd0224032..43e38c07d02 100644 --- a/prov/psm3/debian/control +++ b/prov/psm3/debian/control @@ -2,7 +2,7 @@ Source: libpsm3-fi Section: libs Priority: optional Maintainer: https://www.intel.com/content/www/us/en/support.html -Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev +Build-Depends: debhelper (>= 12~), uuid-dev, libnuma-dev, libibverbs-dev, librdmacm-dev, libhwloc-dev Standards-Version: 4.5.1 Rules-Requires-Root: no diff --git a/prov/psm3/libpsm3-fi.spec.in b/prov/psm3/libpsm3-fi.spec.in index a5cbce1be15..b24d4c13a63 100644 --- a/prov/psm3/libpsm3-fi.spec.in +++ b/prov/psm3/libpsm3-fi.spec.in @@ -1,6 +1,8 @@ %{!?configopts: %global configopts LDFLAGS=-Wl,--build-id} %{!?provider: %define provider psm3} %{!?provider_formal: %define provider_formal PSM3} +# Disable setting SOURCE_DATE_EPOCH from changelog +%define source_date_epoch_from_changelog 0 Name: lib%{provider}-fi Version: @VERSION@ @@ -18,6 +20,7 @@ Provides: lib${provider}-fi1 = %{version}-%{release} BuildRequires: libuuid-devel BuildRequires: rdma-core-devel +BuildRequires: hwloc-devel %if 0%{?suse_version} >= 1 BuildRequires: glibc-devel BuildRequires: libnuma-devel diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index 3cd1eff52ff..cc52b8f1868 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -185,6 +185,8 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_mq_recv.c \ psm3/psm_mq_utils.c \ psm3/psm_netutils.h \ + psm3/psm_nic_select.c \ + psm3/psm_nic_select.h \ psm3/psm_oneapi_ze.c \ psm3/psm_perf.c \ psm3/psm_perf.h \ @@ -196,13 +198,13 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_sysbuf.h \ psm3/psm_timer.c \ psm3/psm_timer.h \ + psm3/psm_uffd.c \ + psm3/psm_uffd.h \ psm3/psm_user.h \ psm3/psm_utils.c \ psm3/psm_utils.h \ psm3/psm_verbs_mr.c \ psm3/psm_verbs_mr.h \ - psm3/psm_verbs_umrc.c \ - psm3/psm_verbs_umrc.h \ psm3/psmi_wrappers.c \ psm3/psmi_wrappers.h \ psm3/psm2.h \ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c index 8e095b71315..27b98631508 100755 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -159,11 +159,16 @@ psm3_ep_open_udp_internal(psm2_ep_t ep, int unit, int port, } if (!is_aux) { - psm3_getenv("PSM3_UDP_GSO", - "Enable UDP GSO Segmentation Offload (0 disables GSO)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_gso); - ep->sockets_ep.udp_gso = env_gso.e_int; + psm3_getenv_range("PSM3_UDP_GSO", + "Enable UDP GSO Segmentation Offload", + "(0 disables GSO, 1 sets max chunk to 65536, >1 specifies max chunk)", + PSMI_ENVVAR_LEVEL_USER, 
PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)UINT16_MAX, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT16_MAX, + NULL, NULL, &env_gso); + ep->sockets_ep.udp_gso = env_gso.e_uint; + if (ep->sockets_ep.udp_gso == 1) + ep->sockets_ep.udp_gso = UINT16_MAX; if (ep->sockets_ep.udp_gso) { int gso; socklen_t optlen = sizeof(gso); @@ -553,6 +558,57 @@ psm2_error_t psm3_tune_tcp_socket(const char *sck_name, psm2_ep_t ep, int fd) return PSM2_INTERNAL_ERR; } +/* parse TCP port range for PSM3_TCP_PORT_RANGE + * format is low:high + * low must be <= high and each must be <= UINT16_MAX. + * Either field can be omitted in which case default (input tvals) is used + * for given field. + * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_tcp_port_range(const char *str, + size_t errstr_size, char errstr[], + int tvals[2]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 2, tvals); + if (ret < 0) + return ret; + if (tvals[0] > UINT16_MAX || tvals[1] > UINT16_MAX) { + if (errstr_size) + snprintf(errstr, errstr_size, " Max allowed is %u", UINT16_MAX); + return -2; + } + if (tvals[0] < 0 || tvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if ((tvals[0] == TCP_PORT_AUTODETECT && tvals[1] != TCP_PORT_AUTODETECT) + || (tvals[0] != TCP_PORT_AUTODETECT && tvals[1] == TCP_PORT_AUTODETECT)) { + if (errstr_size) + snprintf(errstr, errstr_size, " low of %d only allowed with high of %d", TCP_PORT_AUTODETECT, TCP_PORT_AUTODETECT); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " low (%d) > high (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_tcp_port_range(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[2] = { ((int*)ptr)[0], ((int*)ptr)[1] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_tcp_port_range(val.e_str, errstr_size, errstr, tvals); +} + static __inline__ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, psm3_sockaddr_in_t *addr, @@ -567,12 +623,16 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, char range_def[32]; snprintf(range_def, sizeof(range_def), "%d:%d", tvals[0], tvals[1]); - if (!psm3_getenv("PSM3_TCP_PORT_RANGE", - "Set the TCP listener port range . The listener will bind to a random port in the range. '0:0'=let OS pick.", + (void)psm3_getenv_range("PSM3_TCP_PORT_RANGE", + "Set the TCP listener port range.", + "The listener will bind to a random port in the range.
'0:0'=let OS pick.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val) range_def, &env_val)) { - /* not using default values */ - (void)psm3_parse_str_tuples(env_val.e_str, 2, tvals); + (union psmi_envvar_val) range_def, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_tcp_port_range, tvals, &env_val); + if (parse_tcp_port_range(env_val.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } _HFI_DBG("PSM3_TCP_PORT_RANGE = %d:%d\n", tvals[0], tvals[1]); @@ -583,17 +643,14 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, start = 0; end = 0; _HFI_DBG("Binding to OS provided port\n"); - } else if (tvals[0] > 0 && tvals[0] <= tvals[1] && tvals[1] <= UINT16_MAX) { + } else { + psmi_assert(tvals[0] > 0); // start with a random port, find the first available one. port = psm3_rand((long int) getpid()); port = port % (tvals[1] + 1 - tvals[0]) + tvals[0]; start = (uint16_t)tvals[0]; end = (uint16_t)tvals[1]; _HFI_DBG("Binding to port in range [%" PRIu16 ":%" PRIu16 "], starting from %ld\n", start, end, port); - } else { - // high < low or only set one - _HFI_ERROR("Invalid TCP port range [%d:%d]\n", tvals[0], tvals[1]); - return PSM2_INTERNAL_ERR; } psm3_getenv("PSM3_TCP_BACKLOG", @@ -637,6 +694,46 @@ psm2_error_t listen_to_port(psm2_ep_t ep, int sockfd, return PSM2_INTERNAL_ERR; } +/* parse TCP skip poll counts for PSM3_TCP_SKIPPOLL_COUNT + * format is inactive_polls:active_polls + * inactive_polls must be >= active_polls + * Either field can be omitted in which case default (input tvals) is used + * for given field. + * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_tcp_skippoll_count(const char *str, + size_t errstr_size, char errstr[], + int tvals[2]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 2, tvals); + if (ret < 0) + return ret; + if (tvals[0] < 0 || tvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] < tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " inactive_polls (%d) must be >= active_polls (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_tcp_skippoll_count(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[2] = { ((int*)ptr)[0], ((int*)ptr)[1] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_tcp_skippoll_count(val.e_str, errstr_size, errstr, tvals); +} + psm2_error_t psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, psm2_uuid_t const job_key) @@ -772,21 +869,16 @@ psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, char buf[32]; snprintf(buf, sizeof(buf), "%d:%d", TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS); int tvals[2] = {TCP_INACT_SKIP_POLLS, TCP_ACT_SKIP_POLLS}; - if (!psm3_getenv("PSM3_TCP_SKIPPOLL_COUNT", - "Polls to skip under inactive and active connections " + (void)psm3_getenv_range("PSM3_TCP_SKIPPOLL_COUNT", + "Polls to skip under inactive and active connections ", "where inactive_polls >= active_polls.", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val) buf, &env_val)) { - (void)psm3_parse_str_tuples(env_val.e_str, 2, tvals); - if (tvals[0] < 0) { - tvals[0] = TCP_INACT_SKIP_POLLS; - } - if (tvals[1] < 0) 
{ - tvals[1] = TCP_ACT_SKIP_POLLS; - } - if (tvals[1] > tvals[0]) { - tvals[1] = tvals[0]; - } + (union psmi_envvar_val) buf, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_tcp_skippoll_count, tvals, &env_val); + if (parse_tcp_skippoll_count(env_val.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } ep->sockets_ep.inactive_skip_polls = tvals[0]; ep->sockets_ep.active_skip_polls_offset = tvals[0] - tvals[1]; @@ -1084,10 +1176,11 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) if (ep->sockets_ep.udp_gso) { // set upper bounds for GSO segmentation - // OS limitation of 64K (UINT16_MAX) + // OS limitation of 64K (UINT16_MAX) and UDP_MAX_SEGMENTS (64) ep->chunk_max_segs = min(UINT16_MAX / (ep->mtu + sizeof(struct ips_message_header)), UDP_MAX_SEGMENTS); - ep->chunk_max_size = ep->mq->hfi_base_window_rv; - // for acks to pipeline well need to limit max_nsegs to + ep->chunk_max_size = ep->sockets_ep.udp_gso; + + // for acks to pipeline we'll need to limit max_nsegs to // < flow_credits/2 and max_size to < flow_credit_bytes/2 // (ideally 1/4, but that makes GSO too small and is worse) ep->chunk_max_segs = min(ep->chunk_max_segs, proto->flow_credits/2); diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.h b/prov/psm3/psm3/hal_sockets/sockets_ep.h index 5bfc3ffdb82..51fcd06f792 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.h +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.h @@ -185,7 +185,7 @@ struct psm3_sockets_ep { int active_skip_polls_offset; // tailored for internal use. it's inactive_skip_polls - active_skip_polls struct msghdr snd_msg; // struct used for sendmsg /* fields specific to UDP */ - int udp_gso; // is GSO enabled for UDP + unsigned udp_gso; // is GSO enabled for UDP, max chunk_size uint8_t *sbuf_udp_gso; // buffer to compose UDP GSO packet sequence int udp_gso_zerocopy; // is UDP GSO Zero copy option enabled int udp_gro; // will be used later diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index 0c8087450b3..8d4527bdd64 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -175,15 +175,15 @@ static void psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) * corresponding PSM3_* env variables. * Otherwise these defaults are used. */ - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; + mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; // Even without RDMA do we want to disable rendezvous? // even without RDMA, the receiver controlled pacing helps scalability mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; + mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; #endif // we parse inet and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index 979787b7af6..10a4e845e4b 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -113,7 +113,7 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // make sure all fields are empty. 
memset(&ep->verbs_ep,0,sizeof(ep->verbs_ep)); - ep->verbs_ep.qkey = *(uint32_t*)job_key; // use 1st 32 bits of job_key + ep->verbs_ep.qkey = (*(uint32_t*)job_key) & 0x7FFFFFFF; // use 1st 31 bits of job_key (MSB is reserved) if (_HFI_PRDBG_ON) { char uuid_str[64]; @@ -180,12 +180,48 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t ep->dev_name, strerror(errno)); goto fail; } - // this gets done by psm3_verbs_poll_type - //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { - // _HFI_ERROR("Can't request RQ events from %s: %s\n", - // ep->dev_name, strerror(errno)); - // goto fail; - //} + +#ifdef USE_RC + if (IPS_PROTOEXP_FLAG_USER_RC_QP(ep->rdmamode)) { + // SRQ improves scalability + struct ibv_device_attr dev_attr; + union psmi_envvar_val envvar_val; + + // get RDMA capabilities of device + if (ibv_query_device(ep->verbs_ep.context, &dev_attr)) { + _HFI_ERROR("Unable to query device %s: %s\n", ep->dev_name, + strerror(errno)); + goto fail; + } + _HFI_DBG("max_srq=%d\n", dev_attr.max_srq); + if (dev_attr.max_srq) { + psm3_getenv("PSM3_USE_SRQ", + "If device supports SRQ, use it [1=yes, 0=no] [1]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &envvar_val); + if (envvar_val.e_uint) { + struct ibv_srq_init_attr attr = { 0 }; + attr.srq_context = ep; // our own pointer + attr.attr.max_wr = ep->verbs_ep.hfi_num_recv_wqes; + attr.attr.max_sge = 1; + + ep->verbs_ep.srq = ibv_create_srq(ep->verbs_ep.pd, &attr); + if (ep->verbs_ep.srq == NULL) { + _HFI_ERROR( "Unable to create SRQ on %s: %s\n", + ep->dev_name, strerror(errno)); + if (errno == ENOMEM) { + _HFI_ERROR( "Requested SRQ size might be too big. Try reducing RX depth.\n"); + _HFI_ERROR( "Requested RX depth was %u .\n", + ep->verbs_ep.hfi_num_recv_wqes); + } + goto fail; + } + _HFI_DBG("created SRQ\n"); + ep->addl_nic_info = " SRQ"; + } + } + } +#endif /* USE_RC */ // TBD - should we pick an EQ number // we use ep as the cq_context (would be in callbacks if any) @@ -194,13 +230,20 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // so CQ only needs a little headroom to be safe (1000) // HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound // RDMA w/immed). - // For USER RC Eager we can have num_recv_wqes/FRACTION per QP - // in which case theoretical need could be huge. We add 4000 as a + // For USER RC Eager without SRQ we can have num_recv_wqes/FRACTION per + // QP in which case theoretical need could be huge. We add 4000 as a swag to cover most cases and user can always tune higher as needed + // For USER RC Eager with SRQ worst case is num_recv_wqes so we + // add that to allow up to num_recv_wqes on UD QP and SRQ each and keep + // the HFI_TF_NFLOWS+1000 as headroom. if (!
ep->verbs_ep.hfi_num_recv_cqes) { ep->verbs_ep.hfi_num_recv_cqes = ep->verbs_ep.hfi_num_recv_wqes+HFI_TF_NFLOWS+1000; - if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) - ep->verbs_ep.hfi_num_recv_cqes += 4000; + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + if (ep->verbs_ep.srq) + ep->verbs_ep.hfi_num_recv_cqes += ep->verbs_ep.hfi_num_recv_wqes; + else + ep->verbs_ep.hfi_num_recv_cqes += 4000; + } } ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context, ep->verbs_ep.hfi_num_recv_cqes, @@ -211,12 +254,16 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t strerror(errno)); goto fail; } + // this gets done by psm3_verbs_poll_type + //if (ibv_req_notify_cq(ep->verbs_ep.recv_cq, 0)) { + // _HFI_ERROR("Can't request RQ events from %s: %s\n", + // ep->dev_name, strerror(errno)); + // goto fail; + //} ep->verbs_ep.qp = ud_qp_create(ep); - if (! ep->verbs_ep.qp) { - _HFI_ERROR( "Unable to create UD QP on %s\n", ep->dev_name); + if (! ep->verbs_ep.qp) goto fail; - } psmi_assert_always (ep->verbs_ep.context); @@ -306,7 +353,8 @@ psm3_verbs_parse_params(psm2_ep_t ep) psm3_getenv("PSM3_NUM_RECV_CQES", "Number of recv CQEs to allocate\n" "(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n" - "and 4000 more than that for PSM3_RDMA=3]) [0]", + "for PSM3_RDMA=3 with SRQ, allow an additional PSM3_NUM_RECV_WQES\n" + "for PSM3_RDMA=3 without SRQ, allow an additional 4000) [0]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &envvar_val); @@ -343,11 +391,12 @@ psm3_verbs_parse_params(psm2_ep_t ep) * otherwise ignored */ // RV defaults are sufficient for default PSM parameters - // but if user adjusts ep->hfi_num_send_rdma or mq->hfi_base_window_rv + // but if user adjusts ep->hfi_num_send_rdma or mq->ips_cpu_window_rv // they also need to increase the cache size. psm3_verbs_alloc_mr_cache // will verify cache size is sufficient. // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * - // chunk size (mq->hfi_base_window_rv after psm3_mq_initialize_params) + // chunk size (psm3_mq_max_window_rv(mq, 0) after + // psm3_mq_initialize_params) // for OPA native, actual window_rv may be smaller, but for UD it // is not reduced psm3_getenv("PSM3_RV_MR_CACHE_SIZE", @@ -358,12 +407,14 @@ psm3_verbs_parse_params(psm2_ep_t ep) (union psmi_envvar_val)0, &envvar_val); ep->rv_mr_cache_size = envvar_val.e_uint; // TBD - we could check cache_size >= minimum based on: - // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * mq->hfi_base_window_rv + // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) + // * psm3_mq_max_window_rv(mq, 0) // and automatically increase with warning if not? #if defined(PSM_CUDA) || defined(PSM_ONEAPI) ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); // TBD - we could check gpu_cache_size >= minimum based on: - // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * mq->hfi_base_window_rv + // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) + // * psm3_mq_max_window_rv(mq, 1) // and automatically increase with warning if not? 
#endif @@ -464,7 +515,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->verbs_ep.send_reap_thresh = min(ep->verbs_ep.hfi_send_reap_thresh, ep->verbs_ep.send_pool.send_total/2); _HFI_PRDBG("reaping when %u posted.\n", ep->verbs_ep.send_reap_thresh); - if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 0, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, min(ep->verbs_ep.hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr), // want to end up with multiple of cache line (64) // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU @@ -474,6 +525,25 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) _HFI_ERROR( "Unable to allocate UD recv buffer pool\n"); goto fail; } +#ifdef USE_RC + if (ep->verbs_ep.srq) { + if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 1, ep->verbs_ep.srq, &ep->verbs_ep.srq_recv_pool, + ep->verbs_ep.hfi_num_recv_wqes, + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : (ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) + )) { + _HFI_ERROR( "Unable to allocate SRQ recv buffer pool\n"); + goto fail; + } + if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ep->verbs_ep.srq_recv_pool)) { + _HFI_ERROR( "Unable to prepost recv buffers on SRQ for %s port %u\n", ep->dev_name, ep->portnum); + goto fail; + } + } +#endif /* USE_RC */ // no send segmentation, max_segs will constrain ep->chunk_max_segs = 1; @@ -515,6 +585,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) return PSM2_OK; fail: +#ifdef USE_RC + psm_verbs_free_recv_pool(&ep->verbs_ep.srq_recv_pool); +#endif psm_verbs_free_send_pool(&ep->verbs_ep.send_pool); psm_verbs_free_recv_pool(&ep->verbs_ep.recv_pool); return PSM2_INTERNAL_ERR; @@ -756,6 +829,13 @@ void psm3_ep_free_verbs(psm2_ep_t ep) psm3_rv_close(ep->rv); ep->rv = NULL; } +#endif +#ifdef USE_RC + if (ep->verbs_ep.srq) { + ibv_destroy_srq(ep->verbs_ep.srq); + ep->verbs_ep.srq = NULL; + } + psm_verbs_free_recv_pool(&ep->verbs_ep.srq_recv_pool); #endif if (ep->verbs_ep.pd) { ibv_dealloc_pd(ep->verbs_ep.pd); @@ -796,6 +876,16 @@ psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, _HFI_ERROR( "can't alloc send buffers"); goto fail; } +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies from + // GPU to the send buffer. + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, + pool->send_buffers, + pool->send_total*pool->send_buffer_size, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies from @@ -860,13 +950,22 @@ extern psm2_error_t psm_verbs_init_send_allocator( // which are tracked in other structures but still part of the ep's memory stats // For RC QPs receiving only RDMA Write with immediate, no buffer space is // needed. Caller will specify recv_buffer_size==0 with a recv_total. 
-psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, - psm3_verbs_recv_pool_t pool, +psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, + void *qp_srq, psm3_verbs_recv_pool_t pool, uint32_t recv_total, uint32_t recv_buffer_size) { memset(pool,0,sizeof(*pool)); - pool->qp = qp; // save a reference +#ifdef USE_RC + pool->for_srq = for_srq; + if (for_srq) + pool->srq = (struct ibv_srq *)qp_srq; // save a reference + else +#endif + pool->qp = (struct ibv_qp *)qp_srq; // save a reference +#ifndef USE_RC + psmi_assert(! for_srq); +#endif pool->ep = ep; pool->recv_total = recv_total; @@ -878,7 +977,11 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, // allocate recv buffers pool->recv_buffer_size = recv_buffer_size; // beginning of UD QP Recv Buf always consumed with space for IB GRH - if (qp->qp_type == IBV_QPT_UD) { + if ( +#ifdef USE_RC + ! pool->for_srq && +#endif + pool->qp->qp_type == IBV_QPT_UD) { // round up UD_ADDITION (40) to multiple of 64 for better // cache alignment of buffers pool->recv_buffer_size += ROUNDUP(UD_ADDITION, 64); @@ -892,6 +995,16 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, _HFI_ERROR( "can't alloc recv buffers"); goto fail; } +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies from + // recv buffer to GPU + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, + pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size, + CU_MEMHOSTALLOC_PORTABLE); +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies from @@ -921,7 +1034,11 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, // UD doesn't support RDMA, so we just need local NIC to be able to // access our buffers with kernel bypass (IBV_ACCESS_LOCAL_WRITE) pool->recv_buffer_mr = ibv_reg_mr( - qp->pd, pool->recv_buffers, +#ifdef USE_RC + for_srq?pool->srq->pd: +#endif + pool->qp->pd, + pool->recv_buffers, pool->recv_total*pool->recv_buffer_size, IBV_ACCESS_LOCAL_WRITE); if (! pool->recv_buffer_mr) { @@ -932,7 +1049,7 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, } else { #ifdef USE_RC // we want a pool for RDMA Write w/immediate recv. 
No buffers - psmi_assert(qp->qp_type != IBV_QPT_UD); + psmi_assert(for_srq || pool->qp->qp_type != IBV_QPT_UD); // we use exactly 1 rbuf so wr_id can lead us to pool and qp pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, sizeof(struct verbs_rbuf), 1); @@ -989,10 +1106,37 @@ void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool) pool->send_bufs = NULL; } if (pool->send_buffers) { +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, pool->send_buffers); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(pool->send_buffers); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - ze_driver, pool->send_buffers); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + // ze_driver, pool->send_buffers); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + pool->send_buffers); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(pool->send_buffers); pool->send_buffers = NULL; @@ -1014,10 +1158,36 @@ void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool) } #endif if (pool->recv_buffers) { +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, pool->recv_buffers); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister(pool->recv_buffers); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - ze_driver, pool->recv_buffers); + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, + // ze_driver, pool->recv_buffers); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + pool->recv_buffers); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } #endif psmi_free(pool->recv_buffers); pool->recv_buffers = NULL; @@ -1181,27 +1351,44 @@ psm2_error_t psm3_ep_verbs_post_recv( PSM3_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", "post UD " #ifdef USE_RC - "or RC " + "or RC or SRQ " #endif "RQ WQE with bad lkey", 0, IPS_FAULTINJ_RQ_LKEY); - 
if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, " QP %u", pool->qp->qp_num)) + // SRQ has no number but need consistency in fmt and number of args + if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, +#ifdef USE_RC + "%s %u", pool->for_srq?"SRQ":"QP", pool->for_srq?0:pool->qp->qp_num)) +#else + " QP %u", pool->qp->qp_num)) +#endif wr->sg_list->lkey = 55; } else wr->sg_list->lkey = pool->recv_buffer_mr->lkey; #endif // PSM_FI if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { // we have a batch ready to post - if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; +#ifdef USE_RC + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ, including buffer %u\n", index); + } else +#endif + { + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, including buffer %u\n", index); } - //_HFI_VDBG("posted RQ, including buffer %u\n", index); pool->next_recv_wqe = 0; } else { //_HFI_VDBG("preped RQE, buffer %u\n", index); } -#else +#else /* VERBS_RECV_QP_COALLESCE > 1 */ list.addr = (uintptr_t)rbuf_to_buffer(buf); list.length = pool->recv_buffer_size; list.lkey = pool->recv_buffer_mr->lkey; @@ -1210,11 +1397,17 @@ psm2_error_t psm3_ep_verbs_post_recv( PSM3_FAULTINJ_STATIC_DECL(fi_rq_lkey, "rq_lkey", "post UD " #ifdef USE_RC - "or RC " + "or RC or SRQ " #endif "RQ WQE with bad lkey", 0, IPS_FAULTINJ_RQ_LKEY); - if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, " QP %u", pool->qp->qp_num)) + // SRQ has no number but need consistency in fmt and number of args + if_pf(PSM3_FAULTINJ_IS_FAULT(fi_rq_lkey, pool->ep, +#ifdef USE_RC + "%s %u", pool->for_srq?"SRQ":"QP", pool->for_srq?0:pool->qp->qp_num)) +#else + " QP %u", pool->qp->qp_num)) +#endif list.lkey = 55; } #endif // PSM_FI @@ -1223,12 +1416,23 @@ psm2_error_t psm3_ep_verbs_post_recv( wr.sg_list = &list; wr.num_sge = 1; // size of sg_list - if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; - } - //_HFI_VDBG("posted RQ, buffer %u\n", index); +#ifdef USE_RC + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, &wr, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ, buffer %u\n", index); + } else #endif + { + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ, buffer %u\n", index); + } +#endif /* VERBS_RECV_QP_COALLESCE > 1 */ #ifdef USE_RC } else { #if VERBS_RECV_QP_COALLESCE > 1 @@ -1238,27 +1442,43 @@ psm2_error_t psm3_ep_verbs_post_recv( wr->wr_id = (uintptr_t)buf; // we'll get this back in completion if_pf (++pool->next_recv_wqe >= VERBS_RECV_QP_COALLESCE) { // we have a batch ready to post - if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s on
port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ\n"); + } else { + if_pf (ibv_post_recv(pool->qp, pool->recv_wr_list, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); } - //_HFI_VDBG("posted RQ\n"); pool->next_recv_wqe = 0; } else { //_HFI_VDBG("preped RQE\n"); } -#else +#else /* VERBS_RECV_QP_COALLESCE > 1 */ wr.next = NULL; // just post 1 wr.wr_id = (uintptr_t)buf; // we'll get this back in completion wr.sg_list = NULL; wr.num_sge = 0; // size of sg_list - if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { - _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); - return PSM2_INTERNAL_ERR; + if (pool->for_srq) { + if_pf (ibv_post_srq_recv(pool->srq, &wr, &bad_wr)) { + _HFI_ERROR("failed to post SRQ on %s port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted SRQ\n"); + } else { + if_pf (ibv_post_recv(pool->qp, &wr, &bad_wr)) { + _HFI_ERROR("failed to post RQ on %s on port %u: %s", pool->ep->dev_name, pool->ep->portnum, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + //_HFI_VDBG("posted RQ\n"); } - //_HFI_VDBG("posted RQ\n"); -#endif +#endif /* VERBS_RECV_QP_COALLESCE > 1 */ #endif // USE_RC } return PSM2_OK; @@ -2333,12 +2553,15 @@ static struct ibv_qp* ud_qp_create(psm2_ep_t ep) attr.qp_type = IBV_QPT_UD; qp = ibv_create_qp(ep->verbs_ep.pd, &attr); - if (qp == NULL && errno == ENOMEM) { + if (qp == NULL) { _HFI_ERROR( "Unable to create UD QP on %s: %s\n", ep->dev_name, strerror(errno)); - _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); - _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + if (errno == ENOMEM) { + _HFI_ERROR( "Requested QP size might be too big. 
Try reducing TX depth and/or inline size.\n"); + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", ep->verbs_ep.hfi_num_send_wqes+1, ep->verbs_ep.hfi_num_recv_wqes); + } + return NULL; } // attr reports what we got, double check and react in case @@ -2437,7 +2660,7 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.qp_context = context; attr.send_cq = ep->verbs_ep.send_cq; attr.recv_cq = ep->verbs_ep.recv_cq; - attr.srq = NULL; + attr.srq = ep->verbs_ep.srq; // one extra WQE to be safe in case verbs needs a spare WQE if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { // need to be prepared in case all sends posted to same RC QP, so @@ -2445,10 +2668,9 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.cap.max_send_wr = ep->verbs_ep.hfi_num_send_wqes+ep->hfi_num_send_rdma+1; attr.cap.max_send_sge = 2; // inline data helps latency and message rate for small sends - // Later we may explore use of - // send SGEs pointing to application buffers, somewhat like WFR send DMA attr.cap.max_inline_data = ep->hfi_imm_size; - attr.cap.max_recv_wr = ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION;// TBD + attr.cap.max_recv_wr = ep->verbs_ep.srq?0 + :(ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION);// TBD attr.cap.max_recv_sge = 1; } else { // only RDMA Write w/immediate @@ -2456,7 +2678,7 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) attr.cap.max_send_sge = 1; attr.cap.max_inline_data = 0; // incoming Write w/immediate consumes a RQ WQE but no buffer needed - attr.cap.max_recv_wr = HFI_TF_NFLOWS+1; + attr.cap.max_recv_wr = ep->verbs_ep.srq?0:(HFI_TF_NFLOWS+1); attr.cap.max_recv_sge = 0; } @@ -2467,9 +2689,16 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_ERROR( "Unable to create RC QP on %s: %s\n", ep->dev_name, strerror(errno)); _HFI_ERROR( "Requested QP size might be too big. Try reducing TX depth and/or inline size.\n"); - _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", - ep->verbs_ep.hfi_num_send_wqes+1, - ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); + if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->verbs_ep.hfi_num_send_wqes+ep->hfi_num_send_rdma+1, + ep->verbs_ep.srq?0 + :(ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION)); + } else { + _HFI_ERROR( "Requested TX depth was %u and RX depth was %u .\n", + ep->hfi_num_send_rdma+1, + ep->verbs_ep.srq?0:(HFI_TF_NFLOWS+1)); + } return NULL; } @@ -2492,7 +2721,8 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_PRDBG( "Limited to %d SQ SGEs\n", attr.cap.max_send_sge); } - if (ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { + if (! ep->verbs_ep.srq + && ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION > attr.cap.max_recv_wr) { _HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n", attr.cap.max_recv_wr, ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION); } else { @@ -2514,7 +2744,8 @@ struct ibv_qp* rc_qp_create(psm2_ep_t ep, void *context, struct ibv_qp_cap *cap) _HFI_PRDBG( "Limited to %d SQ SGEs\n", attr.cap.max_send_sge); } - if (HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { + if (! 
ep->verbs_ep.srq + && HFI_TF_NFLOWS+1 > attr.cap.max_recv_wr) { _HFI_PRDBG( "Limited to %d RQ WQEs, requested %u\n", attr.cap.max_recv_wr, HFI_TF_NFLOWS+1); } else { @@ -2848,7 +3079,7 @@ psm3_dump_verbs_qp(struct ibv_qp *qp) printf("QP %p (%u), type %u state %u PkeyIndx %u Port %u draining %u\n", qp, qp->qp_num, qp->qp_type, attr.qp_state, attr.pkey_index, attr.port_num, attr.sq_draining); - printf(" send: wr %u sge %u inline %u recv: wr %u sqe %u\n", + printf(" send: wr %u sge %u inline %u recv: wr %u sge %u\n", attr.cap.max_send_wr, attr.cap.max_send_sge, attr.cap.max_inline_data, attr.cap.max_recv_wr, attr.cap.max_recv_sge); printf(" context %p send_cq %p recv_cq %p srq %p sg_sig_all %u\n", @@ -2906,6 +3137,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_14_GBPS; case 32: return PSM3_IBV_RATE_25_GBPS; case 64: return PSM3_IBV_RATE_50_GBPS; + case 128: return PSM3_IBV_RATE_100_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2919,6 +3151,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_56_GBPS; case 32: return PSM3_IBV_RATE_100_GBPS; case 64: return PSM3_IBV_RATE_200_GBPS; + case 128: return PSM3_IBV_RATE_400_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2932,6 +3165,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_112_GBPS; case 32: return PSM3_IBV_RATE_200_GBPS; case 64: return PSM3_IBV_RATE_400_GBPS; + case 128: return PSM3_IBV_RATE_800_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2945,6 +3179,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_168_GBPS; case 32: return PSM3_IBV_RATE_300_GBPS; case 64: return PSM3_IBV_RATE_600_GBPS; + case 128: return PSM3_IBV_RATE_1200_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; @@ -2958,6 +3193,7 @@ static enum psm3_ibv_rate verbs_get_rate(uint8_t width, uint8_t speed) case 16: return PSM3_IBV_RATE_28_GBPS; case 32: return PSM3_IBV_RATE_50_GBPS; case 64: return PSM3_IBV_RATE_100_GBPS; + case 128: return PSM3_IBV_RATE_200_GBPS; default: _HFI_ERROR( "unknown link speed 0x%x\n", speed); return PSM3_IBV_RATE_100_GBPS; diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.h b/prov/psm3/psm3/hal_verbs/verbs_ep.h index 8874831f3b5..c1da6b73e53 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.h +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.h @@ -161,12 +161,14 @@ struct verbs_rbuf { typedef struct verbs_rbuf *rbuf_t; #define rbuf_to_buffer(buf) ((buf)->buffer) #define rbuf_addition(buf) ((buf)->pool->addition) -#define rbuf_qp(ep, buf) ((buf)->pool->qp) +#define rbuf_qp_context(ep, buf) ((buf)->pool->for_srq?NULL:(buf)->pool->qp->qp_context) +#define rbuf_qp_type_str(ep, buf) ((buf)->pool->for_srq?"SRQ":qp_type_str((buf)->pool->qp)) #else typedef uint8_t *rbuf_t; #define rbuf_to_buffer(buf) (buf) #define rbuf_addition(buf) (UD_ADDITION) -#define rbuf_qp(ep, buf) ((ep)->verbs_ep.recv_pool.qp) +#define rbuf_qp_context(ep, buf) ((ep)->verbs_ep.recv_pool.qp->qp_context) +#define rbuf_qp_type_str(ep, buf) (qp_type_str((ep)->verbs_ep.recv_pool.qp)) #endif static inline const char*qp_type_str(struct ibv_qp *qp) { @@ -255,7 +257,12 @@ typedef struct psm3_verbs_send_allocator *psm3_verbs_send_allocator_t; // but sizes may differ // when USE_RC, we need a 
separate recv pool per QP so we can prepost bufs. struct psm3_verbs_recv_pool { - struct ibv_qp *qp; // secondary reference to QP these buffers are for + union { // secondary reference to QP or SRQ these buffers are for + struct ibv_qp *qp; // when ! for_srq +#ifdef USE_RC + struct ibv_srq *srq; // when for_srq +#endif + }; psm2_ep_t ep; // our preregistered recv buffers uint32_t recv_buffer_size; @@ -264,6 +271,7 @@ struct psm3_verbs_recv_pool { struct ibv_mr *recv_buffer_mr; #ifdef USE_RC uint32_t addition; // UD_ADDITION for UD QP, 0 for RC QP + uint32_t for_srq; // is this for an SRQ or a QP? #endif #if VERBS_RECV_QP_COALLESCE > 1 // list of ready to post WQEs and SGEs @@ -296,6 +304,9 @@ struct psm3_verbs_ep { struct ibv_cq *recv_cq; struct ibv_qp *qp; struct ibv_qp_cap qp_cap; // capabilities of QP we got +#ifdef USE_RC + struct ibv_srq *srq; +#endif uint32_t qkey; //uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other uint8_t active_rate; @@ -309,6 +320,9 @@ struct psm3_verbs_ep { int recv_wc_count; // number left in recv_wc_list int recv_wc_next; // next index #else +#ifdef USE_RC + struct psm3_verbs_recv_pool srq_recv_pool; +#endif // if asked to revisit a packet we save it here rbuf_t revisit_buf; uint32_t revisit_payload_size; @@ -385,8 +399,8 @@ extern psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, extern psm2_error_t psm_verbs_init_send_allocator( psm3_verbs_send_allocator_t allocator, psm3_verbs_send_pool_t pool); -extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, struct ibv_qp *qp, - psm3_verbs_recv_pool_t pool, +extern psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, + void *qp_srq, psm3_verbs_recv_pool_t pool, uint32_t recv_total, uint32_t recv_buffer_size); extern void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool); extern void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool); diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 4f6bfb742ef..9575b316ff2 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -166,21 +166,17 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) * Otherwise these defaults are used. */ unsigned rdmamode = psm3_verbs_parse_rdmamode(1); - mq->hfi_thresh_rv = 64000; - mq->hfi_base_window_rv = 131072; + mq->hfi_thresh_rv = PSM_MQ_NIC_RNDV_THRESH; + mq->ips_cpu_window_rv_str = PSM_CPU_NIC_RNDV_WINDOW_STR; if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED)) { // TBD - when RDMA is disabled do we want to disable rendezvous?
// even without RDMA, the receiver controlled pacing helps scalability mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 2097152; -#endif -#ifdef PSM_ONEAPI +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (PSMI_IS_GPU_ENABLED) - mq->hfi_base_window_rv = 512*1024; + mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; #endif // we parse mr_cache_mode and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 4f2df710571..2ba92503e9f 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -287,29 +287,33 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( //ipsaddr->verbs.rc_qp = NULL; } else { // we got a REQ or a REP, we can move to RTR - // if we are only doing RDMA, we don't need any buffers, but we need a - // pool object for RQ coallesce, so we create a pool with 0 size buffers - if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, ipsaddr->verbs.rc_qp, &ipsaddr->verbs.recv_pool, - min(proto->ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->verbs.rc_qp_max_recv_wr), - (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 - // want to end up with multiple of cache line (64) - // pr_mtu is negotiated max PSM payload, not including hdrs - // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU - // be conservative (+BUFFER_HEADROOM) - : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu - + MAX_PSM_HEADER + BUFFER_HEADROOM - )) { - _HFI_ERROR("failed to alloc RC recv buffers\n"); - return PSM2_INTERNAL_ERR; + if (! proto->ep->verbs_ep.srq) { + // if we are only doing RDMA, we don't need any buffers, but we need a + // pool object for RQ coallesce, so we create a pool with 0 size buffers + if (PSM2_OK != psm_verbs_alloc_recv_pool(proto->ep, 0, ipsaddr->verbs.rc_qp, &ipsaddr->verbs.recv_pool, + min(proto->ep->verbs_ep.hfi_num_recv_wqes/VERBS_RECV_QP_FRACTION, ipsaddr->verbs.rc_qp_max_recv_wr), + (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 0 + // want to end up with multiple of cache line (64) + // pr_mtu is negotiated max PSM payload, not including hdrs + // pr_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // be conservative (+BUFFER_HEADROOM) + : ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->pr_mtu + + MAX_PSM_HEADER + BUFFER_HEADROOM + )) { + _HFI_ERROR("failed to alloc RC recv buffers\n"); + return PSM2_INTERNAL_ERR; + } } if (modify_rc_qp_to_init(proto->ep, ipsaddr->verbs.rc_qp)) { _HFI_ERROR("qp_to_init failed\n"); return PSM2_INTERNAL_ERR; } - if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ipsaddr->verbs.recv_pool)) { - _HFI_ERROR("prepost failed\n"); - return PSM2_INTERNAL_ERR; + if (! 
proto->ep->verbs_ep.srq) { + if (PSM2_OK != psm3_ep_verbs_prepost_recv(&ipsaddr->verbs.recv_pool)) { + _HFI_ERROR("prepost failed\n"); + return PSM2_INTERNAL_ERR; + } } // RC QP MTU will be set to min of req->verbs.qp_attr and pr_mtu // TBD - we already factored in req vs pr to update pr no need diff --git a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c index eebcac2e5da..f38aa505fc8 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c +++ b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c @@ -278,7 +278,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) // wc.byte_len is len of inbound rdma write not including immed // wc.qp_num - local QP ips_protoexp_handle_immed_data(rcv_ev.proto, - (uint64_t)(rbuf_qp(ep, buf)->qp_context), + (uint64_t)(rbuf_qp_context(ep, buf)), RDMA_IMMED_USER_RC, WC(imm_data), WC(byte_len)); goto repost; break; @@ -310,7 +310,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) } rcv_ev.p_hdr = (struct ips_message_header *)(rbuf_to_buffer(buf)+rbuf_addition(buf)); rcv_ev.payload = (rbuf_to_buffer(buf) + rbuf_addition(buf) + sizeof(struct ips_message_header)); - _HFI_VDBG("%s receive - opcode %x\n", qp_type_str(rbuf_qp(ep, buf)), + _HFI_VDBG("%s receive - opcode %x\n", rbuf_qp_type_str(ep, buf), _get_proto_hfi_opcode(rcv_ev.p_hdr)); PSM2_LOG_PKT_STRM(PSM2_LOG_RX,rcv_ev.p_hdr,"PKT_STRM:"); diff --git a/prov/psm3/psm3/include/utils_debug.h b/prov/psm3/psm3/include/utils_debug.h index 499f1a41699..b7b6655f2e6 100644 --- a/prov/psm3/psm3/include/utils_debug.h +++ b/prov/psm3/psm3/include/utils_debug.h @@ -202,6 +202,14 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); } \ } while (0) +#define _HFI_ENV_ERROR(fmt, ...) \ + do { \ + _Pragma_unlikely \ + if (unlikely(psm3_dbgmask&__HFI_INFO)) { \ + printf("%s: env " fmt, psm3_mylabel, ##__VA_ARGS__); \ + } \ + } while (0) + #define __HFI_PKTDBG_ON unlikely(psm3_dbgmask & __HFI_PKTDBG) #define __HFI_DBG_WHICH(which, fmt, ...) \ @@ -218,8 +226,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); do { \ _Pragma_unlikely \ if (unlikely(psm3_dbgmask&(which))) { \ - PSM3_GETTIME \ - fprintf(psm3_dbgout, PSM3_TIME_FMT "%s: " fmt, PSM3_TIME_ARG, psm3_mylabel, \ + fprintf(psm3_dbgout, "%s: " fmt, psm3_mylabel, \ ##__VA_ARGS__); \ } \ } while (0) @@ -291,6 +298,8 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_INFO(fmt, ...) +#define _HFI_ENV_ERROR(fmt, ...) + #define __HFI_PKTDBG_ON 0 #define _HFI_DBG(fmt, ...) diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index 5e18975a36b..d95660f6e01 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -55,6 +55,7 @@ #define UTILS_ENV_H #include "psm2_mock_testing.h" +#include "fnmatch.h" /* we can only include low level headers here because this is * #included by utils_sysfs.c. 
Can't pull in HAL headers or heap debug macros @@ -81,21 +82,37 @@ union psmi_envvar_val { unsigned long long e_ulonglong; }; -#define PSMI_ENVVAR_LEVEL_USER 1 -#define PSMI_ENVVAR_LEVEL_HIDDEN 2 -#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 - -#define PSMI_ENVVAR_TYPE_YESNO 0 -#define PSMI_ENVVAR_TYPE_STR 1 -#define PSMI_ENVVAR_TYPE_INT 2 -#define PSMI_ENVVAR_TYPE_UINT 3 -#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 -#define PSMI_ENVVAR_TYPE_LONG 5 -#define PSMI_ENVVAR_TYPE_ULONG 6 -#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 -#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 -#define PSMI_ENVVAR_TYPE_STR_VAL_PAT 9 -#define PSMI_ENVVAR_TYPE_STR_TUPLES 10 +// psm3_getenv only expects LEVEL +// psm3_getenv_range accepts LEVEL and FLAGs +// MIN/MAX N/A to TYPEs: YESNO, STR, STR_VAL_PAT_*, STR_TUPLES +// 'min' and 'max' only allowed as input when the corresponding +// range check is enabled +// FLAG_FATAL will cause a fatal error on invalid input +// (syntax error, out-of-range value, or check function failure). When +// FLAG_FATAL is not set, invalid input falls back to the default with a message. +#define PSMI_ENVVAR_LEVEL_USER 1 // show in user help +#define PSMI_ENVVAR_LEVEL_HIDDEN 2 // hidden from user help +#define PSMI_ENVVAR_LEVEL_NEVER_PRINT 4 // a bit flag, never show in help +#define PSMI_ENVVAR_LEVEL_MASK 0x07 // mask for getting level +#define PSMI_ENVVAR_FLAG_NOMIN 0x10 // no min check +#define PSMI_ENVVAR_FLAG_NOMAX 0x20 // no max check +#define PSMI_ENVVAR_FLAG_NOABBREV 0x40 // no 'min' or 'max' as input +#define PSMI_ENVVAR_FLAG_NOMIN_NOMAX 0x70 // no min, no max, no abbrev +#define PSMI_ENVVAR_FLAG_FATAL 0x80 // invalid input is fatal + +#define PSMI_ENVVAR_TYPE_YESNO 0 +#define PSMI_ENVVAR_TYPE_STR 1 +#define PSMI_ENVVAR_TYPE_INT 2 +#define PSMI_ENVVAR_TYPE_UINT 3 +#define PSMI_ENVVAR_TYPE_UINT_FLAGS 4 +#define PSMI_ENVVAR_TYPE_LONG 5 +#define PSMI_ENVVAR_TYPE_ULONG 6 +#define PSMI_ENVVAR_TYPE_ULONG_FLAGS 7 +#define PSMI_ENVVAR_TYPE_ULONG_ULONG 8 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT 9 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT 10 +#define PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS 11 +#define PSMI_ENVVAR_TYPE_STR_TUPLES 12 #define PSMI_ENVVAR_VAL_YES ((union psmi_envvar_val) 1) #define PSMI_ENVVAR_VAL_NO ((union psmi_envvar_val) 0) @@ -105,43 +122,82 @@ void psm3_env_print_val(FILE *f, const char *name, int type, int psm3_env_snprint_val(char *buf, size_t size, const char *name, int type, union psmi_envvar_val val); +// psm3_getenv_check_t is optional in psm3_getenv_range +// to confirm the resulting value is valid (return of 0). +// On error (return != 0), errstr[errstr_size] is filled in with a +// '\0' terminated string with more information about the error. +// +// This may be used for any envvar type to do further checks of the value +// such as integers which may need to be a power of 2, or parse checking +// of strings. +// For strings the parsed value(s) are not returned, so the caller will need +// to parse again, but this allows better error reporting during env variable get. +// +// ptr is caller-specific and can pass additional input information which may +// assist in verification of values. ptr should be used as input only +// because the check function is only called by psm3_getenv_range when +// otherwise valid input is supplied. 
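To make this contract concrete, here is a minimal sketch (hypothetical, not part of this patch) of a check function that accepts only power-of-two values; it could be passed as the `check` argument of `psm3_getenv_range`, with `ptr` unused:

```c
/* Hypothetical psm3_getenv_check_t: accept only power-of-two values.
 * Assumes a PSMI_ENVVAR_TYPE_UINT variable whose parsed value arrives
 * in val.e_uint; in a real source file this would follow
 * #include "utils_env.h" (snprintf comes from <stdio.h>). */
#include <stdio.h>

static int check_pow2(int type, const union psmi_envvar_val val,
		      void *ptr, size_t errstr_size, char errstr[])
{
	(void)ptr;	/* no extra caller context needed for this check */
	if (type != PSMI_ENVVAR_TYPE_UINT
	    || val.e_uint == 0 || (val.e_uint & (val.e_uint - 1))) {
		if (errstr_size)
			snprintf(errstr, errstr_size, " must be a power of 2");
		return -2;	/* nonzero: reject the value */
	}
	return 0;		/* 0: value is valid */
}
```

On a nonzero return, psm3_getenv_range falls back to the default with a message, or treats the input as fatal when PSMI_ENVVAR_FLAG_FATAL is set.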
+typedef int (*psm3_getenv_check_t)(int type, const union psmi_envvar_val val, + void *ptr, size_t errstr_size, char errstr[]); + int MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, int type, union psmi_envvar_val defval, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv); -/* - * Parsing int and unsigned int parameters - * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error - */ -int psm3_parse_str_int(const char *string, int *val); -int psm3_parse_str_uint(const char *string, unsigned int *val); +int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, + const char *help, unsigned level_flags, + int type, union psmi_envvar_val defval, union psmi_envvar_val min, + union psmi_envvar_val max, psm3_getenv_check_t check, void *ptr, + union psmi_envvar_val *newval); +MOCK_DCL_EPILOGUE(psm3_getenv_range); /* - * Parse long parameters - * -1 -> empty string - * -2 -> parse error + * Parsing int, unsigned int and long parameters + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -long psm3_parse_str_long(const char *str); +int psm3_parse_str_int(const char *string, int *val, int min, int max); +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max); +int psm3_parse_str_long(const char *str, long *val, long min, long max); /* * Parsing yesno parameters * allows: yes/no, true/false, on/off, 1/0 - * -1 -> empty string - * -2 -> parse error + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_yesno(const char *str); +int psm3_parse_str_yesno(const char *str, int *val); /* * Parsing int parameters set in string tuples. + * Returns: + * 0 - parsed with no errors, vals[] updated + * -1 - empty or NULL string, vals[] unchanged + * -2 - syntax error in one or more of the parameters + * parameters with syntax errors are unchanged, others without + * syntax errors are updated in vals[] */ int psm3_parse_str_tuples(const char *str, int ntup, int *vals); -/* parse env of the form 'val' or 'val:' or 'val:pattern' */ -int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val); +/* parse env of the form 'val' or 'val:' or 'val:pattern' + * Returns: + * 0 - parsed and matches current process, *val set to parsed val + * 0 - parsed and doesn't match current process, *val set to def + * -1 - nothing provided, *val set to def + * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control whether 'min', 'minimum', 'max' or 'maximum' + * are allowed as input and indicate if min and/or max supplied. 
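A short usage sketch of the reworked parser (hypothetical call site; the variable name and bounds are illustrative, and out-of-range input is assumed to be reported as a parse error):

```c
/* Keep the default when parsing fails: *val is only written on rc == 0. */
unsigned int num_wqes = 4080;
int rc = psm3_parse_str_uint(psm3_env_get("PSM3_NUM_RECV_WQES"),
			     &num_wqes, 1, 32768);	/* range check [1, 32768] */
if (rc == -2)
	_HFI_INFO("invalid PSM3_NUM_RECV_WQES, using default %u\n", num_wqes);
```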
+ */ +int psm3_parse_val_pattern_int(const char *env, int def, int *val, + unsigned flags, int min, int max); +int psm3_parse_val_pattern_uint(const char *env, unsigned def, unsigned *val, + unsigned flags, unsigned min, unsigned max); #if defined(PSM_VERBS) || defined(PSM_SOCKETS) // return forced speed in mbps or 0 if not forced diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index 40826d38c1c..df138dd8a2f 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -97,6 +97,7 @@ sem_t *psm3_sem_affinity_shm_rw = NULL; int psm3_affinity_shared_file_opened = 0; char *psm3_affinity_shm_name; uint64_t *psm3_shared_affinity_ptr; +uint64_t *psm3_shared_affinity_nic_refcount_ptr; uint32_t psm3_cpu_model; @@ -164,6 +165,8 @@ CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); CUresult (*psmi_cuMemFreeHost)(void* p); +CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); +CUresult (*psmi_cuMemHostUnregister)(void* p); CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); @@ -202,6 +205,8 @@ uint64_t psmi_count_cuEventRecord; uint64_t psmi_count_cuEventSynchronize; uint64_t psmi_count_cuMemHostAlloc; uint64_t psmi_count_cuMemFreeHost; +uint64_t psmi_count_cuMemHostRegister; +uint64_t psmi_count_cuMemHostUnregister; uint64_t psmi_count_cuMemcpy; uint64_t psmi_count_cuMemcpyDtoD; uint64_t psmi_count_cuMemcpyDtoH; @@ -225,7 +230,7 @@ int psmi_cuda_lib_load() char *dlerr; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Loading CUDA library.\n"); + _HFI_DBG("Loading CUDA library.\n"); psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); if (!psmi_cuda_lib) { @@ -270,6 +275,8 @@ int psmi_cuda_lib_load() PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostRegister); + PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostUnregister); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); @@ -333,6 +340,8 @@ static void psmi_cuda_stats_register() PSMI_CUDA_COUNT_DECLU64(cuEventSynchronize), PSMI_CUDA_COUNT_DECLU64(cuMemHostAlloc), PSMI_CUDA_COUNT_DECLU64(cuMemFreeHost), + PSMI_CUDA_COUNT_DECLU64(cuMemHostRegister), + PSMI_CUDA_COUNT_DECLU64(cuMemHostUnregister), PSMI_CUDA_COUNT_DECLU64(cuMemcpy), PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoD), PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoH), @@ -366,6 +375,7 @@ static void psmi_cuda_stats_register() ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); #ifndef PSM3_NO_ONEAPI_IMPORT ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); @@ -411,6 +421,7 @@ ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, 
zel_component_versio uint64_t psmi_count_zeInit; uint64_t psmi_count_zeDriverGet; uint64_t psmi_count_zeDeviceGet; +uint64_t psmi_count_zeDevicePciGetPropertiesExt; #ifndef PSM3_NO_ONEAPI_IMPORT uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; uint64_t psmi_count_zexDriverImportExternalPointer; @@ -473,6 +484,7 @@ int psmi_oneapi_ze_load() PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeInit); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGet); PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGet); + PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDevicePciGetPropertiesExt); #ifndef PSM3_NO_ONEAPI_IMPORT PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); #endif @@ -535,6 +547,7 @@ static void psmi_oneapi_ze_stats_register() PSMI_ONEAPI_ZE_COUNT_DECLU64(zeInit), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), + PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), #ifndef PSM3_NO_ONEAPI_IMPORT PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), @@ -637,11 +650,13 @@ static void psmi_gpu_init(void) is_gdr_copy_enabled = env_enable_gdr_copy.e_int; union psmi_envvar_val env_gpu_thresh_rndv; - ret = psm3_getenv("PSM3_GPU_THRESH_RNDV", + ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", "RNDV protocol is used for GPU send message sizes greater than the threshold", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)gpu_thresh_rndv, &env_gpu_thresh_rndv); - if (ret) + NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)gpu_thresh_rndv, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, + NULL, NULL, &env_gpu_thresh_rndv); + if (ret > 0) /* * For backward compatibility, check if the old variable name is set. * Priority order: New name > old name > default value. @@ -693,7 +708,7 @@ int psmi_cuda_initialize() psm2_error_t err = PSM2_OK; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Enabling CUDA support.\n"); + _HFI_DBG("Enabling CUDA support.\n"); psmi_cuda_stats_register(); @@ -727,6 +742,7 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, uint32_t count = 0; ze_command_queue_group_properties_t *props = NULL; int i; + int done = 0; /* Set the default */ ctxt->ordinal = 0; @@ -742,15 +758,27 @@ static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, &count, props); - /* Select the first copy-only engine group if possible */ + // pick the last command queue group which supports copy but not compute. + // For PVC this will be the xeLink copy engine which will also + // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). + // This ordinal is then supplied to create Command Queues and Command Lists. for (i = count - 1; i >= 0; i--) { - if ((props[i].flags & + _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, + (int)props[i].numQueues); + if (! 
done && (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { ctxt->ordinal = i; ctxt->num_queues = props[i].numQueues; - break; + done = 1; + if (_HFI_DBG_ON) { + _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); + } else { + break; + } } } psmi_free(props); @@ -789,6 +817,35 @@ static void psmi_oneapi_cmd_create(ze_device_handle_t dev, struct ze_dev_ctxt *c dev, &ze_cl_desc, &ctxt->cl); } ctxt->dev = dev; + + if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { + // create resources for dual copy mechanism + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 2 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + }; + PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, + ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); + + event_desc.index = 0; + PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status0); + + event_desc.index = 1; + PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status1); + + psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq0, + &ctxt->async_cl0); + psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq1, + &ctxt->async_cl1); + } } void psmi_oneapi_cmd_create_all(void) @@ -804,8 +861,11 @@ void psmi_oneapi_cmd_create_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; - if (!ctxt->cl) + if (!ctxt->cl) { psmi_oneapi_cmd_create(ctxt->dev, ctxt); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ctxt->dev); + } } if (num_ze_devices > 0) cur_ze_dev = &ze_devices[0]; @@ -819,6 +879,34 @@ void psmi_oneapi_cmd_destroy_all(void) for (i = 0; i < num_ze_devices; i++) { ctxt = &ze_devices[i]; + if (ctxt->async_cl1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); + ctxt->async_cl1 = NULL; + } + if (ctxt->async_cq1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); + ctxt->async_cq1 = NULL; + } + if (ctxt->async_cl0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); + ctxt->async_cl0 = NULL; + } + if (ctxt->async_cq0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); + ctxt->async_cq0 = NULL; + } + if (ctxt->copy_status1 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); + ctxt->copy_status1 = NULL; + } + if (ctxt->copy_status0 != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); + ctxt->copy_status0 = NULL; + } + if (ctxt->event_pool != NULL) { + PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); + ctxt->event_pool = NULL; + } if (ctxt->cl) { PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); ctxt->cl = NULL; @@ -849,7 +937,7 @@ int psmi_oneapi_ze_initialize() union psmi_envvar_val env; PSM2_LOG_MSG("entering"); - _HFI_VDBG("Init Level Zero library.\n"); + _HFI_DBG("Init Level Zero library.\n"); psmi_oneapi_ze_stats_register(); err = psmi_oneapi_ze_load(); @@ -868,6 +956,13 @@ int psmi_oneapi_ze_initialize() (union psmi_envvar_val)1, &env); psm3_oneapi_immed_async_copy = env.e_int; + psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", + "Use parallel CommandLists for GPU to GPU copy larger than threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)(256*1024-1), &env); + // no benefit below 128K-1, plus the copy is split at a 64K boundary
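The dual-copy resources created above (two async queues/lists and two host-visible events) could be used as sketched below. This is a hypothetical sketch, not the patch's code path; it assumes async_cl0/async_cl1 are regular (non-immediate) command lists and uses the diff's PSMI_ONEAPI_ZE_CALL wrapper around standard Level Zero entry points:

```c
/* Sketch: split one large device-to-device copy at a 64K-aligned point
 * and run the halves on the two async queues concurrently, waiting on
 * the two events from ctxt->event_pool. */
static void parallel_dtod_copy_sketch(struct ze_dev_ctxt *ctxt,
				      void *dst, const void *src, size_t len)
{
	size_t half = (len / 2) & ~(size_t)0xffff;	/* 64K boundary */

	PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
			    dst, src, half, ctxt->copy_status0, 0, NULL);
	PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
			    (char *)dst + half, (const char *)src + half,
			    len - half, ctxt->copy_status1, 0, NULL);
	PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0);
	PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1);
	PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0,
			    1, &ctxt->async_cl0, NULL);
	PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1,
			    1, &ctxt->async_cl1, NULL);
	PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT64_MAX);
	PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT64_MAX);
	/* events and command lists would be reset before the next copy */
}
```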
psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); + PSMI_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); @@ -911,11 +1006,15 @@ int psmi_oneapi_ze_initialize() ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, &ze_context); - _HFI_VDBG("ze_driver %p first device %p ze_context %p\n", - ze_driver, &devices[0], ze_context); + _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", + ze_driver, ze_device_count, devices[0], ze_context); - for (i = 0; i < ze_device_count; i++) + for (i = 0; i < ze_device_count; i++) { + ze_devices[i].dev_index = i; psmi_oneapi_cmd_create(devices[i], &ze_devices[i]); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ze_devices[i].dev); + } num_ze_devices = ze_device_count; if (num_ze_devices > 0) @@ -1014,7 +1113,11 @@ void psmi_parse_nic_var() { union psmi_envvar_val env_nic; psm3_getenv("PSM3_NIC", - "Device Unit number or name or wildcard (-1 or 'any' autodetects)", + "Device(s) to consider for use. By name (" +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern), unit number or 'any'", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"any", &env_nic); //autodetect @@ -1064,6 +1167,11 @@ static int psm3_parse_no_warn(void) } #endif +int init_cache_on = 1; +void psm3_turn_off_init_cache() { + init_cache_on = 0; +} + psm2_error_t psm3_init(int *major, int *minor) { psm2_error_t err = PSM2_OK; @@ -1177,10 +1285,10 @@ psm2_error_t psm3_init(int *major, int *minor) psm3_getenv("PSM3_TRACEMASK", "Mask flags for tracing", PSMI_ENVVAR_LEVEL_USER, - PSMI_ENVVAR_TYPE_STR_VAL_PAT, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS, (union psmi_envvar_val)__HFI_DEBUG_DEFAULT_STR, &env_tmask); - (void)psm3_parse_val_pattern(env_tmask.e_str, __HFI_DEBUG_DEFAULT, - &psm3_dbgmask); + (void)psm3_parse_val_pattern_uint(env_tmask.e_str, __HFI_DEBUG_DEFAULT, + &psm3_dbgmask, PSMI_ENVVAR_FLAG_NOMIN_NOMAX, 0, UINT_MAX); /* The "real thing" is done in utils_mallopt.c as constructor function, but * we getenv it here to report what we're doing with the setting */ @@ -1319,6 +1427,10 @@ psm2_error_t psm3_init(int *major, int *minor) goto fail_epid; } + if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { + psm3_hwloc_topology_init(); + } + #ifdef PSM_DSA if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH)) { if (psm3_dsa_init()) { @@ -1352,7 +1464,8 @@ psm2_error_t psm3_init(int *major, int *minor) * want it to appear in PSM3_VERBOSE_ENV help text */ int enable_cuda = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda) == -2 + if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, + INT_MIN, INT_MAX) == -2 || enable_cuda) { _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); } @@ -1382,7 +1495,8 @@ psm2_error_t psm3_init(int *major, int *minor) * want it to appear in PSM3_VERBOSE_ENV help text */ int enable_oneapi = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi) == -2 + if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, + INT_MIN, INT_MAX) == -2 || enable_oneapi) { _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); } @@ -1399,7 +1513,8 @@ psm2_error_t psm3_init(int *major, int *minor) * get the behavior they expected */ unsigned int gpudirect = 0; - if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect) == -2 + if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 
0, UINT_MAX) == -2 || gpudirect) { _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); } @@ -1420,6 +1535,7 @@ psm2_error_t psm3_init(int *major, int *minor) #endif #if defined(PSM_DSA) || defined(PSM_CUDA) || defined(PSM_ONEAPI) fail_hal: + psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); #endif fail_epid: @@ -1450,6 +1566,7 @@ static inline psm2_error_t unit_query_ret_to_err(int ret) } } +static uint64_t nics_max_speed; psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, size_t nargs, psm2_info_query_arg_t args[]) { @@ -1606,6 +1723,11 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, if (port == 0) port = 1; /* VERBS_PORT */ if (unit == -1) { + if (init_cache_on && nics_max_speed) { + *speed = nics_max_speed; + rv = PSM2_OK; + break; + } // query for unit -1 returns max speed of all candidate NICs *speed = 0; for (unit = 0; unit < psmi_hal_get_num_units_(); unit++) { @@ -1615,7 +1737,12 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, if (0 <= psmi_hal_get_port_speed(unit, port, &unit_speed)) *speed = max(*speed, unit_speed); } - rv = (*speed) ? PSM2_OK : PSM2_EP_NO_DEVICE; + if (*speed) { + nics_max_speed = *speed; + rv = PSM2_OK; + } else { + rv = PSM2_EP_NO_DEVICE; + } } else { if (psmi_hal_get_port_active(unit, port) <= 0) break; @@ -1749,7 +1876,9 @@ psm2_error_t psm3_finalize(void) * Start critical section to decrement ref count and unlink * affinity shm file. */ - psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_ERROR("unable to get NIC affinity semaphore, proceeding anyway\n"); + } psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] -= 1; if (psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] <= 0) { @@ -1767,6 +1896,7 @@ psm2_error_t psm3_finalize(void) munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); psm3_shared_affinity_ptr = NULL; + psm3_shared_affinity_nic_refcount_ptr = NULL; psmi_free(psm3_affinity_shm_name); psm3_affinity_shm_name = NULL; psm3_affinity_shared_file_opened = 0; @@ -1782,6 +1912,7 @@ psm2_error_t psm3_finalize(void) psm3_affinity_semaphore_open = 0; } + psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); #ifdef PSM_CUDA if (PSMI_IS_GPU_ENABLED) diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index fe76b5fe4b8..b9ff1c598d1 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -1376,6 +1376,16 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); * option value: Deprecated; this option has no effect. */ +#define PSM2_MQ_OPT_GPU_RNDV_SHM_SZ 0x304 +#define PSM2_MQ_GPU_RNDV_SHM_SZ PSM2_MQ_OPT_GPU_RNDV_SHM_SZ + /**< [@b uint32_t ] Size at which to start enabling + * rendezvous messaging for shared memory (intra-node) GPU messages (if + * unset, defaults to 127 bytes for both Intel and NVIDIA GPUs). + * + * component object: PSM2 Matched Queue (@ref psm2_mq_t). + * option value: Size at which to switch to rendezvous protocol for GPU send. 
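A hypothetical sketch of driving this option at runtime through the MQ options API (psm3_mq_setopt/psm3_mq_getopt, whose prototypes appear later in this patch); mq is assumed to be an open psm2_mq_t:

```c
/* Raise the intra-node GPU rendezvous cutoff to 4 KiB (illustrative). */
uint32_t gpu_shm_thresh = 4096;
psm2_error_t rc = psm3_mq_setopt(mq, PSM2_MQ_GPU_RNDV_SHM_SZ,
				 &gpu_shm_thresh);
if (rc == PSM2_OK)	/* read back the value actually in effect */
	rc = psm3_mq_getopt(mq, PSM2_MQ_GPU_RNDV_SHM_SZ, &gpu_shm_thresh);
```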
+ */ + /* PSM2_COMPONENT_AM options */ #define PSM2_AM_OPT_FRAG_SZ 0x401 #define PSM2_AM_MAX_FRAG_SZ PSM2_AM_OPT_FRAG_SZ @@ -1802,10 +1812,10 @@ char* psm3_env_get(const char *name); * * @param[in] const char *str parameter value * @retval 0 The string was valid, *val has value - * -1 The string was empty or NULL - * -2 The string had invalid syntax + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated */ -int psm3_parse_str_int(const char *string, int *val); +int psm3_parse_str_int(const char *string, int *val, int min, int max); /** @brief PSM2 unsigned int parameter parsing * * @param[in] const char *str parameter value * @retval 0 The string was valid, *val has value - * -1 The string was empty or NULL - * -2 The string had invalid syntax + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated */ -int psm3_parse_str_uint(const char *string, unsigned int *val); +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max); /** @brief PSM2 yesno parameter parsing * * Function that parses a string yesno parameter * * @param[in] const char *str parameter value - * @retval -1 The string was empty or NULL - * -2 The string had invalid syntax + * @retval 0 The string was valid, *val has value + * -1 The string was empty or NULL, *val not updated + * -2 The string had invalid syntax, *val not updated + * @param[out] int *val * 0 The string was No, False, Off or 0 * 1 The string was Yes, True, On or 1 */ -int psm3_parse_str_yesno(const char *str); +int psm3_parse_str_yesno(const char *str, int *val); + +// for the purposes of psmx3 accessing PSM3_DEVICES config, these +// interfaces are defined here. Not for general consumption +/* We currently have 3 PTLs, 0 is reserved. */ +#define PTL_DEVID_IPS 1 // ips aka nic, network inter-node +#define PTL_DEVID_AMSH 2 // shm, intra-node, scale-up +#define PTL_DEVID_SELF 3 // self + +/* We can currently initialize up to 3 PTLs */ +#define PTL_MAX_INIT 3 + +/** @brief PSM2 devices parameter parsing + * + * Function that gets and parses the PSM3_DEVICES string parameter + * + * @param[out] array of devices + * @retval PSM2_OK - devices successfully returned + * other (PSM2_PARAM_ERR) - error parsing devices + */ +psm2_error_t psm3_parse_devices(int devices[PTL_MAX_INIT]); + +/** @brief PSM2 devices list search + * + * Function that searches devid_enabled for a specific device + * + * @param[in] array of devices from psm3_parse_devices + * @param[in] devid: PTL_DEVID_IPS, PTL_DEVID_AMSH, or PTL_DEVID_SELF + * @retval 1 - given devid is enabled in devices[] + * 0 - given devid is disabled in devices[] + */ +int psm3_device_is_enabled(const int devices[PTL_MAX_INIT], int devid); /** @brief PSM2 env finalize * @@ -1872,6 +1916,8 @@ void psm3_memcpy(void *dest, const void *src, uint32_t len); /*! @} */ +void psm3_turn_off_init_cache(); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 058a6c26034..0c347ce2160 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -230,6 +230,69 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...)
rv = -1; } break; + case psmi_hal_pre_init_cache_func_get_port_speed: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int i = unit * (p->params.num_ports+1) + port; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_speed_valid[i]) { + rv = p->hfp_get_port_speed(unit,port,&p->params.port_speed[i]); + p->params.port_speed_valid[i] = rv == 0 ? 1 : -1; + } + rv = (p->params.port_speed_valid[i] == 1)? 0: -1; + if (rv == 0) { + uint64_t *speed = va_arg(ap, uint64_t*); + if (speed) *speed = p->params.port_speed[i]; + } + } + else + rv = -1; + } + else + rv = -1; + } + break; + case psmi_hal_pre_init_cache_func_get_port_lid: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int addr_index = va_arg(ap,int); + if (addr_index >= 0 && addr_index < psm3_addr_per_nic) + { + int i = unit * ((p->params.num_ports+1) * psm3_addr_per_nic) + port * psm3_addr_per_nic + addr_index; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_lid_valid[i]) { + rv = p->hfp_get_port_lid(unit,port,addr_index); + if (rv > 0) { + p->params.port_lid_valid[i] = 1; + p->params.port_lid[i] = rv; + } else { + p->params.port_lid_valid[i] = -1; + rv = -1; + } + break; + } + rv = p->params.port_lid_valid[i] == -1 ? -1 : p->params.port_lid[i]; + } + } + else + rv = -1; + } + else + rv = -1; + } + break; case psmi_hal_pre_init_cache_func_get_num_contexts: { int unit = va_arg(ap,int); @@ -310,6 +373,51 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) rv = -1; } break; + case psmi_hal_pre_init_cache_func_get_port_subnet_name: + { + int unit = va_arg(ap,int); + + if ((unit >= 0) && (unit < p->params.num_units)) + { + int port = va_arg(ap,int); + if ((port >= 1) && (port <= p->params.num_ports)) + { + int addr_index = va_arg(ap,int); + if (addr_index >= 0 && addr_index < psm3_addr_per_nic) + { + int i = unit * ((p->params.num_ports+1) * psm3_addr_per_nic) + port * psm3_addr_per_nic + addr_index; + // only cache during PSM3 init + if (!init_cache_on || !p->params.port_subnet_name[i]) { + char buffer[PATH_MAX] = {}; + rv = p->hfp_get_port_subnet_name(unit, port, addr_index, buffer, sizeof(buffer)); + if (p->params.port_subnet_name[i]) { + psmi_free(p->params.port_subnet_name[i]); + } + if (rv == 0) { + p->params.port_subnet_name[i] = psmi_strdup(PSMI_EP_NONE, buffer); + } else { + p->params.port_subnet_name[i] = NULL; + rv = -1; + break; + } + } + char *buf = va_arg(ap, char*); + size_t bufsize = va_arg(ap, size_t); + rv = p->params.port_subnet_name[i] ? 
0 : -1; + if (rv == 0 && buf) { + (void)snprintf(buf, bufsize, "%s", p->params.port_subnet_name[i]); + } + } + else + rv = -1; + } + else + rv = -1; + } + else + rv = -1; + } + break; case psmi_hal_pre_init_cache_func_get_unit_pci_bus: { int unit = va_arg(ap,int); @@ -469,6 +577,10 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(unit_active_valid); FREE_HAL_CACHE(port_active); FREE_HAL_CACHE(port_active_valid); + FREE_HAL_CACHE(port_speed); + FREE_HAL_CACHE(port_speed_valid); + FREE_HAL_CACHE(port_lid); + FREE_HAL_CACHE(port_lid_valid); FREE_HAL_CACHE(num_contexts); FREE_HAL_CACHE(num_contexts_valid); FREE_HAL_CACHE(num_free_contexts); @@ -478,6 +590,7 @@ static void psm3_hal_free_cache(struct _psmi_hal_instance *p) FREE_HAL_CACHE(port_subnet_addr); FREE_HAL_CACHE(port_subnet_idx); FREE_HAL_CACHE(port_subnet_gid); + FREE_HAL_CACHE_ARRAY(port_subnet_name, p->params.num_units * p->params.num_ports * psm3_addr_per_nic); FREE_HAL_CACHE(unit_pci_bus_valid); FREE_HAL_CACHE(unit_pci_bus_domain); @@ -521,6 +634,10 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(unit_active_valid, int8_t, nunits); ALLOC_HAL_CACHE(port_active, int8_t, nunits*(nports+1)); ALLOC_HAL_CACHE(port_active_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_speed, uint64_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_speed_valid, int8_t, nunits*(nports+1)); + ALLOC_HAL_CACHE(port_lid, int, nunits*(nports+1)*psm3_addr_per_nic); + ALLOC_HAL_CACHE(port_lid_valid, int8_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(num_contexts, uint16_t, nunits); ALLOC_HAL_CACHE(num_contexts_valid, uint16_t, nunits); ALLOC_HAL_CACHE(num_free_contexts, uint16_t, nunits); @@ -530,6 +647,7 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, ALLOC_HAL_CACHE(port_subnet_addr, psmi_naddr128_t, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_idx, int, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(port_subnet_gid, psmi_gid128_t, nunits*(nports+1)*psm3_addr_per_nic); + ALLOC_HAL_CACHE_ARRAY(port_subnet_name, char, nunits*(nports+1)*psm3_addr_per_nic); ALLOC_HAL_CACHE(unit_pci_bus_valid, int8_t, nunits); ALLOC_HAL_CACHE(unit_pci_bus_domain, uint32_t, nunits); @@ -557,6 +675,72 @@ static psmi_hal_instance_t *psm3_hal_select_hal(psmi_hal_instance_t *p, return NULL; } +/* check syntax of pattern and confirm it matches at least 1 HAL + * returns: + * 0 - valid + * -1 - empty string + * -2 - invalid syntax + */ +static int parse_check_hal(int type, const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + int i; + int ret; + + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + if (! val.e_str || ! 
*val.e_str) + return -1; + // use fnmatch to check syntax of pattern + // reviewing fnmatch source it only returns 0 or FNM_NOMATCH, but be + // safe and match fnmatch documentation that other values indicate error + ret = fnmatch(val.e_str, "dontcare", 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + ); + if (ret && ret != FNM_NOMATCH) { + if (errstr_size) + snprintf(errstr, errstr_size, " invalid " +#ifdef FNM_EXTMATCH + "extended " +#endif + "glob pattern"); + return -2; + } + // we check for at least 1 matching HAL, but purposely do + // not check for active NICs within the HAL + // We allow any valid HAL, even if not included in the build + // This avoids surprises if user or middleware uses PSM3_HAL to limit + // PSM3 to a specific HAL, but the PSM3 build found lacks that HAL + ret = -2; // assume no matching HAL found + for (i=0; i <= PSM_HAL_INDEX_MAX; i++) + { + if (i == PSM_HAL_INDEX_LOOPBACK) + continue; + if (0 == strcmp("unknown", psm3_hal_index_to_str(i))) + continue; + + if (0 == strcmp(val.e_str, "any") || + 0 == fnmatch(val.e_str, psm3_hal_index_to_str(i), 0 +#ifdef FNM_EXTMATCH + | FNM_EXTMATCH +#endif + )) + { + ret = 0; + break; + } + } + if (ret == -2) { + if (errstr_size) + snprintf(errstr, errstr_size, " no matching HAL found"); + return -2; + } + return 0; +} + +static char hal_help[512] = ""; + static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) { int i; @@ -584,11 +768,12 @@ static struct _psmi_hal_instance *psm3_hal_get_pi_inst(void) */ union psmi_envvar_val env_hal; /* HAL instance preference */ - psm3_getenv("PSM3_HAL", - "Hardware Abstraction Layer to use (Default is first HAL" - " to find a valid, unfiltered NIC [any])", + psm3_getenv_range("PSM3_HAL", + "Hardware Abstraction Layer to use", hal_help, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"any", &env_hal); + (union psmi_envvar_val)"any", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_hal, NULL, &env_hal); for (i=0; i <= PSM_HAL_INDEX_MAX; i++) { @@ -651,6 +836,36 @@ int psm3_hal_initialize(int devid_enabled[PTL_MAX_INIT]) PSMI_HAL_INI(); if (! psm3_hal_current_hal_instance) { + int i; + char valid_hal_list[80]; + int valid_len = 0; + char avail_hal_list[80]; + int avail_len = 0; + + valid_hal_list[0] = '\0'; + avail_hal_list[0] = '\0'; + for (i=0; i <= PSM_HAL_INDEX_MAX; i++) + { + if (i == PSM_HAL_INDEX_LOOPBACK) + continue; + if (0 == strcmp("unknown", psm3_hal_index_to_str(i))) + continue; + + snprintf(&valid_hal_list[valid_len], + sizeof(valid_hal_list)-valid_len, "%s'%s'", + valid_hal_list[0]?", ":"", psm3_hal_index_to_str(i)); + valid_len = strlen(valid_hal_list); + if (psm3_hal_table[i]) { + snprintf(&avail_hal_list[avail_len], + sizeof(avail_hal_list)-avail_len, "%s'%s'", + avail_hal_list[0]?", ":"", psm3_hal_index_to_str(i)); + avail_len = strlen(avail_hal_list); + } + } + snprintf(hal_help, sizeof(hal_help), + " 'any' - use first HAL which finds a valid, unfiltered NIC (default)\n" + " valid HALs: %s\n" + " available HALs: %s", valid_hal_list, avail_hal_list); if (! psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { // register the loopback HAL and select it. 
Unlike normal HALs // we don't call psm3_hal_register_instance because it would enforce diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index d5658221c0c..055261da6c4 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -228,6 +228,10 @@ typedef struct _psmi_hal_params uint16_t default_pkey; int8_t *unit_active,*unit_active_valid; int8_t *port_active,*port_active_valid; + uint64_t *port_speed; + int8_t *port_speed_valid; + int *port_lid; + int8_t *port_lid_valid; uint16_t *num_contexts,*num_contexts_valid; uint16_t *num_free_contexts,*num_free_contexts_valid; // information from port_get_subnet @@ -237,6 +241,7 @@ typedef struct _psmi_hal_params psmi_naddr128_t *port_subnet_addr; int *port_subnet_idx; psmi_gid128_t *port_subnet_gid; + char **port_subnet_name; int8_t *unit_pci_bus_valid; uint32_t *unit_pci_bus_domain; @@ -254,6 +259,10 @@ typedef struct _psmi_hal_params #define PSM_HAL_ALG_ACROSS 0 #define PSM_HAL_ALG_WITHIN 1 #define PSM_HAL_ALG_ACROSS_ALL 2 +#define PSM_HAL_ALG_CPU_CENTRIC 3 +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#define PSM_HAL_ALG_GPU_CENTRIC 4 +#endif typedef enum { @@ -499,16 +508,22 @@ int psm3_hal_initialize(int devid_enabled[PTL_MAX_INIT]); int psm3_hal_finalize(void); +// indicate whether we cache data during PSM3 init +extern int init_cache_on; + enum psmi_hal_pre_init_cache_func_krnls { psmi_hal_pre_init_cache_func_get_num_units, psmi_hal_pre_init_cache_func_get_num_ports, psmi_hal_pre_init_cache_func_get_unit_active, psmi_hal_pre_init_cache_func_get_port_active, + psmi_hal_pre_init_cache_func_get_port_speed, + psmi_hal_pre_init_cache_func_get_port_lid, psmi_hal_pre_init_cache_func_get_num_contexts, psmi_hal_pre_init_cache_func_get_num_free_contexts, psmi_hal_pre_init_cache_func_get_default_pkey, psmi_hal_pre_init_cache_func_get_port_subnet, + psmi_hal_pre_init_cache_func_get_port_subnet_name, psmi_hal_pre_init_cache_func_get_unit_pci_bus, psmi_hal_pre_init_cache_func_get_unit_device_id, psmi_hal_pre_init_cache_func_get_unit_device_version, @@ -549,9 +564,6 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) /* DISPATCH_FUNC */ #define psmi_hal_get_unit_name(...) PSMI_HAL_DISPATCH_FUNC(get_unit_name,__VA_ARGS__) -#define psmi_hal_get_port_subnet_name(...) PSMI_HAL_DISPATCH_FUNC(get_port_subnet_name,__VA_ARGS__) -#define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_FUNC(get_port_speed,__VA_ARGS__) -#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_FUNC(get_port_lid,__VA_ARGS__) #define psmi_hal_mq_init_defaults(...) PSMI_HAL_DISPATCH_FUNC(mq_init_defaults,__VA_ARGS__) #define psmi_hal_ep_open_opts_get_defaults(...) PSMI_HAL_DISPATCH_FUNC(ep_open_opts_get_defaults,__VA_ARGS__) #define psmi_hal_context_initstats(...) PSMI_HAL_DISPATCH_FUNC(context_initstats,__VA_ARGS__) @@ -566,10 +578,13 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_get_num_ports_(...) PSMI_HAL_DISPATCH_PI(get_num_ports,##__VA_ARGS__) #define psmi_hal_get_unit_active(...) PSMI_HAL_DISPATCH_PI(get_unit_active,__VA_ARGS__) #define psmi_hal_get_port_active(...) PSMI_HAL_DISPATCH_PI(get_port_active,__VA_ARGS__) +#define psmi_hal_get_port_speed(...) PSMI_HAL_DISPATCH_PI(get_port_speed,__VA_ARGS__) +#define psmi_hal_get_port_lid(...) PSMI_HAL_DISPATCH_PI(get_port_lid,__VA_ARGS__) #define psmi_hal_get_num_contexts(...) PSMI_HAL_DISPATCH_PI(get_num_contexts,__VA_ARGS__) #define psmi_hal_get_num_free_contexts(...) 
PSMI_HAL_DISPATCH_PI(get_num_free_contexts,__VA_ARGS__) #define psmi_hal_get_default_pkey(...) PSMI_HAL_DISPATCH_PI(get_default_pkey,##__VA_ARGS__) #define psmi_hal_get_port_subnet(...) PSMI_HAL_DISPATCH_PI(get_port_subnet,__VA_ARGS__) +#define psmi_hal_get_port_subnet_name(...) PSMI_HAL_DISPATCH_PI(get_port_subnet_name,__VA_ARGS__) #define psmi_hal_get_unit_pci_bus(...) PSMI_HAL_DISPATCH_PI(get_unit_pci_bus,__VA_ARGS__) #define psmi_hal_get_unit_device_id(...) PSMI_HAL_DISPATCH_PI(get_unit_device_id,__VA_ARGS__) #define psmi_hal_get_unit_device_version(...) PSMI_HAL_DISPATCH_PI(get_unit_device_version,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index cf78a99b2ee..913a45dec78 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -209,8 +209,10 @@ static int psm3_hfp_loopback_get_port_lid(int unit, int port, int addr_index) // also prior to the EP being opened static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) { - /* these are only used by ptl_ips */ - mq->hfi_base_window_rv = (~(uint32_t)0); // no rendezvous + mq->ips_cpu_window_rv_str = NULL; // no rendezvous +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + mq->ips_gpu_window_rv_str = NULL; // no rendezvous +#endif mq->hfi_thresh_rv = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; // RDMA and MR cache N/A, leave ep->rdmamode, ep->mr_cache_mode and diff --git a/prov/psm3/psm3/psm2_mq.h b/prov/psm3/psm3/psm2_mq.h index b32c5126ba8..517b4802d5b 100644 --- a/prov/psm3/psm3/psm2_mq.h +++ b/prov/psm3/psm3/psm2_mq.h @@ -173,7 +173,8 @@ extern "C" { * @li If and when possible, receive buffers should be posted as early as * possible and ideally before calling into the progress engine. * @li Use of rendezvous messaging that can be controlled with - * @ref PSM2_MQ_RNDV_HFI_SZ and @ref PSM2_MQ_RNDV_SHM_SZ options. These + * @ref PSM2_MQ_RNDV_HFI_SZ, @ref PSM2_MQ_RNDV_SHM_SZ and + * PSM2_MQ_GPU_RNDV_SHM_SZ options. These * options default to values determined to make effective use of * bandwidth and are hence not advisable for all communication message * sizes, but rendezvous messages inherently prevent unexpected @@ -477,6 +478,7 @@ struct psm2_mq_req_user { * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_GPU_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that can be used to store the value of @@ -498,6 +500,7 @@ psm2_error_t psm3_mq_getopt(psm2_mq_t mq, int option, void *value); * @param[in] option Index of option to retrieve. Possible values are: * @li @ref PSM2_MQ_RNDV_HFI_SZ * @li @ref PSM2_MQ_RNDV_SHM_SZ + * @li @ref PSM2_MQ_GPU_RNDV_SHM_SZ * @li @ref PSM2_MQ_MAX_SYSBUF_MBYTES * * @param[in] value Pointer to storage that contains the value to be updated @@ -519,6 +522,9 @@ psm2_error_t psm3_mq_setopt(psm2_mq_t mq, int option, const void *value); #define PSM2_MQ_FLAG_SENDSYNC 0x01 /**< MQ Send Force synchronous send */ +#define PSM2_MQ_FLAG_INJECT 0x02 + /**< MQ Send Force bounce buffer for */ + /* FI_INJECT/fi_inject behavior */ #define PSM2_MQ_REQINVALID ((psm2_mq_req_t)(NULL)) /**< MQ request completion value */ @@ -710,6 +716,9 @@ psm3_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. 
+ * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -742,6 +751,9 @@ psm3_mq_send(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -776,6 +788,9 @@ psm3_mq_send2(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. @@ -841,6 +856,9 @@ psm3_mq_isend(psm2_mq_t mq, psm2_epaddr_t dest, uint32_t flags, uint64_t stag, * synchronously, meaning that the message will not be sent until * the receiver acknowledges that it has matched the send with a * receive buffer. + * @li PSM2_MQ_FLAG_INJECT tells PSM to consume the send buffer + * immediately to comply with FI_INJECT/fi_inject behavior, + * cannot be used in conjunction with PSM2_MQ_FLAG_SENDSYNC. * @param[in] stag Message Send Tag, array of three 32-bit values. * @param[in] buf Source buffer pointer * @param[in] len Length of message starting at @c buf. diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 479b6f9d732..4ce7de78157 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -99,6 +99,11 @@ /* #define PSM_PROFILE */ #endif +// If defined, for FI_INJECT Send DMA will be avoided +#ifndef PSM_INJECT_NOSDMA +/* #define PSM_INJECT_NOSDMA */ +#endif + #define PSMI_MIN_EP_CONNECT_TIMEOUT (2 * SEC_ULL) #define PSMI_MIN_EP_CLOSE_TIMEOUT (1 * SEC_ULL) #define PSMI_MAX_EP_CLOSE_TIMEOUT (2 * SEC_ULL) @@ -174,9 +179,21 @@ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ +#define PSM_MQ_NIC_RNDV_THRESH 64000 +#define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" +#ifdef PSM_CUDA +#define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" +#elif defined(PSM_ONEAPI) +#define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" +#endif #define PSM_MQ_NIC_MAX_RNDV_WINDOW (4 * 1024 * 1024) /* max rndv window */ #define MQ_SHM_THRESH_RNDV 16000 +#if defined(PSM_CUDA) +#define MQ_SHM_GPU_THRESH_RNDV 127 +#elif defined(PSM_ONEAPI) +#define MQ_SHM_GPU_THRESH_RNDV 127 +#endif // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations // of src_addr presence and tagsel used by a given middleware. 
This diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 047cfbc38a3..35477d69f2f 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -58,7 +58,6 @@ #include "psm_user.h" #include "psm2_hal.h" -static int psmi_parse_nic_selection_algorithm(void); static psm2_error_t psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oindex); @@ -92,481 +91,6 @@ int psm3_context_interrupt_isenabled(psm2_ep_t ep) return psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_INTR_ENABLED); } - -/* returns the 8-bit hash value of an uuid. */ -static inline -uint8_t -psm3_get_uuid_hash(psm2_uuid_t const uuid) -{ - int i; - uint8_t hashed_uuid = 0; - - for (i=0; i < sizeof(psm2_uuid_t); ++i) - hashed_uuid ^= *((uint8_t const *)uuid + i); - - return hashed_uuid; -} - -int psm3_get_current_proc_location() -{ - int core_id, node_id; - - core_id = sched_getcpu(); - if (core_id < 0) - return -EINVAL; - - node_id = numa_node_of_cpu(core_id); - if (node_id < 0) - return -EINVAL; - - return node_id; -} - -// print a bitmask in condensed form at _HFI_VDBG level -// condensed form consolidates sequential numbers such as: "0-43,88-131" -static void vdbg_print_bitmask(const char* prefix, struct bitmask *bmp) -{ - if (_HFI_VDBG_ON) { - int i, len; - char buf[1024]; - int last=-1; - int first=-1; - int max = numa_num_possible_nodes(); - - snprintf(buf, sizeof(buf), "%s", prefix); - len = strlen(buf); - for (i=0; i<max; i++) { - if (! numa_bitmask_isbitset(bmp, i)) - continue; - if (first < 0 || i - last > 1) { - if (first == last) { - // first in a possible sequence - snprintf(&buf[len], sizeof(buf)-len, ",%d", i); - } else { - // complete prior sequence, first in a new sequence - snprintf(&buf[len], sizeof(buf)-len, "-%d,%d", last, i); - } - first = i; - last = first; - } else { - last = i; - } - len = strlen(buf); - } - // complete prior sequence as needed - if (first>=0 && first != last) - snprintf(&buf[len], sizeof(buf)-len, "-%d", last); - _HFI_VDBG("%s\n", buf); - } -} - -// return the largest possible numa ID of a CPU in this system -int psm3_get_max_cpu_numa() -{ - static int max_cpu_numa = -1; - struct bitmask *cpumask, *empty_cpumask; - int i; - - if (max_cpu_numa >= 0) - return max_cpu_numa; - - // we don't depend on numa_num_configured_nodes since in theory there - // could be non-CPU memory NUMA nodes. We only need to know the - // largest possible value for a CPU numa node ID - - // numa_max_node - largest NUMA node which is not disabled - // numa_node_to_cpus - given a NUMA node, create list of CPUs - // numa_node_of_cpu - cpu ID to NUMA (or error if invalid CPU) - // numa_node_to_cpus - cpumask of CPUs on given NUMA node - - max_cpu_numa = -1; - empty_cpumask = numa_allocate_cpumask(); - numa_bitmask_clearall(empty_cpumask); - //vdbg_print_bitmask("empty_cpumask: ", empty_cpumask); - - cpumask = numa_allocate_cpumask(); - _HFI_VDBG("numa_max_node=%d\n", numa_max_node()); - for (i=numa_max_node(); i >= 0; i--) { - numa_bitmask_clearall(cpumask); - int ret = numa_node_to_cpus(i, cpumask); - _HFI_VDBG("i=%d node_to_cpus ret=%d\n", i, ret); - vdbg_print_bitmask("cpumask: ", cpumask); - if (ret >= 0 && ! numa_bitmask_equal(cpumask, empty_cpumask)) { - max_cpu_numa = i; - break; - } - } - numa_free_cpumask(cpumask); - numa_free_cpumask(empty_cpumask); - psmi_assert_always(max_cpu_numa >= 0); - return max_cpu_numa; -} - -/* search the list of all units for those which are active - * and optionally match the given NUMA node_id (when node_id >= 0) - * returns the number of active units found. 
- * Note get_unit_active tests for active ports, valid addresses and - * performs filtering as done in get_port_subnets - */ -static int -hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis) -{ - int found = 0, unit_id; - - for (unit_id = 0; unit_id < nunits; unit_id++) { - int node_id_i; - - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - - if (node_id < 0) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n", - unit_id, psm3_get_mylocalrank()); - } else if (!psmi_hal_get_node_id(unit_id, &node_id_i) - && node_id_i == node_id) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n", - unit_id, node_id, psm3_get_mylocalrank()); - } - } - return found; -} - -static void -psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, - long *unit_end, int nunits) -{ - { - int found, saved_hfis[nunits]; - - /* else, we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - *unit_start = 0; // caller will fail - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - } - _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); -} - -static int -psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) -{ - int shm_fd, ret; - int first_to_create = 0; - size_t shm_name_len = 256; - - psmi_assert_always(psm3_affinity_semaphore_open); - if (psm3_affinity_shared_file_opened) { - /* opened and have our reference counted in shm */ - psmi_assert_always(psm3_affinity_shm_name != NULL); - psmi_assert_always(psm3_shared_affinity_ptr != NULL); - return 0; - } - - psm3_shared_affinity_ptr = NULL; - psm3_affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); - - psmi_assert_always(psm3_affinity_shm_name != NULL); - snprintf(psm3_affinity_shm_name, shm_name_len, - AFFINITY_SHM_BASENAME".%d", - psm3_get_uuid_hash(job_key)); - shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, - S_IRUSR | S_IWUSR); - if ((shm_fd < 0) && (errno == EEXIST)) { - shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); - if (shm_fd < 0) { - _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto free_name; - } - } else if (shm_fd >= 0) { - first_to_create = 1; - } else { - _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto free_name; - } - - ret = ftruncate(shm_fd, PSMI_PAGESIZE); - if ( ret < 0 ) { - _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto close_shm; - } - - psm3_shared_affinity_ptr = (uint64_t *) mmap(NULL, PSMI_PAGESIZE, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, 0); - if (psm3_shared_affinity_ptr == MAP_FAILED) { - _HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n", - psm3_affinity_shm_name, errno); - goto close_shm; - } - close(shm_fd); - shm_fd = -1; - - if (first_to_create) { - _HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", psm3_affinity_shm_name); - - memset(psm3_shared_affinity_ptr, 0, PSMI_PAGESIZE); - - /* - * Once shm object is initialized, unlock others to be able to - * use it. 
- */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - } else { - _HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", psm3_affinity_shm_name); - } - - /* - * Start critical section to increment reference count when creating - * or opening shm object. Decrement of ref count will be done before - * closing the shm. - */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update shm refcount\n"); - goto unmap_shm; - } - - psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; - _HFI_VDBG("shm refcount = %"PRId64"\n", psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); - - /* End critical section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - psm3_affinity_shared_file_opened = 1; - - return 0; - -unmap_shm: - munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); - psm3_shared_affinity_ptr = NULL; -close_shm: - if (shm_fd >= 0) close(shm_fd); -free_name: - psmi_free(psm3_affinity_shm_name); - psm3_affinity_shm_name = NULL; - return -1; -} - -/* - * Spread HFI selection between units if we find more than one within a socket. - */ -static void -psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, - int *saved_hfis, int found, psm2_uuid_t const job_key) -{ - int ret, shm_location; - - /* - * Take affinity lock and open shared memory region to be able to - * accurately determine which HFI to pick for this process. If any - * issues, bail by picking first known HFI. - */ - if (!psm3_affinity_semaphore_open) - goto spread_hfi_fallback; - - ret = psm3_create_and_open_affinity_shm(job_key); - if (ret < 0) - goto spread_hfi_fallback; - - shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; - if (shm_location > PSMI_PAGESIZE) - goto spread_hfi_fallback; - - /* Start critical section to read/write shm object */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update NIC index\n"); - goto spread_hfi_fallback; - } - - *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; - psm3_shared_affinity_ptr[shm_location] = - (psm3_shared_affinity_ptr[shm_location] + 1) % found; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, - psm3_get_mylocalrank(), found); - - /* End Critical Section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - return; - -spread_hfi_fallback: - *unit_start = *unit_end = saved_hfis[0]; -} - -static void -psm3_create_affinity_semaphores(psm2_uuid_t const job_key) -{ - int ret; - size_t sem_len = 256; - - /* - * If already opened, no need to do anything else. - * This could be true for Multi-EP cases where a different thread has - * already created the semaphores. 
We don't need separate locks here as - * we are protected by the overall "psm3_creation_lock" which each - * thread will take in psm3_ep_open() - */ - if (psm3_affinity_semaphore_open) - return; - - psm3_sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len); - psmi_assert_always(psm3_sem_affinity_shm_rw_name != NULL); - snprintf(psm3_sem_affinity_shm_rw_name, sem_len, - SEM_AFFINITY_SHM_RW_BASENAME".%d", - psm3_get_uuid_hash(job_key)); - - ret = psmi_init_semaphore(&psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name, - S_IRUSR | S_IWUSR, 0); - if (ret) { - _HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n", - psm3_sem_affinity_shm_rw_name); - if (psm3_sem_affinity_shm_rw) - sem_close(psm3_sem_affinity_shm_rw); - psmi_free(psm3_sem_affinity_shm_rw_name); - psm3_sem_affinity_shm_rw_name = NULL; - return; - } - - _HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n", - psm3_sem_affinity_shm_rw_name); - - psm3_affinity_semaphore_open = 1; - - return; -} - -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start > end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -static -psm2_error_t -psmi_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive,int nunits, - psm2_uuid_t const job_key, - long *unit_start,long *unit_end) -{ - unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; - int node_id, found = 0; - int saved_hfis[nunits]; - - /* if the user did not set PSM3_NIC then ... */ - if (unit_param == PSM3_NIC_ANY) - { - if (nunitsactive > 1) { - // if NICs are on different planes (non-routed subnets) - // we need to have all ranks default to the same plane - // so force 1st active NIC in that case - int have_subnet = 0, unit_id; - psmi_subnet128_t got_subnet = { }; - for (unit_id = 0; unit_id < nunits; unit_id++) { - psmi_subnet128_t subnet; - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, - addr_index>0?addr_index:0, - &subnet, NULL, NULL, NULL)) - continue; // can't access NIC - if (! have_subnet) { - have_subnet = 1; - got_subnet = subnet; - } else if (! psm3_subnets_match(got_subnet, - subnet)) { - // active units have different tech - // (IB/OPA vs Eth) or different subnets - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); - return PSM2_OK; - } - } - } - - /* Get the actual selection algorithm from the environment: */ - nic_sel_alg = psmi_parse_nic_selection_algorithm(); - /* If round-robin is selection algorithm and ... */ - if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && - /* there are more than 1 active units then ... */ - (nunitsactive > 1)) - { - /* - * Pick first HFI we find on same root complex - * as current task. If none found, fall back to - * RoundRobinAll load-balancing algorithm. 
- */ - node_id = psm3_get_current_proc_location(); - if (node_id >= 0) { - found = hfi_find_active_hfis(nunits, node_id, - saved_hfis); - if (found > 1) { - psm3_create_affinity_semaphores(job_key); - psmi_spread_hfi_within_socket(unit_start, unit_end, - node_id, saved_hfis, - found, job_key); - } else if (found == 1) { - *unit_start = *unit_end = saved_hfis[0]; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, node_id, - psm3_get_mylocalrank(), found); - } - } - - if (node_id < 0 || !found) { - _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", - node_id, - psm3_get_mylocalrank(), found); - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && - (nunitsactive > 1)) { - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", - (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) - ?"Packed":"Only 1 viable NIC", - *unit_start, *unit_end); - } - } else if (unit_param >= 0) { - /* the user specified PSM3_NIC, we use it. */ - *unit_start = *unit_end = unit_param; - _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); - } else { - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open unit: %ld for reading and writing", - unit_param); - return PSM2_EP_DEVICE_FAILURE; - } - - return PSM2_OK; -} - static int psmi_hash_addr_index(long unit, long port, long addr_index) { /* if the user did not set addr_index, then use a hash */ @@ -578,6 +102,9 @@ static int psmi_hash_addr_index(long unit, long port, long addr_index) return addr_index; } +// Open a single NIC. +// if unit_param is PSM3_NIC_ANY, the chosen PSM3_NIC_SELECTION_ALG will be +// used to pick a single active NIC psm2_error_t psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_index, psm2_uuid_t const job_key, uint16_t network_pkey, @@ -620,15 +147,15 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde unit_start = 0; unit_end = nunits - 1; - err = psmi_compute_start_and_end_unit(unit_param, addr_index, + err = psm3_compute_start_and_end_unit(unit_param, addr_index, nunitsactive, nunits, job_key, &unit_start, &unit_end); if (err != PSM2_OK) goto ret; - /* this is the start of a loop that starts at unit_start and goes to unit_end. - but note that the way the loop computes the loop control variable is by - an expression involving the mod operator. */ + /* Loop from unit_start to unit_end inclusive and pick 1st active found + * As needed wrap, so it's valid for unit_start >= unit_end + */ int success = 0; unit_id_prev = unit_id = unit_start; do @@ -645,6 +172,10 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psmi_hash_addr_index(unit_id, port, addr_index), open_timeout, ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { + // in modes where we refcount NIC use, + // psm3_compute_start_and_end_unit will have returned exactly + // 1 NIC and refcount'ed it, so we dec refcount here + psm3_dec_nic_refcount(unit_id); /* go to next unit if failed to open. 
 */
 			unit_id_prev = unit_id;
 			unit_id = (unit_id + 1) % nunits;
@@ -709,6 +240,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde
 
 close:
 	psmi_hal_close_context(ep);
+	psm3_dec_nic_refcount(ep->unit_id);
 bail:
 	_HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno));
 ret:
@@ -720,16 +252,21 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde
 psm2_error_t psm3_context_close(psm2_ep_t ep)
 {
 	psmi_hal_close_context(ep);
+	psm3_dec_nic_refcount(ep->unit_id);
 	return PSM2_OK;
 }
 
+// up to 4 digits per CPU number, plus a comma or dash
+#define MAX_CPU_AFFINITY_STRING (CPU_SETSIZE * 5)
+
 static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t * cpuset) {
 	int i;
-	int isfirst = 1;
-	char tmp[25]; //%d = 10 :: 10 + '-' + 10 + ',' + '\0' = 23
+	char tmp[25]; //%d, = 10+','+'\0' or %d-%d, = 10 + '-' + 10 + ',' + '\0' = 23
 	int first = -1, last = -1;
+	int len = 0;
+	*buf = '\0';
 	for (i = 0; i < CPU_SETSIZE; i++) {
 		if (CPU_ISSET(i, cpuset)) {
 			if (first == -1) {
@@ -745,13 +282,8 @@ static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t *
 			}
 
 			first = last = -1;
-			if (isfirst) {
-				strncpy(buf, tmp, buf_size-1);
-				isfirst=0;
-			} else {
-				strncat(buf, tmp, buf_size-1);
-			}
-			buf[buf_size-1] = '\0';
+			snprintf(&buf[len], buf_size-len,"%s", tmp);
+			len = strlen(buf);
 		}
 	}
@@ -761,26 +293,48 @@ static inline char * _dump_cpu_affinity(char *buf, size_t buf_size, cpu_set_t *
 		} else {
 			snprintf(tmp, sizeof(tmp), "%d-%d,", first, last);
 		}
-		if (isfirst) {
-			strncpy(buf, tmp, buf_size-1);
-		} else {
-			strncat(buf, tmp, buf_size-1);
-		}
-		buf[buf_size-1] = '\0';
+		snprintf(&buf[len], buf_size-len,"%s", tmp);
+		len = strlen(buf);
 	}
 
-	char *comma = strrchr(buf, ',');
-	if (comma) comma[0] = '\0';
+	if (len)
+		buf[len-1] = '\0'; // eliminate trailing comma
 	return buf;
 }
 
-// called by HAL context_open to set affinity consistent with
-// NIC NUMA location when NIC NUMA location is a superset of thread CPU set
-// TBD unclear when this provides value.
+// called by HAL context_open to narrow CPU affinity so it is consistent
+// with NIC NUMA location
+// Intel MPI sets PSM3_NO_CPUAFFINITY to disable this function
+// Suspect this is not effective or has bugs. For Omni-Path the NIC
+// driver set affinity before this was called, so this was likely a noop there.
+// This is a noop if:
+// - the NIC is not NUMA local to any of the CPUs in the existing affinity
+// - the existing affinity selects more cores than those local to the NIC,
+//   even if that set incompletely overlaps the NIC local core set
+//   (suspect this is a bug and the test should be the opposite, or should
+//   just test for overlap)
+// If the NIC is NUMA local to the CPU and the NIC core list is larger than
+// the existing affinity, this will limit the scope of affinity to cores
+// NUMA local to the NIC.
+// - does not consider the full set of selected NICs when multirail is enabled
+// - may only provide value if the CPU set from the caller is small but spans
+//   more than 1 CPU NUMA domain, in which case this will reduce it to a
+//   single CPU NUMA domain matching the NIC's NUMA location.
+//
+// By default this is enabled, but two undocumented variables
+// PSM3_FORCE_CPUAFFINITY and PSM3_NO_CPUAFFINITY can control this
+// as well as the ep_open skip_affinity flag. 
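+//
+// Illustrative example (an assumption for clarity, not part of the original
+// commentary): if the thread arrives bound to cores 0-15 spanning two
+// sockets and the selected NIC is local to cores 0-7, the overlap is
+// non-empty and affinity may be narrowed to cores 0-7; if the thread was
+// bound only to cores 8-15, the overlap is empty and affinity is left
+// untouched.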
+// // May be better if we analyzed NIC NUMA location and various other // process and thread locations when NIC NUMA is a subset of CPU affinity // and guide a good choice for CPU affinity, but that would require // intra-node process coordination to avoid duplicate CPU selections +// +// TBD for GPU affinity this may not make sense. Also PSM3 can't force a GPU +// selection for an app. +// +// TBD when PSM3 is using multiple NICs (PSM3_MULTIRAIL > 0) this should +// be enhanced to attempt to select a CPU based on location of all NICs being +// used, not just a single NIC. int psm3_context_set_affinity(psm2_ep_t ep, int unit) { @@ -796,8 +350,9 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) } if (_HFI_DBG_ON) { - char cpu_buf[128] = {0}; - _HFI_DBG_ALWAYS( "CPU affinity Before set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + char cpu_buf[MAX_CPU_AFFINITY_STRING] = {0}; + _HFI_DBG_ALWAYS( "CPU affinity Before set: %s\n", + _dump_cpu_affinity(cpu_buf, MAX_CPU_AFFINITY_STRING, &cpuset)); } /* @@ -837,10 +392,11 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) //err = -PSM_HAL_ERROR_GENERAL_ERROR; goto bail; } else if (cpu_and_count == 0 && _HFI_DBG_ON) { - char buf1[128] = {0}; - char buf2[128] = {0}; + char buf1[MAX_CPU_AFFINITY_STRING] = {0}; + char buf2[MAX_CPU_AFFINITY_STRING] = {0}; _HFI_DBG_ALWAYS( "CPU affinity not set, NIC selected is not on the same socket as thread (\"%s\" & \"%s\" == 0).\n", - _dump_cpu_affinity(buf1, 128, &nic_cpuset), _dump_cpu_affinity(buf2, 128, &cpuset)); + _dump_cpu_affinity(buf1, MAX_CPU_AFFINITY_STRING, &nic_cpuset), + _dump_cpu_affinity(buf2, MAX_CPU_AFFINITY_STRING, &cpuset)); } } skip_affinity: @@ -852,8 +408,9 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit) "Can't get CPU affinity: %s\n", strerror(errno)); goto bail; } - char cpu_buf[128] = {0}; - _HFI_DBG_ALWAYS( "CPU affinity After set: %s\n", _dump_cpu_affinity(cpu_buf, 128, &cpuset)); + char cpu_buf[MAX_CPU_AFFINITY_STRING] = {0}; + _HFI_DBG_ALWAYS( "CPU affinity After set: %s\n", + _dump_cpu_affinity(cpu_buf, MAX_CPU_AFFINITY_STRING, &cpuset)); } return 0; @@ -904,39 +461,3 @@ psm3_ep_verify_pkey(psm2_ep_t ep, uint16_t pkey, uint16_t *opkey, uint16_t* oind return PSM2_OK; } - -static -int psmi_parse_nic_selection_algorithm(void) -{ - union psmi_envvar_val env_nic_alg; - int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - - const char* PSM3_NIC_SELECTION_ALG_HELP = - "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " - ", Packed[p] or Round Robin All[RoundRobinAll or rra]."; - - /* If a specific unit is set in the environment, use that one. 
*/ - psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"rr", &env_nic_alg); - - if (!strcasecmp(env_nic_alg.e_str, "Round Robin") - || !strcasecmp(env_nic_alg.e_str, "RoundRobin") - || !strcasecmp(env_nic_alg.e_str, "rr")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - else if (!strcasecmp(env_nic_alg.e_str, "Packed") - || !strcasecmp(env_nic_alg.e_str, "p")) - nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; - else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") - || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") - || !strcasecmp(env_nic_alg.e_str, "rra")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; - else { - _HFI_INFO( - "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", - env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - } - - return nic_alg; -} diff --git a/prov/psm3/psm3/psm_context.h b/prov/psm3/psm3/psm_context.h index 188e1284cc4..28339284bcf 100644 --- a/prov/psm3/psm3/psm_context.h +++ b/prov/psm3/psm3/psm_context.h @@ -76,21 +76,4 @@ psm3_context_set_affinity(psm2_ep_t ep, int unit); psm2_error_t psm3_context_interrupt_set(psm2_ep_t ep, int enable); int psm3_context_interrupt_isenabled(psm2_ep_t ep); -/* - * round robin contexts across HFIs, then - * ports; this is the default. - * This option spreads the HFI selection within the local socket. - * If it is preferred to spread job over over entire set of - * HFIs within the system, see ALG_ACROSS_ALL below. - */ -#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS - -#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL - -/* - * use all contexts on an HFI (round robin - * active ports within), then next HFI - */ -#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN - #endif /* PSM_CONTEXT_H */ diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 9e31af3e65c..36dbf40abfa 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -119,385 +119,6 @@ psm2_error_t psm3_ep_num_devunits(uint32_t *num_units_o) return PSM2_OK; } -struct rail_info { - psmi_subnet128_t subnet; - unsigned unit; - unsigned port; - unsigned addr_index; -}; - -static int cmpfunc(const void *p1, const void *p2) -{ - struct rail_info *a = ((struct rail_info *) p1); - struct rail_info *b = ((struct rail_info *) p2); - int ret; - - ret = psmi_subnet128_cmp(a->subnet, b->subnet); - if (ret == 0) { - if (a->addr_index < b->addr_index) - return -1; - else if (a->addr_index > b->addr_index) - return 1; - } - return ret; -} - -// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the -// list of unit/port/addr_index in unit[0-(*num_rails-1)], -// port[0-(*num_rails-1)] and addr_index[0-(*num_rails-1)] -// When *num_rails is returned as 0, multirail is not enabled and -// other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be -// used by the caller to select a single NIC for the process -static psm2_error_t -psm3_ep_multirail(int *num_rails, uint32_t *unit, uint16_t *port, int *addr_index) -{ - uint32_t num_units = 0; - psmi_subnet128_t subnet; - unsigned i, j, k, count = 0; - int ret; - psm2_error_t err = PSM2_OK; - struct rail_info rail_info[PSMI_MAX_RAILS]; - union psmi_envvar_val env_multirail; - union psmi_envvar_val env_multirail_map; - int multirail_within_socket_used = 0; - int node_id = -1, found = 0; - - psm3_getenv("PSM3_MULTIRAIL", - "Use all available NICs in the system for communication.\n" - "-1: No NIC autoselection,\n" - "0: Disabled (default),\n" - "1: Enable 
multirail across all available NICs,\n" - "2: Enable multirail within socket.\n" - "\t For multirail within a socket, we try to find at\n" - "\t least one NIC on the same socket as current task.\n" - "\t If none found, we continue to use other NICs within\n" - "\t the system.", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, - &env_multirail); - if (env_multirail.e_int <= 0) { - *num_rails = 0; - return PSM2_OK; - } - - if (env_multirail.e_int == 2) - multirail_within_socket_used = 1; - -/* - * map is in format: unit:port-addr_index,unit:port-addr_index,... - * where :port is optional (default of 1) and unit can be name or number - * -addr_index is also optionall and defaults to "all" - * addr_index can be an integer between 0 and PSM3_ADDR_PER_NIC-1 - * or "any" or "all". "any" selects a single address using the hash and - * "all" setups a rail for each address. - */ -#define MAX_MAP_LEN (PSMI_MAX_RAILS*128) - if (!psm3_getenv("PSM3_MULTIRAIL_MAP", - "NIC selections for each rail in format:\n" - " rail,rail,...\n" -#if 0 - "Where rail can be: unit:port-addr_index or unit\n" -#else - "Where rail can be: unit-addr_index or unit\n" -#endif - "unit can be device name or unit number\n" -#if 0 - "where :port is optional (default of 1)\n" -#endif - "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" - "When addr_index is omitted, it defaults to 'all'\n" - "default autoselects", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"", &env_multirail_map)) { - - char temp[MAX_MAP_LEN+1]; - char *s; - char *delim; - - strncpy(temp, env_multirail_map.e_str, MAX_MAP_LEN); - if (temp[MAX_MAP_LEN-1] != 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP too long: '%s'", - env_multirail_map.e_str); - s = temp; - psmi_assert(*s); - do { - int u, p = 1; - int skip_port = 0; - int skip_addr_index = 0; - int a_index = PSM3_ADDR_INDEX_ALL; - - if (! *s) // trailing ',' on 2nd or later loop - break; - if (count >= PSMI_MAX_RAILS) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s'", - PSMI_MAX_RAILS, env_multirail_map.e_str); - - // find end of unit field and put in \0 as needed - delim = strpbrk(s, ":-,"); - if (!delim || *delim == ',') { - skip_port = 1; skip_addr_index = 1; - } else if (*delim == '-') { - skip_port = 1; - } - if (delim) - *delim = '\0'; - // parse unit - u = psm3_sysfs_find_unit(s); - if (u < 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid unit: '%s'", s); - // find next field - if (delim) - s = delim+1; - if (! skip_port) { - // find end of port field and put in \0 as needed - delim = strpbrk(s, "-,"); - if (!delim || *delim == ',') - skip_addr_index = 1; - if (delim) - *delim = '\0'; - // parse port - p = psm3_parse_str_long(s); - if (p < 0) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid port: '%s'", s); - // find next field - if (delim) - s = delim+1; - } - if (! 
skip_addr_index) { - // find end of addr_index field and put in \0 as needed - delim = strchr(s, ','); - if (delim) - *delim = '\0'; - // parse addr_index - if (0 == strcmp(s, "all")) - a_index = PSM3_ADDR_INDEX_ALL; // we will loop below - else if (0 == strcmp(s, "any")) - a_index = PSM3_ADDR_INDEX_ANY; // caller will pick - else { - a_index = psm3_parse_str_long(s); - if (a_index < 0 || a_index >= psm3_addr_per_nic) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP invalid addr index: '%s'", s); - } - // find next field - if (delim) - s = delim+1; - } - - if (a_index == PSM3_ADDR_INDEX_ALL) { // all - for (a_index = 0; a_index < psm3_addr_per_nic; a_index++) { - if (count >= PSMI_MAX_RAILS) - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP exceeds %u rails: '%s' due to multi-ip", - PSMI_MAX_RAILS, env_multirail_map.e_str); - unit[count] = u; - port[count] = p; - addr_index[count] = a_index; - count++; - } - } else { - unit[count] = u; - port[count] = p; - addr_index[count] = a_index; - count++; - } - } while (delim); - *num_rails = count; - -/* - * Check if any of the port is not usable. Just use addr_index 0 for check - */ - for (i = 0; i < count; i++) { - _HFI_VDBG("rail %d: %u(%s) %u\n", i, - unit[i], psm3_sysfs_unit_dev_name(unit[i]), port[i]); - ret = psmi_hal_get_port_active(unit[i], port[i]); - if (ret <= 0) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: Unit/port: %d(%s):%d is not active.", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - ret = psmi_hal_get_port_lid(unit[i], port[i], 0 /* addr_index*/); - if (ret <= 0) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: unit %d(%s):%d was filtered out, unable to use", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - ret = psmi_hal_get_port_subnet(unit[i], port[i], 0 /* addr_index*/, NULL, NULL, NULL, NULL); - if (ret == -1) - return psm3_handle_error(NULL, - PSM2_EP_DEVICE_FAILURE, - "PSM3_MULTIRAIL_MAP: Couldn't get subnet for unit %d(%s):%d", - unit[i], psm3_sysfs_unit_dev_name(unit[i]), - port[i]); - } - return PSM2_OK; - } - - if ((err = psm3_ep_num_devunits(&num_units))) { - return err; - } - if (num_units > PSMI_MAX_RAILS) { - _HFI_INFO - ("Found %d units, max %d units are supported, use %d\n", - num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS); - num_units = PSMI_MAX_RAILS; - } - - /* - * PSM3_MULTIRAIL=2 functionality- - * - Try to find at least find one HFI in the same root - * complex. If none found, continue to run and - * use remaining HFIs in the system. - * - If we do find at least one HFI in same root complex, we - * go ahead and add to list. - */ - if (multirail_within_socket_used) { - node_id = psm3_get_current_proc_location(); - for (i = 0; i < num_units; i++) { - if (psmi_hal_get_unit_active(i) <= 0) - continue; - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) { - if (node_id_i == node_id) { - found = 1; - break; - } - } - } - } -/* - * Get all the ports and addr_index with a valid lid and gid, one port per unit. 
- * but up to PSM3_ADDR_PER_NIC addresses - */ - for (i = 0; i < num_units; i++) { - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) - { - if (multirail_within_socket_used && - found && (node_id_i != node_id)) - continue; - } - - for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { - int got_port = 0; - for (k = 0; k < psm3_addr_per_nic; k++) { - ret = psmi_hal_get_port_lid(i, j, k); - if (ret <= 0) - continue; - ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL); - if (ret == -1) - continue; - - rail_info[count].subnet = subnet; - rail_info[count].unit = i; - rail_info[count].port = j; - rail_info[count].addr_index = k; - got_port = 1; - count++; - } - if (got_port) // one port per unit - break; - } - } - -/* - * Sort all the ports within rail_info from small to big. - * This is for multiple fabrics, and we use fabric with the - * smallest subnet to make the master connection. - */ - qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc); - - for (i = 0; i < count; i++) { - unit[i] = rail_info[i].unit; - port[i] = rail_info[i].port; - addr_index[i] = rail_info[i].addr_index; - } - *num_rails = count; - return PSM2_OK; -} - -// this is used to find devices with the same address as another process, -// implying intra-node comms. -// we poplate hfi_nids and nnids with the set of network ids (NID) for -// all the local NICs. -// The caller will see if any of these NIDs match the NID of the remote process. -// Note that NIDs are globally unique and include both subnet and NIC address -// information, so we can compare them regardless of their subnet. -// NIDs which are not on the same subnet will not match. -// NIDs on the same subnet only match if they are the same NIC. -// Two local NICs with the same subnet and same address is an unexpected -// invalid config, and will silently match the two NICs. 
-#define MAX_GID_IDX 31
-static psm2_error_t
-psm3_ep_devnids(psm2_nid_t **nids, uint32_t *num_nids_o)
-{
-	uint32_t num_units = 0;
-	int i;
-	psm2_error_t err = PSM2_OK;
-
-	PSMI_ERR_UNLESS_INITIALIZED(NULL);
-
-	if (hfi_nids == NULL) {
-		if ((err = psm3_ep_num_devunits(&num_units)))
-			goto fail;
-		hfi_nids = (psm2_nid_t *)
-		    psmi_calloc(PSMI_EP_NONE, UNDEFINED,
-				num_units * psmi_hal_get_num_ports()*psm3_addr_per_nic, sizeof(*hfi_nids));
-		if (hfi_nids == NULL) {
-			err = psm3_handle_error(NULL, PSM2_NO_MEMORY,
-						"Couldn't allocate memory for dev_nids structure");
-			goto fail;
-		}
-
-		for (i = 0; i < num_units; i++) {
-			int j;
-			for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) {
-				int k;
-				for (k = 0; k < psm3_addr_per_nic; k++) {
-					int lid = psmi_hal_get_port_lid(i, j, k);
-					int ret, idx = 0;
-					psmi_subnet128_t subnet = { };
-					psmi_naddr128_t addr = { };
-					psmi_gid128_t gid = { };
-
-					// skip ports which aren't ready for use
-					if (lid <= 0)
-						continue;
-					ret = psmi_hal_get_port_subnet(i, j, k, &subnet, &addr, &idx, &gid);
-					if (ret == -1)
-						continue;
-					hfi_nids[nnids] = psm3_build_nid(i, addr, lid);
-					_HFI_VDBG("NIC unit %d, port %d addr_index %d, found %s "
-						"GID[%d] %s subnet %s\n",
-						i, j, k,
-						psm3_nid_fmt(hfi_nids[nnids], 0),
-						idx, psm3_gid128_fmt(gid, 1),
-						psm3_subnet128_fmt(subnet, 2));
-					nnids++;
-				}
-			}
-		}
-		if (nnids == 0) {
-			err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
-						"Couldn't find any unfiltered units");
-			goto fail;
-		}
-	}
-	*nids = hfi_nids;
-	*num_nids_o = nnids;
-
-fail:
-	return err;
-}
-
 psm2_error_t psm3_ep_query(int *num_of_epinfo, psm2_epinfo_t *array_of_epinfo)
 {
 	psm2_error_t err = PSM2_OK;
@@ -632,6 +253,80 @@ psm2_error_t psm3_epaddr_to_epid(psm2_epaddr_t epaddr, psm2_epid_t *epid)
 	return err;
 }
 
+// this is used to find devices with the same address as another process,
+// implying intra-node comms.
+// we populate hfi_nids and nnids with the set of network ids (NID) for
+// all the local NICs.
+// The caller will see if any of these NIDs match the NID of the remote process.
+// Note that NIDs are globally unique and include both subnet and NIC address
+// information, so we can compare them regardless of their subnet.
+// NIDs which are not on the same subnet will not match.
+// NIDs on the same subnet only match if they are the same NIC.
+// Two local NICs with the same subnet and same address are an unexpected,
+// invalid config, and will silently match the two NICs. 
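+// For example (illustrative), if this host's NICs have NIDs {A, B} and a
+// peer process advertises NID A, that peer resolves as intra-node and shm
+// comms can be used to reach it.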
+#define MAX_GID_IDX 31 +static psm2_error_t +psm3_ep_devnids(psm2_nid_t **nids, uint32_t *num_nids_o) +{ + uint32_t num_units = 0; + int i; + psm2_error_t err = PSM2_OK; + + PSMI_ERR_UNLESS_INITIALIZED(NULL); + + if (hfi_nids == NULL) { + if ((err = psm3_ep_num_devunits(&num_units))) + goto fail; + hfi_nids = (psm2_nid_t *) + psmi_calloc(PSMI_EP_NONE, UNDEFINED, + num_units * psmi_hal_get_num_ports()*psm3_addr_per_nic, sizeof(*hfi_nids)); + if (hfi_nids == NULL) { + err = psm3_handle_error(NULL, PSM2_NO_MEMORY, + "Couldn't allocate memory for dev_nids structure"); + goto fail; + } + + for (i = 0; i < num_units; i++) { + int j; + for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { + int k; + for (k = 0; k < psm3_addr_per_nic; k++) { + int lid = psmi_hal_get_port_lid(i, j, k); + int ret, idx = 0; + psmi_subnet128_t subnet = { }; + psmi_naddr128_t addr = { }; + psmi_gid128_t gid = { }; + + // skip ports which aren't ready for use + if (lid <= 0) + continue; + ret = psmi_hal_get_port_subnet(i, j, k, &subnet, &addr, &idx, &gid); + if (ret == -1) + continue; + hfi_nids[nnids] = psm3_build_nid(i, addr, lid); + _HFI_VDBG("NIC unit %d, port %d addr_index %d, found %s " + "GID[%d] %s subnet %s\n", + i, j, k, + psm3_nid_fmt(hfi_nids[nnids], 0), + idx, psm3_gid128_fmt(gid, 1), + psm3_subnet128_fmt(subnet, 2)); + nnids++; + } + } + } + if (nnids == 0) { + err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "Couldn't find any unfiltered units"); + goto fail; + } + } + *nids = hfi_nids; + *num_nids_o = nnids; + +fail: + return err; +} + // Indicate if the given epid is a local process. // In which case we can use intra-node shared memory comms with it. psm2_error_t @@ -714,6 +409,12 @@ psm2_error_t psm3_ep_open_opts_get_defaults(struct psm3_ep_open_opts *opts) psm2_error_t psm3_poll_noop(ptl_t *ptl, int replyonly, bool force); +// open a single internal EP for a single NIC +// For 1st internal EP opts may indicate PSM3_NIC_ANY in which case +// psm3_ep_open_device will let psm3_context_open pick the NIC based on +// PSM3_NIC_SELECTION_ALG. +// For multirail and when opening additional QPs for the NIC, opts will +// select a specific NIC. psm2_error_t psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, struct psm3_ep_open_opts const *opts_i, psm2_mq_t mq, @@ -821,11 +522,13 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, /* Get immediate data size - transfers less than immediate data size do * not consume a send buffer and require just a send descriptor. */ - if (!psm3_getenv("PSM3_SEND_IMMEDIATE_SIZE", - "Immediate data send size not requiring a buffer [128]", - PSMI_ENVVAR_LEVEL_HIDDEN, - PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)128, &envvar_val)) { + if (!psm3_getenv_range("PSM3_SEND_IMMEDIATE_SIZE", + "Immediate data send size not requiring a buffer. 
Default 128.", + "Actual permitted upper limit is NIC dependent.", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)128, + (union psmi_envvar_val)0, (union psmi_envvar_val)1024, + NULL, NULL, &envvar_val)) { opts.imm_size = envvar_val.e_uint; } @@ -1075,12 +778,10 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, psm2_mq_t mq; psm2_epid_t epid; psm2_ep_t ep, tmp; - uint32_t units[PSMI_MAX_QPS]; - uint16_t ports[PSMI_MAX_QPS]; - int addr_indexes[PSMI_MAX_QPS]; - int i, num_rails = 0; + int i; int devid_enabled[PTL_MAX_INIT]; struct psm3_ep_open_opts opts = *opts_i; + struct multirail_config multirail_config = { 0 }; PSM2_LOG_MSG("entering"); PSMI_ERR_UNLESS_INITIALIZED(NULL); @@ -1127,15 +828,15 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, goto fail; if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS)) { - err = psm3_ep_multirail(&num_rails, units, ports, addr_indexes); + err = psm3_ep_multirail(&multirail_config); if (err != PSM2_OK) goto fail; /* If multi-rail is used, set the first ep unit/port */ - if (num_rails > 0) { - opts.unit = units[0]; - opts.port = ports[0]; - opts.addr_index = addr_indexes[0]; + if (multirail_config.num_rails > 0) { + opts.unit = multirail_config.units[0]; + opts.port = multirail_config.ports[0]; + opts.addr_index = multirail_config.addr_indexes[0]; } } #if defined(PSM_CUDA) || defined(PSM_ONEAPI) @@ -1183,13 +884,13 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, psmi_hal_context_initstats(ep); union psmi_envvar_val envvar_val; - if (num_rails <= 0) { + if (multirail_config.num_rails <= 0) { // the NIC has now been selected for our process // use the same NIC for any additional QPs below - num_rails = 1; - units[0] = ep->unit_id; - ports[0] = ep->portnum; - addr_indexes[0] = ep->addr_index; + multirail_config.num_rails = 1; + multirail_config.units[0] = ep->unit_id; + multirail_config.ports[0] = ep->portnum; + multirail_config.addr_indexes[0] = ep->addr_index; } // When QP_PER_NIC >1, creates more than 1 QP on each NIC and then // uses the multi-rail algorithms to spread the traffic across QPs @@ -1204,22 +905,28 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)1, &envvar_val); - if ((num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { + if ((multirail_config.num_rails * envvar_val.e_uint) > PSMI_MAX_QPS) { err = psm3_handle_error(NULL, PSM2_TOO_MANY_ENDPOINTS, "PSM3_QP_PER_NIC (%u) * num_rails (%d) > Max Support QPs (%u)", - envvar_val.e_uint, num_rails, PSMI_MAX_QPS); + envvar_val.e_uint, multirail_config.num_rails, PSMI_MAX_QPS); goto fail; } for (j= 0; j< envvar_val.e_uint; j++) { - for (i = 0; i < num_rails; i++) { - _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, units[i], ports[i], addr_indexes[i]); + // loop will open additional internal EPs for all + // the additional QPs on 1st rail and for all the + // additional rails and all the QPs on those rails + for (i = 0; i < multirail_config.num_rails; i++) { + _HFI_VDBG("rail %d unit %u port %u addr_index %d\n", i, + multirail_config.units[i], + multirail_config.ports[i], + multirail_config.addr_indexes[i]); // did 0, 0 already above if (i == 0 && j== 0) continue; - opts.unit = units[i]; - opts.port = ports[i]; - opts.addr_index = addr_indexes[i]; + opts.unit = multirail_config.units[i]; + opts.port = multirail_config.ports[i]; + opts.addr_index = multirail_config.addr_indexes[i]; /* Create secondary EP */ err = psm3_ep_open_internal(unique_job_key, @@ -1542,6 +1249,15 @@ psm3_parse_devices(int devices[PTL_MAX_INIT]) 
 	int len;
 	int i = 0;
 	union psmi_envvar_val devs;
+	static int have_value = 0;
+	static int saved[PTL_MAX_INIT];
+
+	// only parse once so it doesn't appear in PSM3_VERBOSE_ENV multiple times
+	if (have_value) {
+		for (i=0; i < PTL_MAX_INIT; i++)
+			devices[i] = saved[i];
+		return PSM2_OK;
+	}
 
 	/* See which ptl devices we want to use for this ep to be opened */
 	psm3_getenv("PSM3_DEVICES",
@@ -1605,6 +1321,9 @@ psm3_parse_devices(int devices[PTL_MAX_INIT])
 	*(b_new - 1) = '\0';
 	_HFI_PRDBG("PSM Device allocation order: %s\n", devstr);
+	for (i=0; i < PTL_MAX_INIT; i++)
+		saved[i] = devices[i];
+	have_value = 1;
 fail:
 	if (devstr != NULL)
 		psmi_free(devstr);
diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h
index 609c75ea8b6..c1ec006eff9 100644
--- a/prov/psm3/psm3/psm_ep.h
+++ b/prov/psm3/psm3/psm_ep.h
@@ -123,6 +123,7 @@ struct psm2_ep {
 	uint16_t network_pkey_index;	/**> Pkey index */
 	int did_syslog;
 	const char *dev_name;	/* just for logging */
+	const char *addl_nic_info;	/* just for logging */
 	psm2_uuid_t uuid;
 	uint16_t jkey;
 	uint64_t service_id;	/* OPA service ID */
@@ -271,8 +272,6 @@ struct psm2_epaddr {
 		PSMI_PROFILE_UNBLOCK();		\
 	} while (0)
 
-psm2_error_t psm3_parse_devices(int devices[PTL_MAX_INIT]);
-int psm3_device_is_enabled(const int devices[PTL_MAX_INIT], int devid);
 int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid);
 
 #ifdef PSM_HAVE_RNDV_MOD
diff --git a/prov/psm3/psm3/psm_mpool.c b/prov/psm3/psm3/psm_mpool.c
index d6b6445a154..6bf33b7d74a 100644
--- a/prov/psm3/psm3/psm_mpool.c
+++ b/prov/psm3/psm3/psm_mpool.c
@@ -470,8 +470,10 @@ void MOCKABLE(psm3_mpool_get_obj_info)(mpool_t mp,
 				       uint32_t *num_obj_per_chunk,
 				       uint32_t *num_obj_max_total)
 {
-	*num_obj_per_chunk = mp->mp_num_obj_per_chunk;
-	*num_obj_max_total = mp->mp_num_obj_max_total;
+	if (num_obj_per_chunk)
+		*num_obj_per_chunk = mp->mp_num_obj_per_chunk;
+	if (num_obj_max_total)
+		*num_obj_max_total = mp->mp_num_obj_max_total;
 	return;
 }
 MOCK_DEF_EPILOGUE(psm3_mpool_get_obj_info);
diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c
index ca6cd100b7c..5203715fff8 100644
--- a/prov/psm3/psm3/psm_mq.c
+++ b/prov/psm3/psm3/psm_mq.c
@@ -1445,6 +1445,18 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get)
 		_HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n",
 			mq->shm_thresh_rv, get ? "GET" : "SET");
 		break;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	case PSM2_MQ_GPU_RNDV_SHM_SZ:
+		if (get)
+			*((uint32_t *) value) = mq->shm_gpu_thresh_rv;
+		else {
+			val32 = *((uint32_t *) value);
+			mq->shm_gpu_thresh_rv = val32;
+		}
+		_HFI_VDBG("RNDV_GPU_SHM_SZ = %d (%s)\n",
+			mq->shm_gpu_thresh_rv, get ? "GET" : "SET");
+		break;
+#endif
 	case PSM2_MQ_MAX_SYSBUF_MBYTES:
 		/* Deprecated: this option no longer does anything. */
 		break;
@@ -1597,6 +1609,169 @@ psm3_mq_print_stats_finalize(psm2_mq_t mq)
 	}
 }
 
+/* parse a list of window_rv:limit values for
+ * PSM3_RNDV_NIC_WINDOW and PSM3_GPU_RNDV_NIC_WINDOW
+ * format is window:limit,window:limit,window
+ * limit values must be increasing; the limit for the last entry is optional
+ * and will be UINT32_MAX even if a value is specified. 
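+ * e.g. "131072:524288,4194304" yields a 128KB window for messages up to
+ * 524288 bytes and a 4MB window for all larger messages (the final entry's
+ * limit is forced to UINT32_MAX).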
+ * 0 - successfully parsed, *list points to malloced list + * -1 - str empty, *list unchanged + * -2 - syntax error, *list unchanged + */ +static int psm3_mq_parse_window_rv(const char *str, + size_t errstr_size, char errstr[], + struct psm3_mq_window_rv_entry **list) +{ +#define MAX_WINDOW_STR_LEN 1024 + char temp[MAX_WINDOW_STR_LEN+1]; + char *s; + char *delim; + struct psm3_mq_window_rv_entry *ret = NULL; + int i; + unsigned int win, limit; + int skip_limit; + + if (!str || ! *str) + return -1; + + strncpy(temp, str, MAX_WINDOW_STR_LEN); + if (temp[MAX_WINDOW_STR_LEN-1] != 0) { + // string too long + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u characters", + MAX_WINDOW_STR_LEN-1); + return -2; + } + + s = temp; + i = 0; + do { + if (! *s) // trailing ',' on 2nd or later loop + break; + // find end of window field and put in \0 as needed + delim = strpbrk(s, ":,"); + skip_limit = (!delim || *delim == ','); + if (delim) + *delim = '\0'; + // parse window + if (psm3_parse_str_uint(s, &win, 1, PSM_MQ_NIC_MAX_RNDV_WINDOW)) { + if (errstr_size) + snprintf(errstr, errstr_size, " Invalid window_rv: %s", s); + goto fail; + } + // find next field + if (delim) + s = delim+1; + if (skip_limit) { + limit = UINT32_MAX; + } else { + delim = strpbrk(s, ","); + if (delim) + *delim = '\0'; + //parse limit + if (!strcasecmp(s, "max") || !strcasecmp(s, "maximum")) { + limit = UINT32_MAX; + } else { + if (psm3_parse_str_uint(s, &limit, 1, UINT32_MAX)) { + if (errstr_size) + snprintf(errstr, errstr_size, " Invalid limit: %s", s); + goto fail; + } + } + // find next field + if (delim) + s = delim+1; + } + if (i && ret[i-1].limit >= limit) { + if (errstr_size) + snprintf(errstr, errstr_size, " Limit not increasing: %u", limit); + goto fail; + } + + ret = (struct psm3_mq_window_rv_entry*)psmi_realloc(PSMI_EP_NONE, + UNDEFINED, ret, sizeof(struct psm3_mq_window_rv_entry)*(i+1)); + if (! ret) // keep scans happy + return -2; + ret[i].window_rv = ROUNDUP(win, PSMI_PAGESIZE); + ret[i].limit = limit; + i++; + } while (delim); + if (! i) + return -1; + // force last entry limit to UINT32_MAX so used for all remaining lengths + ret[i-1].limit = UINT32_MAX; + if (list) + *list = ret; + else + psmi_free(ret); + return 0; + +fail: + psmi_free(ret); + return -2; +} + +static int psm3_mq_parse_check_window_rv(int type, + const union psmi_envvar_val val, + void * ptr, + size_t errstr_size, char errstr[]) +{ + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + return psm3_mq_parse_window_rv(val.e_str, errstr_size, errstr, NULL); +} + +PSMI_ALWAYS_INLINE(uint32_t search_window(struct psm3_mq_window_rv_entry *e, + uint32_t len)) +{ + for (; len > e->limit; e++) + ; + return e->window_rv; +} + +// for CPU build, gpu argument ignored, but avoids needing ifdef in callers +uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) +{ + // must do search since window_rv may not be increasing (but usually is) + uint32_t ret = 0; + struct psm3_mq_window_rv_entry *e; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (gpu) + e = mq->ips_gpu_window_rv; + else +#endif + e = mq->ips_cpu_window_rv; + do { + ret = max(ret, e->window_rv); + } while ((e++)->limit < UINT32_MAX); + return ret; +} + +uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req) +{ + if (! 
req->window_rv) {
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		if (req->is_buf_gpu_mem) {
+			req->window_rv = search_window(
+					req->mq->ips_gpu_window_rv,
+					req->req_data.send_msglen);
+		} else
+#endif /* PSM_CUDA || PSM_ONEAPI */
+			req->window_rv = search_window(req->mq->ips_cpu_window_rv,
+					req->req_data.send_msglen);
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		_HFI_VDBG("Selected Window of %u for %u byte %s msg\n",
+			req->window_rv,
+			req->req_data.send_msglen,
+			req->is_buf_gpu_mem?"GPU":"CPU");
+#else
+		_HFI_VDBG("Selected Window of %u for %u byte msg\n",
+			req->window_rv, req->req_data.send_msglen);
+#endif
+	}
+	return req->window_rv;
+}
+
 /*
  * This is the API for the user. We actually allocate the MQ much earlier, but
  * the user can set options after obtaining an endpoint
@@ -2402,6 +2577,9 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo)
 	// shm_thresh_rv is N/A to NIC and HAL, so we set this here and let
 	// HAL set the rest of the defaults
 	mq->shm_thresh_rv = MQ_SHM_THRESH_RNDV;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	mq->shm_gpu_thresh_rv = MQ_SHM_GPU_THRESH_RNDV;
+#endif
 
 	psmi_hal_mq_init_defaults(mq);
 
@@ -2426,6 +2604,9 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 {
 	union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv,
 		env_shmrv, env_hash, env_stats;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	union psmi_envvar_val env_shmgpurv;
+#endif
 
 	// a limit of PSM_MQ_MAX_TINY bytes is hardcoded into the PSM protocol
 	psm3_getenv("PSM3_MQ_TINY_NIC_LIMIT",
@@ -2440,11 +2621,66 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 		(union psmi_envvar_val)mq->hfi_thresh_rv, &env_hfirv);
 	mq->hfi_thresh_rv = env_hfirv.e_uint;
 
-	psm3_getenv("PSM3_MQ_RNDV_NIC_WINDOW",
-		"NIC rendezvous window size, max 4M",
-		PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
-		(union psmi_envvar_val)mq->hfi_base_window_rv, &env_rvwin);
-	mq->hfi_base_window_rv = min(PSM_MQ_NIC_MAX_RNDV_WINDOW, env_rvwin.e_uint);
+#define WINDOW_SYNTAX "Specified as window_size:limit,window_size:limit, ...\nwhere limit is the largest message size the window_size is applicable to.\nThe last window_size in the list will be used for all remaining message\nsizes (e.g. its limit is optional and ignored).\nwindow_size must be <= 4194304 and the limit in each entry must be larger\nthan the prior entry."
+
+	// for loopback, no ips so no window_rv
+	if (mq->ips_cpu_window_rv_str) {
+		int got_depwin = 0;	// using deprecated PSM3_MQ_RNDV_NIC_WINDOW
+
+		// PSM3_RNDV_NIC_WINDOW overrides deprecated PSM3_MQ_RNDV_NIC_WINDOW.
+		// only parse PSM3_MQ_RNDV_NIC_WINDOW if PSM3_RNDV_NIC_WINDOW
+		// was not specified and its default was used. 
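+		// Illustrative precedence: PSM3_RNDV_NIC_WINDOW=262144 wins over
+		// PSM3_MQ_RNDV_NIC_WINDOW=131072; with neither set, the HAL
+		// default in ips_cpu_window_rv_str is used.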
+		if (psm3_getenv_range("PSM3_RNDV_NIC_WINDOW",
+			"List of NIC rendezvous window sizes for messages to and from a CPU buffer.",
+			WINDOW_SYNTAX,
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)(char*)(mq->ips_cpu_window_rv_str),
+			(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+			psm3_mq_parse_check_window_rv, NULL, &env_rvwin) > 0) {
+			// new syntax is superset of old
+			got_depwin = (0 == psm3_getenv_range("PSM3_MQ_RNDV_NIC_WINDOW",
+				"[Deprecated, use PSM3_RNDV_NIC_WINDOW and PSM3_GPU_RNDV_NIC_WINDOW]",
+				"NIC rendezvous window size, max 4194304",
+				PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR,
+				(union psmi_envvar_val)(char*)(mq->ips_cpu_window_rv_str),
+				(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+				psm3_mq_parse_check_window_rv, NULL, &env_rvwin));
+		}
+		if (psm3_mq_parse_window_rv(env_rvwin.e_str, 0, NULL,
+				&mq->ips_cpu_window_rv) < 0) {
+			// already checked, shouldn't get parse errors nor empty strings
+			psmi_assert(0);
+		}
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+		if (PSMI_IS_GPU_ENABLED && mq->ips_gpu_window_rv_str) {
+			union psmi_envvar_val env_gpurvwin;
+			char *env;
+
+			env = psm3_env_get("PSM3_GPU_RNDV_NIC_WINDOW");
+			if (env && *env)
+				got_depwin = 0;	// use new default as default
+			// PSM3_GPU_RNDV_NIC_WINDOW overrides deprecated
+			// PSM3_MQ_RNDV_NIC_WINDOW.
+			// If PSM3_GPU_RNDV_NIC_WINDOW not specified and user specified
+			// PSM3_MQ_RNDV_NIC_WINDOW, use it for GPU too.
+			(void)psm3_getenv_range("PSM3_GPU_RNDV_NIC_WINDOW",
+				"List of NIC rendezvous window sizes for messages to or from a GPU buffer.",
+				WINDOW_SYNTAX,
+				PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR,
+				got_depwin?env_rvwin:
+					(union psmi_envvar_val)(char*)(mq->ips_gpu_window_rv_str),
+				(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+				psm3_mq_parse_check_window_rv, NULL, &env_gpurvwin);
+			if (psm3_mq_parse_window_rv(env_gpurvwin.e_str, 0, NULL,
+					&mq->ips_gpu_window_rv)< 0) {
+				// already checked, shouldn't get parse errors nor empty strings
+				psmi_assert(0);
+			}
+		}
+#else
+		(void)got_depwin;	// keep compiler happy
+#endif /* PSM_CUDA || PSM_ONEAPI */
+	}
 
 	/* Re-evaluate this since it may have changed after initializing the shm
 	 * device */
@@ -2455,6 +2691,17 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq)
 		(union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv);
 	mq->shm_thresh_rv = env_shmrv.e_uint;
 
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	if (PSMI_IS_GPU_ENABLED) {
+		mq->shm_gpu_thresh_rv = psm3_shm_mq_gpu_rv_thresh;
+		psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH",
+			"shm eager-to-rendezvous switchover for GPU send",
+			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
+			(union psmi_envvar_val)mq->shm_gpu_thresh_rv, &env_shmgpurv);
+		mq->shm_gpu_thresh_rv = env_shmgpurv.e_uint;
+	}
+#endif
+
 	psm3_getenv("PSM3_MQ_HASH_THRESH",
 		"linear list to hash tag matching switchover",
 		PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT,
@@ -2486,6 +2733,10 @@ psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq)
 	psm3_mq_req_fini(mq);
 	psm3_mq_sysbuf_fini(mq);
 	psm3_stats_deregister_type(PSMI_STATSTYPE_MQ, mq);
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	psmi_free(mq->ips_gpu_window_rv);
+#endif
+	psmi_free(mq->ips_cpu_window_rv);
 	psmi_free(mq);
 	return PSM2_OK;
 }
diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h
index f83e50bbffd..6c7127b0245 100644
--- a/prov/psm3/psm3/psm_mq_internal.h
+++ b/prov/psm3/psm3/psm_mq_internal.h
@@ -85,6 +85,11 @@ struct psm2_mq_perf_data
 	int perf_print_stats;
 };
 
+struct psm3_mq_window_rv_entry 
{ + uint32_t window_rv; + uint32_t limit; +}; + #ifdef LEARN_HASH_SELECTOR // When transition back to nohash mode, should the prior // learned table_sel be retained for use next time transition to hash mode. @@ -175,9 +180,15 @@ struct psm2_mq { uint32_t hfi_thresh_tiny; uint32_t hfi_thresh_rv; uint32_t shm_thresh_rv; - uint32_t hfi_base_window_rv; /**> this is a base rndv window size, - will be further trimmed down per-connection based - on the peer's MTU */ +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + uint32_t shm_gpu_thresh_rv; +#endif + const char *ips_cpu_window_rv_str; // default input to parser + struct psm3_mq_window_rv_entry *ips_cpu_window_rv; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + const char *ips_gpu_window_rv_str; // default input to parser + struct psm3_mq_window_rv_entry *ips_gpu_window_rv; +#endif uint32_t hash_thresh; int memmode; @@ -313,6 +324,7 @@ struct psm2_mq_req { mq_rts_callback_fn_t rts_callback; psm2_epaddr_t rts_peer; uintptr_t rts_sbuf; + uint32_t window_rv; // window size chosen by receiver or GPU send prefetcher #ifdef PSM_HAVE_REG_MR psm3_verbs_mr_t mr; // local registered memory for app buffer @@ -752,6 +764,9 @@ psm2_mq_req_t psm3_mq_req_match(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_tag_t * psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo); psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq); psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid); +extern uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu); +uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req); + psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq); MOCK_DCL_EPILOGUE(psm3_mq_free); diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index bc90d07c5cf..7b481351843 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -199,11 +199,13 @@ psm3_mq_req_copy(psm2_mq_req_t req, } if (msgptr != buf) { #if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // for loopback HAL, invalid to call psm3_mq_get_window_rv() + // however, for loopback HAL, gdr copy is disabled if (use_gdrcopy) psm3_mq_req_gpu_copy((uint64_t)req->req_data.buf, req->req_data.recv_msglen, (uint64_t)msgptr, msglen_this, - req->mq->hfi_base_window_rv, buf, + psm3_mq_get_window_rv(req), buf, ep); else #endif diff --git a/prov/psm3/psm3/psm_mq_utils.c b/prov/psm3/psm3/psm_mq_utils.c index af2988f64f1..7e80739373a 100644 --- a/prov/psm3/psm3/psm_mq_utils.c +++ b/prov/psm3/psm3/psm_mq_utils.c @@ -82,9 +82,9 @@ psm2_mq_req_t MOCKABLE(psm3_mq_req_alloc)(psm2_mq_t mq, uint32_t type) return req; } else { /* we're out of reqs */ int issend = (type == MQE_TYPE_SEND); - uint32_t reqmax, reqchunk; + uint32_t reqmax; psm3_mpool_get_obj_info(issend ? mq->sreq_pool : mq->rreq_pool, - &reqchunk, &reqmax); + NULL, &reqmax); psm3_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, "Exhausted %d MQ %s request descriptors, which usually indicates " diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c new file mode 100644 index 00000000000..1a451f5eb67 --- /dev/null +++ b/prov/psm3/psm3/psm_nic_select.c @@ -0,0 +1,2098 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2024 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include "psm_user.h"
+#include "psm2_hal.h"
+#ifdef PSM_USE_HWLOC
+#include <hwloc.h>
+#include <hwloc/glibc-sched.h>
+#endif
+
+#define MAX_MAP_LEN (PSMI_MAX_RAILS*128)
+
+// sanity check, psm_user.h should ensure this, unless user tried to
+// manually set PSM_HAVE_GPU_CENTRIC_AFFINITY
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+#ifndef PSM_USE_HWLOC
+#error "PSM_HAVE_GPU_CENTRIC_AFFINITY set without PSM_USE_HWLOC"
+#endif
+#endif
+
+// subnuma is risky right now, so disable and explore in future
+//#ifdef PSM_USE_HWLOC
+//#define PSM3_HAVE_CPU_SUBNUMA
+//#endif
+#undef PSM3_HAVE_CPU_SUBNUMA
+
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+struct pci_addr {
+	uint32_t domain;
+	uint32_t bus;
+	uint32_t dev;
+	uint32_t func;
+};
+#endif
+
+// table of refcount per unit_id counting references by endpoints within
+// local process
+// protected by psm3_creation_lock (held in psm_ep.c during EP open and close)
+static uint64_t psm3_nic_refcount[PSMI_MAX_RAILS];
+
+// psm3_shared_affinity_nic_refcount_ptr is the pointer to table of refcount
+// per unit_id counting references by all processes within node. 
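+// (references are taken when psm3_compute_start_and_end_unit selects a NIC
+// in refcounting modes and released via psm3_dec_nic_refcount on endpoint
+// close or failed open; see psm3_context_open/psm3_context_close)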
+// protected by psm3_sem_affinity_shm_rw semaphore
+
+static int psmi_parse_nic_selection_algorithm(void);
+
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+static hwloc_topology_t psm3_hwloc_topology;
+static int psm3_hwloc_topology_initialized;
+static int psm3_hwloc_topology_init_failed;
+static void psm3_deferred_hwloc_topology_init();
+#endif
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+static int psm3_get_distance_between_pcis(const struct pci_addr *pci_addr_1,
+				const struct pci_addr *pci_addr_2);
+#endif
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+static hwloc_obj_t psm3_get_non_io_ancestor_obj(
+				const struct pci_addr *pci_addr);
+#endif
+
+// As we consider and select NICs, we fill in additional information
+// or set filtered to exclude the NIC from further consideration.
+// The use of filtered avoids the cost of repeatedly compressing the list.
+struct nic_info {
+	uint8_t filtered;	// has NIC been filtered out from possible selection
+	psmi_subnet128_t subnet;
+	unsigned unit;
+	unsigned port;
+	unsigned addr_index;
+	int numa_id;	// CPU NUMA location of NIC
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+	struct pci_addr pci_addr;
+#endif
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+	int cpu_close;	// is CPU sub-numa close to NIC
+#endif
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+	int gpu_distance;
+#endif
+};
+
+
+/* returns the 8-bit hash value of a uuid. */
+static inline
+uint8_t
+psm3_get_uuid_hash(psm2_uuid_t const uuid)
+{
+	int i;
+	uint8_t hashed_uuid = 0;
+
+	for (i=0; i < sizeof(psm2_uuid_t); ++i)
+		hashed_uuid ^= *((uint8_t const *)uuid + i);
+
+	return hashed_uuid;
+}
+
+int psm3_get_current_proc_location()
+{
+	int core_id, node_id;
+
+	core_id = sched_getcpu();
+	if (core_id < 0)
+		return -EINVAL;
+
+	node_id = numa_node_of_cpu(core_id);
+	if (node_id < 0)
+		return -EINVAL;
+
+	return node_id;
+}
+
+// print a bitmask in condensed form at _HFI_VDBG level
+// condensed form consolidates sequential numbers such as: "0-43,88-131"
+static void vdbg_print_bitmask(const char* prefix, struct bitmask *bmp)
+{
+	if (_HFI_VDBG_ON) {
+		int i, len;
+		char buf[1024];
+		int last=-1;
+		int first=-1;
+		int max = numa_num_possible_nodes();
+
+		snprintf(buf, sizeof(buf), "%s", prefix);
+		len = strlen(buf);
+		for (i=0; i<max; i++) {
+			if (! numa_bitmask_isbitset(bmp, i))
+				continue;
+			if (last == -1 || i - last > 1) {
+				if (first == last) {
+					// first in a possible sequence
+					snprintf(&buf[len], sizeof(buf)-len, ",%d", i);
+				} else {
+					// complete prior sequence, first in a new sequence
+					snprintf(&buf[len], sizeof(buf)-len, "-%d,%d", last, i);
+				}
+				first = i;
+				last = first;
+			} else {
+				last = i;
+			}
+			len = strlen(buf);
+		}
+		// complete prior sequence as needed
+		if (first>=0 && first != last)
+			snprintf(&buf[len], sizeof(buf)-len, "-%d", last);
+		_HFI_VDBG("%s\n", buf);
+	}
+}
+
+// return the largest possible numa ID of a CPU in this system
+int psm3_get_max_cpu_numa()
+{
+	static int max_cpu_numa = -1;
+	struct bitmask *cpumask, *empty_cpumask;
+	int i;
+
+	if (max_cpu_numa >= 0)
+		return max_cpu_numa;
+
+	// we don't depend on numa_num_configured_nodes since in theory there
+	// could be non-CPU memory NUMA nodes. 
We only need to know the
+	// largest possible value for a CPU numa node ID
+
+	// numa_max_node - largest NUMA node which is not disabled
+	// numa_node_to_cpus - given a NUMA node, returns the cpumask of CPUs on it
+	// numa_node_of_cpu - cpu ID to NUMA (or error if invalid CPU)
+
+	max_cpu_numa = -1;
+	empty_cpumask = numa_allocate_cpumask();
+	numa_bitmask_clearall(empty_cpumask);
+	//vdbg_print_bitmask("empty_cpumask: ", empty_cpumask);
+
+	cpumask = numa_allocate_cpumask();
+	_HFI_VDBG("numa_max_node=%d\n", numa_max_node());
+	for (i=numa_max_node(); i >= 0; i--) {
+		numa_bitmask_clearall(cpumask);
+		int ret = numa_node_to_cpus(i, cpumask);
+		_HFI_VDBG("i=%d node_to_cpus ret=%d\n", i, ret);
+		vdbg_print_bitmask("cpumask: ", cpumask);
+		if (ret >= 0 && ! numa_bitmask_equal(cpumask, empty_cpumask)) {
+			max_cpu_numa = i;
+			break;
+		}
+	}
+	numa_free_cpumask(cpumask);
+	numa_free_cpumask(empty_cpumask);
+	psmi_assert_always(max_cpu_numa >= 0);
+	return max_cpu_numa;
+}
+
+/* search the list of all units for those which are active
+ * and optionally match the given NUMA node_id (when node_id >= 0)
+ * returns the number of active units found.
+ * Note get_unit_active tests for active ports, valid addresses and
+ * performs filtering as done in get_port_subnets
+ */
+static int
+hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis)
+{
+	int found = 0, unit_id;
+
+	for (unit_id = 0; unit_id < nunits; unit_id++) {
+		int node_id_i;
+
+		if (psmi_hal_get_unit_active(unit_id) <= 0)
+			continue;
+
+		if (node_id < 0) {
+			saved_hfis[found++] = unit_id;
+			_HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n",
+				unit_id, psm3_get_mylocalrank());
+		} else if (!psmi_hal_get_node_id(unit_id, &node_id_i)
+				&& node_id_i == node_id) {
+			saved_hfis[found++] = unit_id;
+			_HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n",
+				unit_id, node_id, psm3_get_mylocalrank());
+		}
+	}
+	return found;
+}
+
+// select NIC across all NICs, use a hash of job_id and local rank to
+// distribute local ranks across NICs and to attempt to distribute
+// jobs across NICs.
+// TBD - if we know we'll never have >1 job per node, we could ignore job_id,
+// perhaps with an env to exclude job_id from the hash so NIC selection is
+// deterministic
+static void
+psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start,
+			long *unit_end, int nunits)
+{
+	{
+		int found, saved_hfis[nunits];
+
+		/* else, we are going to look at:
+		   (a hash of the job key plus the local rank id) mod nunits. 
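e.g. with found=4 active NICs, local rank 2, and a job key hash of 9, this picks saved_hfis[((2+1) + 9) % 4] = saved_hfis[0]. 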
*/ + found = hfi_find_active_hfis(nunits, -1, saved_hfis); + if (found) + *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + + psm3_get_uuid_hash(job_key)) % found]; + else + // none found, caller will fail, start is a don't care + *unit_start = 0; + /* just in case, caller will check all other units, with wrap */ + if (*unit_start > 0) + *unit_end = *unit_start - 1; + else + *unit_end = nunits-1; + } + _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", + *unit_start, *unit_end); +} + +static int +psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) +{ + int shm_fd, ret; + int first_to_create = 0; + size_t shm_name_len = 256; + + psmi_assert_always(psm3_affinity_semaphore_open); + if (psm3_affinity_shared_file_opened) { + /* opened and have our reference counted in shm */ + psmi_assert_always(psm3_affinity_shm_name != NULL); + psmi_assert_always(psm3_shared_affinity_ptr != NULL); + return 0; + } + + psm3_shared_affinity_ptr = NULL; + psm3_affinity_shm_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, shm_name_len); + + psmi_assert_always(psm3_affinity_shm_name != NULL); + snprintf(psm3_affinity_shm_name, shm_name_len, + AFFINITY_SHM_BASENAME".%d", + psm3_get_uuid_hash(job_key)); + shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR | O_CREAT | O_EXCL, + S_IRUSR | S_IWUSR); + if ((shm_fd < 0) && (errno == EEXIST)) { + shm_fd = shm_open(psm3_affinity_shm_name, O_RDWR, S_IRUSR | S_IWUSR); + if (shm_fd < 0) { + _HFI_VDBG("Cannot open affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto free_name; + } + } else if (shm_fd >= 0) { + first_to_create = 1; + } else { + _HFI_VDBG("Cannot create affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto free_name; + } + + ret = ftruncate(shm_fd, PSMI_PAGESIZE); + if ( ret < 0 ) { + _HFI_VDBG("Cannot truncate affinity shared mem fd:%s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto close_shm; + } + + psm3_shared_affinity_ptr = (uint64_t *) mmap(NULL, PSMI_PAGESIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, shm_fd, 0); + if (psm3_shared_affinity_ptr == MAP_FAILED) { + _HFI_VDBG("Cannot mmap affinity shared memory: %s, errno=%d\n", + psm3_affinity_shm_name, errno); + goto close_shm; + } + close(shm_fd); + shm_fd = -1; + + if (first_to_create) { + _HFI_VDBG("Initializing shm to store NIC affinity per socket: %s\n", psm3_affinity_shm_name); + + memset(psm3_shared_affinity_ptr, 0, PSMI_PAGESIZE); + + /* + * Once shm object is initialized, unlock others to be able to + * use it. + */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + } else { + _HFI_VDBG("Opened shm object to read/write NIC affinity per socket: %s\n", psm3_affinity_shm_name); + } + + /* + * Start critical section to increment reference count when creating + * or opening shm object. Decrement of ref count will be done before + * closing the shm. 
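+	 * For illustration (hypothetical sequence): if 3 local processes
+	 * open this shm object, the refcount word holds 3 once all of them
+	 * have passed this section, and each process decrements it again
+	 * when it closes the shm.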
+ */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update shm refcount\n"); + goto unmap_shm; + } + + psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION] += 1; + _HFI_VDBG("shm refcount = %"PRId64"\n", psm3_shared_affinity_ptr[AFFINITY_SHM_REF_COUNT_LOCATION]); + + /* End critical section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + psm3_affinity_shared_file_opened = 1; + + return 0; + +unmap_shm: + munmap(psm3_shared_affinity_ptr, PSMI_PAGESIZE); + psm3_shared_affinity_ptr = NULL; +close_shm: + if (shm_fd >= 0) close(shm_fd); +free_name: + psmi_free(psm3_affinity_shm_name); + psm3_affinity_shm_name = NULL; + return -1; +} + +/* + * Spread HFI selection between units if we find more than one within a socket. + */ +static void +psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, + int *saved_hfis, int found, psm2_uuid_t const job_key) +{ + int ret, shm_location; + + /* + * Take affinity lock and open shared memory region to be able to + * accurately determine which HFI to pick for this process. If any + * issues, bail by picking first known HFI. + */ + if (!psm3_affinity_semaphore_open) + goto spread_hfi_fallback; + + ret = psm3_create_and_open_affinity_shm(job_key); + if (ret < 0) + goto spread_hfi_fallback; + + // one shm entry per CPU NUMA domain + // The entry contains the next round robin NIC to use + // in the form of a index into saved_hfis + // saved_hfis has a list of all the NUMA local active NICs + shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; + if (shm_location > PSMI_PAGESIZE) + goto spread_hfi_fallback; + + /* Start critical section to read/write shm object */ + if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { + _HFI_VDBG("Could not enter critical section to update NIC index\n"); + goto spread_hfi_fallback; + } + + *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; + psm3_shared_affinity_ptr[shm_location] = + (psm3_shared_affinity_ptr[shm_location] + 1) % found; + _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, + psm3_get_mylocalrank(), found); + + /* End Critical Section */ + psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); + + return; + +spread_hfi_fallback: + *unit_start = *unit_end = saved_hfis[0]; +} + +static void +psm3_create_affinity_semaphores(psm2_uuid_t const job_key) +{ + int ret; + size_t sem_len = 256; + + /* + * If already opened, no need to do anything else. + * This could be true for Multi-EP cases where a different thread has + * already created the semaphores. 
We don't need separate locks here as
+	 * we are protected by the overall "psm3_creation_lock" which each
+	 * thread will take in psm3_ep_open()
+	 */
+	if (psm3_affinity_semaphore_open)
+		return;
+
+	psm3_sem_affinity_shm_rw_name = (char *) psmi_malloc(PSMI_EP_NONE, UNDEFINED, sem_len);
+	psmi_assert_always(psm3_sem_affinity_shm_rw_name != NULL);
+	snprintf(psm3_sem_affinity_shm_rw_name, sem_len,
+		SEM_AFFINITY_SHM_RW_BASENAME".%d",
+		psm3_get_uuid_hash(job_key));
+
+	ret = psmi_init_semaphore(&psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name,
+		S_IRUSR | S_IWUSR, 0);
+	if (ret) {
+		_HFI_VDBG("Cannot initialize semaphore: %s for read-write access to shm object.\n",
+			psm3_sem_affinity_shm_rw_name);
+		if (psm3_sem_affinity_shm_rw)
+			sem_close(psm3_sem_affinity_shm_rw);
+		psmi_free(psm3_sem_affinity_shm_rw_name);
+		psm3_sem_affinity_shm_rw_name = NULL;
+		return;
+	}
+
+	_HFI_VDBG("Semaphore: %s created for read-write access to shm object.\n",
+		psm3_sem_affinity_shm_rw_name);
+
+	psm3_affinity_semaphore_open = 1;
+
+	return;
+}
+
+/*
+ * Get all the ports and optionally addr_indexes with a valid lid and gid,
+ * one port per unit but up to PSM3_ADDR_PER_NIC addresses.
+ *
+ * Returns count of entries put in nic_info
+ *
+ * When per_addr_index is set, there will be up to psm3_addr_per_nic entries
+ * per active unit (one entry per unit otherwise), all for the same port
+ * within the unit
+ */
+unsigned nic_info_init(struct nic_info *nic_info, unsigned nunits, int per_addr_index)
+{
+	unsigned unit, port, addr_index;
+	unsigned num_addr_index = per_addr_index ? psm3_addr_per_nic : 1;
+	int ret;
+	unsigned count = 0;
+
+	for (unit = 0; unit < nunits; unit++) {
+		// get_unit_active is redundant since it loops on all ports and
+		// confirms at least 1 port has a valid lid. We test that below.
+		//if (psmi_hal_get_unit_active(unit) <= 0)
+		//	continue;
+		for (port = PSM3_NIC_MIN_PORT; port <= PSM3_NIC_MAX_PORT; port++) {
+			int got_port = 0;
+			for (addr_index = 0; addr_index < num_addr_index; addr_index++) {
+				psmi_subnet128_t subnet;
+				ret = psmi_hal_get_port_lid(unit, port, addr_index);
+				if (ret <= 0)
+					continue;
+				ret = psmi_hal_get_port_subnet(unit, port, addr_index, &subnet, NULL, NULL, NULL);
+				if (ret == -1)
+					continue;
+
+				nic_info[count].filtered = 0;
+				nic_info[count].subnet = subnet;
+				nic_info[count].unit = unit;
+				nic_info[count].port = port;
+				nic_info[count].addr_index = addr_index;
+#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA)
+				nic_info[count].pci_addr.domain = UINT32_MAX;
+#endif
+				got_port = 1;
+				count++;
+			}
+			if (got_port) // one port per unit
+				break;
+		}
+	}
+	return count;
+}
+
+/* If at least 1 NIC matches the current CPU's NUMA id,
+ * filter out all NICs which do not match.
+ * If none match, noop.
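+ * For illustration (hypothetical values): with NIC units 0,1 on NUMA 0
+ * and units 2,3 on NUMA 1, a process on NUMA 0 keeps units 0,1 and
+ * filters units 2,3, while a process on a NUMA with no NICs keeps all 4.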
+ * Also initializes nic_info.numa_id
+ */
+void nic_info_filter_numa(struct nic_info *nic_info, unsigned ninfo)
+{
+	unsigned i;
+	int found = 0;
+
+	int cpu_numa_id = psm3_get_current_proc_location();
+	if (cpu_numa_id < 0) {
+		_HFI_DBG("Unable to determine CPU NUMA location, skipping filter of NIC CPU NUMA location\n");
+		return;
+	}
+
+	for (i=0; i < ninfo; i++)
+	{
+		if (nic_info[i].filtered)
+			continue;
+
+		if (psmi_hal_get_node_id(nic_info[i].unit, &nic_info[i].numa_id) != 0) {
+			// assume match (don't filter this NIC)
+			_HFI_DBG("Unable to determine NIC NUMA location for unit %d (%s), assuming local to CPU NUMA (%d)\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				cpu_numa_id);
+			nic_info[i].numa_id = cpu_numa_id;
+		} else {
+			_HFI_DBG("NIC NUMA location for unit %d (%s) is %d\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				nic_info[i].numa_id);
+		}
+		found += (nic_info[i].numa_id == cpu_numa_id);
+	}
+	if (found) {
+		_HFI_DBG("Found %d unfiltered NUMA local NICs for CPU NUMA id = %d\n",
+			found, cpu_numa_id);
+		// filter out NICs not in cpu_numa_id
+		for (i=0; i < ninfo; i++)
+		{
+			if (nic_info[i].filtered)
+				continue;
+			nic_info[i].filtered = (nic_info[i].numa_id != cpu_numa_id);
+		}
+	} else {
+		_HFI_DBG("No NUMA local NIC found, CPU NUMA id = %d\n", cpu_numa_id);
+	}
+}
+
+/* If at least 1 NIC matches the current CPU's sub-NUMA group,
+ * filter out all NICs which do not match.
+ * If none match, noop.
+ * Also initializes nic_info.pci_addr and nic_info.cpu_close
+ */
+void nic_info_filter_sub_numa(struct nic_info *nic_info, unsigned ninfo)
+{
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+	unsigned i;
+	int found = 0;
+	hwloc_cpuset_t cpu_bind_set;
+
+	psm3_deferred_hwloc_topology_init();
+	if (psm3_hwloc_topology_init_failed)
+		return;	// hwloc incorrect version
+	psmi_assert(psm3_hwloc_topology_initialized);
+
+	// here we use the entire CPU bind set (should match pthread_getaffinity_np)
+	// as opposed to just the current process location.
+	cpu_bind_set = hwloc_bitmap_alloc();
+	if (! cpu_bind_set) {
+		_HFI_DBG("Unable to allocate CPU set, skipping filter of CPU sub-NUMA location\n");
+		return;
+	}
+#if 0
+	// use current process affinity
+	if (hwloc_get_cpubind(psm3_hwloc_topology, cpu_bind_set,
+				HWLOC_CPUBIND_PROCESS)) {
+		_HFI_DBG("Unable to determine process CPU binding, skipping filter of CPU sub-NUMA location\n");
+		goto fail;
+	}
+#else
+	// use current thread affinity
+	pthread_t mythread = pthread_self();
+	if (hwloc_get_thread_cpubind(psm3_hwloc_topology, mythread,
+				cpu_bind_set, HWLOC_CPUBIND_THREAD)) {
+		_HFI_DBG("Unable to determine thread CPU binding, skipping filter of CPU sub-NUMA location\n");
+		goto fail;
+	}
+#endif
+
+	for (i=0; i < ninfo; i++)
+	{
+		if (nic_info[i].filtered)
+			continue;
+		if (nic_info[i].pci_addr.domain == UINT32_MAX
+			&& psmi_hal_get_unit_pci_bus(nic_info[i].unit,
+				&nic_info[i].pci_addr.domain, &nic_info[i].pci_addr.bus,
+				&nic_info[i].pci_addr.dev, &nic_info[i].pci_addr.func)) {
+			_HFI_DBG("Unable to get NIC PCIe address for unit %d (%s)\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit));
+			// can't filter out NIC because if all fail we won't have any.
+			// Unsure how to rank this NIC vs others, so assume not close
+			nic_info[i].cpu_close = 0;
+			continue;
+		}
+
+		hwloc_obj_t ancestor = psm3_get_non_io_ancestor_obj(
+						&nic_info[i].pci_addr);
+		if (!
ancestor) {
+			_HFI_DBG("Unable to determine NIC ancestor for unit %d (%s) at PCIe %04x:%02x:%02x.%x\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus,
+				nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func);
+			// can't filter out NIC because if all fail we won't have any.
+			// Unsure how to rank this NIC vs others, so assume not close
+			nic_info[i].cpu_close = 0;
+			continue;
+		}
+
+		// If either CPU set fully contains the other, consider the NIC
+		// close to the CPU
+		nic_info[i].cpu_close =
+			hwloc_bitmap_isincluded(cpu_bind_set, ancestor->cpuset)
+			|| hwloc_bitmap_isincluded(ancestor->cpuset, cpu_bind_set);
+
+		if (_HFI_DBG_ON) {
+			char buf[256] = {0};
+			hwloc_bitmap_list_snprintf(buf, sizeof(buf), ancestor->cpuset);
+			buf[sizeof(buf)-1] = '\0';	// paranoid, hwloc doc not clear
+			_HFI_DBG_ALWAYS("NIC closeness to CPU for unit %d (%s) at %u:%u:%u:%u is %d, NIC close to CPUs: %s\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+				nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus,
+				nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func,
+				nic_info[i].cpu_close, buf);
+		}
+		found += nic_info[i].cpu_close;
+	}
+	if (found) {
+		if (_HFI_DBG_ON) {
+			char buf[256] = {0};
+			hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpu_bind_set);
+			buf[sizeof(buf)-1] = '\0';	// paranoid, hwloc doc not clear
+			_HFI_DBG_ALWAYS("Found %d unfiltered NICs close to CPUs: %s\n", found, buf);
+		}
+		// filter out NICs not close
+		for (i=0; i < ninfo; i++)
+		{
+			if (nic_info[i].filtered)
+				continue;
+			nic_info[i].filtered = ! nic_info[i].cpu_close;
+		}
+	} else {
+		if (_HFI_DBG_ON) {
+			char buf[256] = {0};
+			hwloc_bitmap_list_snprintf(buf, sizeof(buf), cpu_bind_set);
+			buf[sizeof(buf)-1] = '\0';	// paranoid, hwloc doc not clear
+			_HFI_DBG_ALWAYS("No NICs found close to CPUs: %s\n", buf);
+		}
+	}
+fail:
+	hwloc_bitmap_free(cpu_bind_set);
+#else
+	//_HFI_DBG("Filtering based on CPU closeness to NIC disabled\n");
+#endif
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+/* Find the closest NIC to the current GPU and then filter out all NICs
+ * which are further from the GPU than that closest NIC.
+ * If no GPU for the process yet, or PSM3 GPU support not enabled, noop.
+ * Also initializes nic_info.pci_addr and nic_info.gpu_distance
+ */
+void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo)
+{
+	unsigned i;
+	int min_distance = INT_MAX;	// smallest distance found
+	unsigned found = 0;
+	struct pci_addr gpu_pci_addr;
+
+	if (! PSMI_IS_GPU_ENABLED)
+		return;
+
+	psm3_deferred_hwloc_topology_init();
+	if (psm3_hwloc_topology_init_failed)
+		return;	// hwloc incorrect version
+	psmi_assert(psm3_hwloc_topology_initialized);
+
+	// Get current GPU PCIe address into gpu_pci_addr
+#ifdef PSM_CUDA
+	{
+		int domain, bus, dev;
+		int num_devices;
+		CUdevice device;
+
+		PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices);
+		_HFI_DBG("%d Cuda GPUs found\n", num_devices);
+		if (! num_devices)
+			return;
+
+		if (num_devices == 1) {
+			PSMI_CUDA_CALL(cuDeviceGet, &device, 0);
+		} else {
+			// all GPUs will be visible to process, see if app chose one first
+			CUcontext ctxt = {0};
+			if (! psmi_cuCtxGetCurrent || psmi_cuCtxGetCurrent(&ctxt) || !
ctxt) {
+				_HFI_DBG("Unable to get Cuda ctxt\n");
+				//PSMI_CUDA_CALL(cuDeviceGet, &device, 0);
+				return;
+			} else {
+				PSMI_CUDA_CALL(cuCtxGetDevice, &device);
+			}
+		}
+		_HFI_DBG("Using Cuda GPU %d\n", device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&domain,
+				CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
+				device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&bus,
+				CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
+				device);
+		PSMI_CUDA_CALL(cuDeviceGetAttribute,
+				&dev,
+				CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
+				device);
+		gpu_pci_addr.domain = domain;
+		gpu_pci_addr.bus = bus;
+		gpu_pci_addr.dev = dev;
+		gpu_pci_addr.func = 0;
+	}
+#elif defined(PSM_ONEAPI)
+	{
+		ze_pci_ext_properties_t PciProperties;
+
+		_HFI_DBG("%d Level Zero GPUs found\n", num_ze_devices);
+		if (! num_ze_devices)
+			return;
+
+		// calling middleware will have limited GPUs visible to process
+		PSMI_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt,
+				ze_devices[0].dev, &PciProperties);
+		gpu_pci_addr.domain = PciProperties.address.domain;
+		gpu_pci_addr.bus = PciProperties.address.bus;
+		gpu_pci_addr.dev = PciProperties.address.device;
+		gpu_pci_addr.func = PciProperties.address.function;
+	}
+#endif
+	_HFI_DBG("GPU PCIe address is %04x:%02x:%02x.%x\n",
+		gpu_pci_addr.domain, gpu_pci_addr.bus,
+		gpu_pci_addr.dev, gpu_pci_addr.func);
+
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		if (nic_info[i].pci_addr.domain == UINT32_MAX
+			&& psmi_hal_get_unit_pci_bus(nic_info[i].unit,
+				&nic_info[i].pci_addr.domain, &nic_info[i].pci_addr.bus,
+				&nic_info[i].pci_addr.dev, &nic_info[i].pci_addr.func)) {
+			_HFI_DBG("Unable to get NIC PCIe address for unit %d (%s)\n",
+				nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit));
+			// can't filter out NIC because if all fail we won't have any.
+			// Unsure how to rank this NIC vs others, so use max distance
+			nic_info[i].gpu_distance = INT_MAX;
+			continue;
+		}
+		nic_info[i].gpu_distance = psm3_get_distance_between_pcis(
+				&nic_info[i].pci_addr, &gpu_pci_addr);
+		_HFI_DBG("NIC PCIe address for unit %d (%s) is %04x:%02x:%02x.%x distance to GPU: %d\n",
+			nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+			nic_info[i].pci_addr.domain, nic_info[i].pci_addr.bus,
+			nic_info[i].pci_addr.dev, nic_info[i].pci_addr.func,
+			nic_info[i].gpu_distance);
+		if (nic_info[i].gpu_distance < min_distance) {
+			min_distance = nic_info[i].gpu_distance;
+		}
+	}
+	if (min_distance == INT_MAX) {
+		_HFI_DBG("No NIC found with a known distance\n");
+		return;	// noop
+	}
+
+	// filter out all NICs with a distance > min_distance
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		psmi_assert(nic_info[i].gpu_distance >= min_distance);
+		nic_info[i].filtered = (nic_info[i].gpu_distance > min_distance);
+		found += ! nic_info[i].filtered;
+	}
+	_HFI_DBG("Found %d unfiltered NICs with GPU distance of %d\n",
+		found, min_distance);
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// filter down the list of NICs with a CPU locality focus as priority.
+// If present, the GPU is considered last. If the GPU is NUMA local
+// to the CPU, the GPU filter can further limit NICs to those close to the
+// GPU (same PCIe switch). But if the GPU is not NUMA local to the CPU,
+// the gpu distance filter may still limit distance or end up being a noop.
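+// Editor's illustration (hypothetical system): starting from 4 NICs, the
+// sub-NUMA filter might keep the 2 NICs under the process's PCIe root,
+// the NUMA filter then becomes a noop (both already NUMA local), and the
+// GPU distance filter keeps the 1 NIC sharing the GPU's PCIe switch.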
+static void nic_info_filter_cpu_centric(struct nic_info *nic_info,
+				unsigned ninfo)
+{
+	_HFI_DBG("Filtering NICs with CPU Centric Strategy\n");
+	nic_info_filter_sub_numa(nic_info, ninfo);
+	nic_info_filter_numa(nic_info, ninfo);
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+	nic_info_filter_gpu_distance(nic_info, ninfo);
+#endif
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+// filter down the list of NICs with a GPU locality focus as priority.
+// When there is a GPU, once we have selected NICs closest to that
+// GPU we are likely to have limited ourselves to NICs in the same
+// NUMA as the GPU, so the CPU NUMA tests will become noops.
+// For example, a GPU and NIC on the same PCIe switch will by definition
+// be in the same CPU root complex and hence same CPU NUMA.
+// But if there is no GPU, or none of the NICs are close to the GPU,
+// the CPU numa tests may narrow the list of NICs.
+static void nic_info_filter_gpu_centric(struct nic_info *nic_info,
+				unsigned ninfo)
+{
+	_HFI_DBG("Filtering NICs with GPU Centric Strategy\n");
+	nic_info_filter_gpu_distance(nic_info, ninfo);
+	nic_info_filter_numa(nic_info, ninfo);
+	nic_info_filter_sub_numa(nic_info, ninfo);
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// analyze the refcount table and filter out NICs with refcounts
+// higher than the lowest found.
+// If all NICs have equal refcounts, noop.
+static void
+nic_info_filter_refcount(struct nic_info *nic_info, unsigned ninfo,
+				uint64_t *refcount, unsigned nunits, const char *str)
+{
+	unsigned i;
+	uint64_t min_refcount = UINT64_MAX;	// smallest refcount found
+	unsigned found = 0;
+
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		psmi_assert(nic_info[i].unit < nunits);
+		_HFI_DBG("NIC %s reference count for unit %d (%s) is %"PRIu64"\n", str,
+			nic_info[i].unit, psm3_sysfs_unit_dev_name(nic_info[i].unit),
+			refcount[nic_info[i].unit]);
+		if (refcount[nic_info[i].unit] < min_refcount) {
+			min_refcount = refcount[nic_info[i].unit];
+			psmi_assert(nic_info[i].unit < nunits);
+		}
+	}
+	if (min_refcount == UINT64_MAX) {
+		// unexpected, should have found a smaller value
+		_HFI_DBG("No NIC found with a low %s reference count\n", str);
+		return;	// noop
+	}
+
+	// filter out all NICs with a refcount > min_refcount
+	for (i=0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		psmi_assert(refcount[nic_info[i].unit] >= min_refcount);
+		nic_info[i].filtered = (refcount[nic_info[i].unit] > min_refcount);
+		found += ! nic_info[i].filtered;
+	}
+	_HFI_DBG("Found %d unfiltered NICs with %s reference count of %"PRIu64"\n",
+		found, str, min_refcount);
+}
+
+// return index in nic_info of 1st unfiltered NIC
+static unsigned
+nic_info_get_first_unfiltered_nic(struct nic_info *nic_info, unsigned ninfo)
+{
+	unsigned i;
+	for (i=0; i < ninfo; i++) {
+		if (! nic_info[i].filtered)
+			return i;
+	}
+	psmi_assert(0);
+	return 0;
+}
+
+/*
+ * Select NIC among the unfiltered NICs in nic_info while
+ * scoreboarding use of each NIC and picking the one with lowest
+ * unit number and least use.
+ *
+ * Scoreboarding starts with the local process's NIC usage across all EPs.
+ * This helps to ensure a given process balances itself across unfiltered NICs
+ * on the assumption that all local processes will ultimately have the same
+ * number of endpoints.
+ *
+ * After the local process scoreboarding, the shm scoreboard is checked
+ * to pick a NIC based on lowest refcount within the server, thus balancing
+ * NIC usage within the server.
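+ * For illustration (hypothetical values): with unfiltered units {0,1},
+ * a process whose first endpoint already took unit 0 has local counts
+ * {1,0}, so its second endpoint narrows to unit 1 before the node-wide
+ * shm counts are even consulted.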
+ *
+ * Among NICs with the lowest reference counts, the lowest entry in nic_info
+ * (also lowest unit_id) is selected.
+ * This assumes only one entry appears in nic_info for each unit_id
+ * (e.g. nic_info_init was given per_addr_index of 0) and the entries in
+ * nic_info are sorted by unit_id (in order built by nic_info_init).
+ *
+ * Due to call sequence prior to this, nic_info list will already be sorted by
+ * unit_id since it was built in that order by nic_info_init.
+ * Returns index in nic_info of selected NIC.
+ * On any issues, selects 1st NIC
+ */
+static int
+psm3_open_shm_scoreboard_and_select_nic(
+				struct nic_info *nic_info, unsigned ninfo,
+				psm2_uuid_t const job_key, unsigned nunits)
+{
+	int ret, shm_location, index;
+
+	psmi_assert(nunits > 0);
+	psmi_assert(ninfo > 0);
+
+	// balance among endpoints within current process
+	nic_info_filter_refcount(nic_info, ninfo,
+				psm3_nic_refcount, nunits, "local process");
+
+	psm3_create_affinity_semaphores(job_key);
+	/*
+	 * Take affinity lock and open shared memory region to be able to
+	 * accurately determine which NIC to pick for this process. If any
+	 * issues, bail by picking first unfiltered NIC in nic_info
+	 */
+	if (!psm3_affinity_semaphore_open)
+		goto fallback;
+
+	ret = psm3_create_and_open_affinity_shm(job_key);
+	if (ret < 0)
+		goto fallback;
+
+	// start of scoreboard area, we keep refcount for each unit_id.
+	// Note that some other modes may organize the shm area differently,
+	// so it's important that all processes and all endpoints use the same
+	// fundamental modes for PSM3_MULTIRAIL and PSM3_NIC_SELECTION_ALG
+	shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION;
+	if (shm_location + sizeof(*psm3_shared_affinity_ptr)*nunits > PSMI_PAGESIZE)
+		goto fallback;
+
+	// psm3_shared_affinity_nic_refcount_ptr points at a table in Linux
+	// shared memory, indexed by unit_id, with a reference count per NIC
+	// showing the total endpoints within the job which are using the NIC.
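+	// For illustration (hypothetical values): with nunits==4 the table
+	// occupies 4 uint64_t words starting at
+	// psm3_shared_affinity_ptr[AFFINITY_SHM_HFI_INDEX_LOCATION], one
+	// count per unit_id, all within the single page checked above.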
psm3_shared_affinity_nic_refcount_ptr =
+			&psm3_shared_affinity_ptr[shm_location];
+
+	/* Start critical section to read/write shm object */
+	if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) {
+		_HFI_VDBG("Could not enter critical section to update NIC index\n");
+		goto fallback;
+	}
+
+	// balance among processes within current node
+	nic_info_filter_refcount(nic_info, ninfo,
+				psm3_shared_affinity_nic_refcount_ptr,
+				nunits, "local node");
+
+	// use lowest index among those which remain
+	index = nic_info_get_first_unfiltered_nic(nic_info, ninfo);
+
+	// update reference counts for node level and process level
+	psm3_shared_affinity_nic_refcount_ptr[nic_info[index].unit]++;
+	psm3_nic_refcount[nic_info[index].unit]++;
+
+	/* End Critical Section */
+	psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name);
+
+	psmi_assert(index >= 0 && index < ninfo);
+	_HFI_DBG("Selected NIC unit %d(%s)\n",
+		nic_info[index].unit, psm3_sysfs_unit_dev_name(nic_info[index].unit));
+	return index;
+
+fallback:
+	index = nic_info_get_first_unfiltered_nic(nic_info, ninfo);
+	psm3_nic_refcount[nic_info[index].unit]++;	// inc process level refcount
+	return index;
+}
+
+// decrement reference counts which were incremented in local process
+// and in shm within node.
+// For modes which do not track this style of refcounts, psm3_nic_refcount
+// will be zero for every unit_id and psm3_shared_affinity_nic_refcount_ptr
+// will be NULL (likewise if psm3 has been finalized)
+void psm3_dec_nic_refcount(int unit_id)
+{
+	// in some modes we don't track refcount, in which case do nothing
+	if (psm3_nic_refcount[unit_id])
+		psm3_nic_refcount[unit_id]--;
+	if (psm3_affinity_shared_file_opened && psm3_affinity_semaphore_open
+		&& psm3_shared_affinity_nic_refcount_ptr) {
+		/* Start critical section to read/write shm object */
+		if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) {
+			_HFI_VDBG("Could not enter critical section to update NIC refcount\n");
+		} else {
+			psm3_shared_affinity_nic_refcount_ptr[unit_id]--;
+			/* End Critical Section */
+			psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name);
+		}
+	}
+}
+
+psm2_error_t
+psm3_compute_start_and_end_unit_cpu_centric(
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end, int nunits)
+{
+	unsigned index;
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// caller will enumerate addr_index, so just get all active ports
+	ninfo = nic_info_init(nic_info, nunits, 0);
+	if (! ninfo) {
+		// should not happen, caller already confirmed there is >1 active unit
+		// mimic what caller of psm3_compute_start_and_end_unit would do
+		return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+			"PSM3 no nic units are active");
+	}
+
+	nic_info_filter_cpu_centric(nic_info, ninfo);
+
+	index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo,
+				job_key, nunits);
+	psmi_assert(index >= 0 && index < ninfo);
+
+	// caller will select 1st active port and an addr_index within unit
+	*unit_start = *unit_end = nic_info[index].unit;
+	return PSM2_OK;
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+psm2_error_t
+psm3_compute_start_and_end_unit_gpu_centric(
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end, int nunits)
+{
+	unsigned index;
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// caller will enumerate addr_index, so just get all active ports
+	ninfo = nic_info_init(nic_info, nunits, 0);
+	if (!
ninfo) {
+		// should not happen, caller already confirmed there is >1 active unit
+		// mimic what caller of psm3_compute_start_and_end_unit would do
+		return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE,
+			"PSM3 no nic units are active");
+	}
+
+	nic_info_filter_gpu_centric(nic_info, ninfo);
+
+	index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo,
+				job_key, nunits);
+	psmi_assert(index >= 0 && index < ninfo);
+
+	// caller will select 1st active port and an addr_index within unit
+	*unit_start = *unit_end = nic_info[index].unit;
+	return PSM2_OK;
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// return set of units to consider and which to start at.
+// caller will use 1st active unit which can be opened.
+// caller will wrap around so it's valid for start >= end
+// Note: When using multiple rails per PSM process, higher level code will
+// walk through desired units and unit_param will specify a specific unit.
+// If unit_param is PSM3_NIC_ANY, this will pick the starting point for the
+// nic search
+psm2_error_t
+psm3_compute_start_and_end_unit(long unit_param, long addr_index,
+				int nunitsactive, int nunits,
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end)
+{
+	unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS;
+	int node_id, found = 0;
+	int saved_hfis[nunits];
+
+	/* if the user did not set PSM3_NIC then ... */
+	if (unit_param == PSM3_NIC_ANY)
+	{
+		if (nunitsactive > 1) {
+			// if NICs are on different planes (non-routed subnets)
+			// we need to have all ranks default to the same plane
+			// so force 1st active NIC in that case
+			int have_subnet = 0, unit_id;
+			psmi_subnet128_t got_subnet = { };
+			for (unit_id = 0; unit_id < nunits; unit_id++) {
+				psmi_subnet128_t subnet;
+				if (psmi_hal_get_unit_active(unit_id) <= 0)
+					continue;
+				if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/,
+							addr_index>0?addr_index:0,
+							&subnet, NULL, NULL, NULL))
+					continue;	// can't access NIC
+				if (! have_subnet) {
+					have_subnet = 1;
+					got_subnet = subnet;
+				} else if (! psm3_subnets_match(got_subnet,
+							subnet)) {
+					// active units have different tech
+					// (IB/OPA vs Eth) or different subnets
+					// caller will pick 1st active unit
+					*unit_start = 0;
+					*unit_end = nunits - 1;
+					_HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n",
+						*unit_start, *unit_end);
+					return PSM2_OK;
+				}
+			}
+		}
+
+		/* Get the actual selection algorithm from the environment: */
+		nic_sel_alg = psmi_parse_nic_selection_algorithm();
+		/* If round-robin is the selection algorithm and ... */
+		if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) &&
+			/* there is more than 1 active unit then ... */
+			(nunitsactive > 1))
+		{
+			/*
+			 * Pick an HFI on same root complex as current task.
+			 * Linux IPC ensures balanced NIC usage within the job.
+			 * If none found, fall back to
+			 * RoundRobinAll load-balancing algorithm.
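+			 * For illustration (hypothetical values): on a node
+			 * with units {0,1} on NUMA 0 and {2,3} on NUMA 1, a
+			 * rank on NUMA 1 round-robins between units 2 and 3
+			 * via the shm counter; a rank on a NUMA with no NICs
+			 * falls back to RoundRobinAll across all 4 units.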
+ */ + node_id = psm3_get_current_proc_location(); + if (node_id >= 0) { + found = hfi_find_active_hfis(nunits, node_id, + saved_hfis); + if (found > 1) { + psm3_create_affinity_semaphores(job_key); + psmi_spread_hfi_within_socket(unit_start, unit_end, + node_id, saved_hfis, + found, job_key); + } else if (found == 1) { + *unit_start = *unit_end = saved_hfis[0]; + _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", + *unit_start, node_id, + psm3_get_mylocalrank(), found); + } + } + + if (node_id < 0 || !found) { + _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", + node_id, + psm3_get_mylocalrank(), found); + psmi_spread_nic_selection(job_key, unit_start, + unit_end, nunits); + } + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && + (nunitsactive > 1)) { + psmi_spread_nic_selection(job_key, unit_start, + unit_end, nunits); + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_CPU_CENTRIC) && + (nunitsactive > 1)) { + return psm3_compute_start_and_end_unit_cpu_centric(job_key, + unit_start, unit_end, nunits); +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_GPU_CENTRIC) && + (nunitsactive > 1)) { + return psm3_compute_start_and_end_unit_gpu_centric(job_key, + unit_start, unit_end, nunits); +#endif + } else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit + // caller will pick 1st active unit + *unit_start = 0; + *unit_end = nunits - 1; + _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", + (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) + ?"Packed":"Only 1 viable NIC", + *unit_start, *unit_end); + } + } else if (unit_param >= 0) { + /* the user specified PSM3_NIC, we use it. */ + *unit_start = *unit_end = unit_param; + _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); + } else { + psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 can't open unit: %ld for reading and writing", + unit_param); + return PSM2_EP_DEVICE_FAILURE; + } + + return PSM2_OK; +} + +static +int psmi_parse_nic_selection_algorithm(void) +{ + union psmi_envvar_val env_nic_alg; + int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + + const char* PSM3_NIC_SELECTION_ALG_HELP = + "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " + ", Packed[p], Round Robin All[RoundRobinAll or rra]," +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " CPU Centric Round Robin [CpuRoundRobin or crr]" + ", or GPU Centric Round Robin [GpuRoundRobin or grr]"; +#else + " or CPU Centric Round Robin [CpuRoundRobin or crr]"; +#endif + + + /* If a specific unit is set in the environment, use that one. 
*/ + psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"rr", &env_nic_alg); + + if (!strcasecmp(env_nic_alg.e_str, "Round Robin") + || !strcasecmp(env_nic_alg.e_str, "RoundRobin") + || !strcasecmp(env_nic_alg.e_str, "rr")) + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + else if (!strcasecmp(env_nic_alg.e_str, "Packed") + || !strcasecmp(env_nic_alg.e_str, "p")) + nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; + else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") + || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") + || !strcasecmp(env_nic_alg.e_str, "rra")) + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; + else if (!strcasecmp(env_nic_alg.e_str, "CPU Centric Round Robin") + || !strcasecmp(env_nic_alg.e_str, "CpuRoundRobin") + || !strcasecmp(env_nic_alg.e_str, "crr")) + nic_alg = PSMI_UNIT_SEL_ALG_CPU_CENTRIC; +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + else if (!strcasecmp(env_nic_alg.e_str, "GPU Centric Round Robin") + || !strcasecmp(env_nic_alg.e_str, "GpuRoundRobin") + || !strcasecmp(env_nic_alg.e_str, "grr")) + nic_alg = PSMI_UNIT_SEL_ALG_GPU_CENTRIC; +#endif + else { + _HFI_INFO( + "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", + env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); + nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + } + + return nic_alg; +} + +/* parse a list of NIC rails for PSM3_MULTIRAIL_MAP + * map is in format: unit:port-addr_index,unit:port-addr_index,...;unit.... + * where :port is optional (default of 1) and unit can be name or number + * -addr_index is also optional and defaults to "all" + * addr_index can be an integer between 0 and PSM3_ADDR_PER_NIC-1 + * or "any" or "all". "any" selects a single address using the hash and + * "all" setups a rail for each address. + * ; may separate sets of rails. When more than 1 set is presented, the + * map_index selects which set is used. + * Returns: + * 0 - successfully parsed, config_out updated + * -1 - str empty, config_out unchanged + * -2 - syntax error, config_out partially updated + */ +static int psm3_parse_multirail_map(const char *str, int map_index, + size_t errstr_size, char errstr[], + struct multirail_config *config_out) +{ + char temp[MAX_MAP_LEN+1]; + char *s; + char *delim; + char delim_char = '\0'; + unsigned i; + int ret; + int set_index = 0; + + if (!str || ! *str) + return -1; + + strncpy(temp, str, MAX_MAP_LEN); + if (temp[MAX_MAP_LEN-1] != 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u characters", + MAX_MAP_LEN-1); + return -2; + } + config_out->num_rails = 0; + s = temp; + psmi_assert(*s); + do { + int u; + unsigned int p = 1; + int skip_port = 0; + int skip_addr_index = 0; + long a_index = PSM3_ADDR_INDEX_ALL; + + if (! 
*s) { // trailing ',' or ';' on 2nd or later loop + if (delim_char == ';') + set_index--; // never started next set + break; + } + if (delim_char == ';') { + // start of a new set + config_out->num_rails = 0; + } + if (config_out->num_rails >= PSMI_MAX_RAILS) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Value too long, limit %u rails", + PSMI_MAX_RAILS); + return -2; + } + + // find end of unit field and put in \0 as needed + delim = strpbrk(s, ":-,;"); + if (!delim || *delim == ',' || *delim == ';') { + skip_port = 1; skip_addr_index = 1; + } else if (*delim == '-') { + skip_port = 1; + } + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse unit + u = psm3_sysfs_find_unit(s); + if (u < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid unit: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + if (! skip_port) { + // find end of port field and put in \0 as needed + delim = strpbrk(s, "-,;"); + if (!delim || *delim == ',' || *delim == ';') + skip_addr_index = 1; + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse port + if (psm3_parse_str_uint(s, &p, 0, UINT_MAX) < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid port: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + } + if (! skip_addr_index) { + // find end of addr_index field and put in \0 as needed + delim = strpbrk(s, ",;"); + if (delim) { + delim_char = *delim; + *delim = '\0'; + } else { + delim_char = '\0'; + } + // parse addr_index + if (0 == strcmp(s, "all")) + a_index = PSM3_ADDR_INDEX_ALL; // we will loop below + else if (0 == strcmp(s, "any")) + a_index = PSM3_ADDR_INDEX_ANY; // caller will pick + else if (psm3_parse_str_long(s, &a_index, 0, psm3_addr_per_nic-1)) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Invalid addr index: '%s'", s); + return -2; + } + // find next field + if (delim) + s = delim+1; + } + + if (a_index == PSM3_ADDR_INDEX_ALL) { // all + for (a_index = 0; a_index < psm3_addr_per_nic; a_index++) { + if (config_out->num_rails >= PSMI_MAX_RAILS) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Limit of %u rails exceeded due to multi-addr", + PSMI_MAX_RAILS); + return -2; + } + config_out->units[config_out->num_rails] = u; + config_out->ports[config_out->num_rails] = p; + config_out->addr_indexes[config_out->num_rails] = a_index; + config_out->num_rails++; + } + } else { + config_out->units[config_out->num_rails] = u; + config_out->ports[config_out->num_rails] = p; + config_out->addr_indexes[config_out->num_rails] = a_index; + config_out->num_rails++; + } + if (delim_char == ';') { + if (set_index == map_index) + break; // found it, stop parsing + set_index++; // start of next + } + } while (delim); + + // if only 1 set input, we use it, otherwise must have enough sets for us + psmi_assert(set_index >= 0); + if (set_index > 0 && set_index != map_index) { + if (errstr_size) + snprintf(errstr, errstr_size, + " Insufficient sets specified: %d need at least %d", + set_index+1, map_index+1); + return -2; + } + psmi_assert(set_index == 0 || set_index == map_index); + + // must have at least 1 rail. Since we caught empty string above, + // if we get here without any rails input must be something like "," or ";" + // and we'll treat that as a syntax error + if (! 
config_out->num_rails) {
+		if (errstr_size)
+			snprintf(errstr, errstr_size, " No rails specified");
+		return -2;
+	}
+
+	// Check if any of the ports are not usable. Just use addr_index 0 for check
+	for (i = 0; i < config_out->num_rails; i++) {
+		_HFI_VDBG("rail %d: %u(%s) %u\n", i,
+			config_out->units[i],
+			psm3_sysfs_unit_dev_name(config_out->units[i]),
+			config_out->ports[i]);
+
+		ret = psmi_hal_get_port_active(config_out->units[i],
+						config_out->ports[i]);
+		if (ret <= 0) {
+			if (errstr_size)
+				snprintf(errstr, errstr_size,
+					" Unit:port: %d(%s):%d is not active.",
+					config_out->units[i],
+					psm3_sysfs_unit_dev_name(config_out->units[i]),
+					config_out->ports[i]);
+			return -2;
+		}
+
+		ret = psmi_hal_get_port_lid(config_out->units[i],
+						config_out->ports[i], 0 /* addr_index*/);
+		if (ret <= 0) {
+			if (errstr_size)
+				snprintf(errstr, errstr_size,
+					" Unit:port: %d(%s):%d was filtered out, unable to use",
+					config_out->units[i],
+					psm3_sysfs_unit_dev_name(config_out->units[i]),
+					config_out->ports[i]);
+			return -2;
+		}
+
+		ret = psmi_hal_get_port_subnet(config_out->units[i],
+						config_out->ports[i], 0 /* addr_index*/,
+						NULL, NULL, NULL, NULL);
+		if (ret == -1) {
+			if (errstr_size)
+				snprintf(errstr, errstr_size,
+					" Couldn't get subnet for unit %d (%s):%d",
+					config_out->units[i],
+					psm3_sysfs_unit_dev_name(config_out->units[i]),
+					config_out->ports[i]);
+			return -2;
+		}
+	}
+
+	// valid input
+	return 0;
+}
+
+static int psm3_parse_check_multirail_map(int type,
+				const union psmi_envvar_val val, void *ptr,
+				size_t errstr_size, char errstr[])
+{
+	struct multirail_config temp;
+	int map_index = *(int*)ptr;
+	psmi_assert(type == PSMI_ENVVAR_TYPE_STR);
+	return psm3_parse_multirail_map(val.e_str, map_index, errstr_size, errstr,
+					&temp);
+}
+
+// comparison function for qsort.
+// Sort by subnet 1st, then by nic unit, then by addr_index.
+// NICs are already numbered in alphabetic order so this effectively
+// sorts by subnet, then nic name, then addr_index.
+// We simply ignore the filtered field; filtered NICs will also get sorted
+// but omitted from the final output list by the caller
+static int niccmpfunc(const void *p1, const void *p2)
+{
+	struct nic_info *a = ((struct nic_info *) p1);
+	struct nic_info *b = ((struct nic_info *) p2);
+	int ret;
+
+	ret = psmi_subnet128_cmp(a->subnet, b->subnet);
+	if (ret == 0) {
+		if (a->unit < b->unit)
+			return -1;
+		else if (a->unit > b->unit)
+			return 1;
+
+		if (a->addr_index < b->addr_index)
+			return -1;
+		else if (a->addr_index > b->addr_index)
+			return 1;
+	}
+	return ret;
+}
+
+/*
+ * Sort all the ports within nic_info from small to big.
+ * So, when there are multiple fabrics, we will use the fabric with the
+ * smallest subnet to make the master connection.
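+ * For illustration (hypothetical values): entries
+ * {subnetB/unit1, subnetA/unit2, subnetA/unit0} sort to
+ * {subnetA/unit0, subnetA/unit2, subnetB/unit1}, so subnetA carries the
+ * master connection.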
+ */
+static void
+psm3_copy_nic_info_to_multitrail_config(
+			struct nic_info *nic_info, unsigned ninfo,
+			struct multirail_config *multirail_config)
+{
+	unsigned i, j;
+
+	qsort(nic_info, ninfo, sizeof(nic_info[0]), niccmpfunc);
+
+	multirail_config->num_rails = 0;
+	j = 0;
+	for (i = 0; i < ninfo; i++) {
+		if (nic_info[i].filtered)
+			continue;
+		multirail_config->units[j] = nic_info[i].unit;
+		multirail_config->ports[j] = nic_info[i].port;
+		multirail_config->addr_indexes[j] = nic_info[i].addr_index;
+		multirail_config->num_rails++;
+		j++;
+	}
+}
+
+// select a list of NICs to use, optimizing for CPU locality first
+static psm2_error_t
+psm3_ep_multirail_autoselect_cpu_centric(uint32_t nunits,
+				struct multirail_config *multirail_config)
+{
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// enumerate addr_index too
+	ninfo = nic_info_init(nic_info, nunits, 1);
+	if (! ninfo) {
+		// caller will try single NIC selection next
+		multirail_config->num_rails = 0;
+		return PSM2_OK;
+	}
+
+	nic_info_filter_cpu_centric(nic_info, ninfo);
+
+	// we will use all unfiltered units
+
+	// ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU
+	// selection; it will be called per rail, and if rails are in
+	// different CPU NUMA domains it could have an undesired impact
+	setenv("PSM3_NO_AFFINITY", "1", 1);
+
+	psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config);
+	return PSM2_OK;
+}
+
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+// select a list of NICs to use, optimizing for GPU locality first
+static psm2_error_t
+psm3_ep_multirail_autoselect_gpu_centric(uint32_t nunits,
+				struct multirail_config *multirail_config)
+{
+	unsigned ninfo;
+	struct nic_info nic_info[PSMI_MAX_RAILS];
+
+	// enumerate addr_index too
+	ninfo = nic_info_init(nic_info, nunits, 1);
+	if (!
ninfo) {
+		// caller will try single NIC selection next
+		multirail_config->num_rails = 0;
+		return PSM2_OK;
+	}
+
+	nic_info_filter_gpu_centric(nic_info, ninfo);
+
+	// we will use all unfiltered units
+
+	// ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU
+	// selection; it will be called per rail, and if rails are in
+	// different CPU NUMA domains it could have an undesired impact
+	setenv("PSM3_NO_AFFINITY", "1", 1);
+
+	psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config);
+	return PSM2_OK;
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+// for use in psm3_ep_multirail_autoselect so can sort rails
+// by subnet and addr_index
+struct rail_info {
+	psmi_subnet128_t subnet;
+	unsigned unit;
+	unsigned port;
+	unsigned addr_index;
+};
+
+static int cmpfunc(const void *p1, const void *p2)
+{
+	struct rail_info *a = ((struct rail_info *) p1);
+	struct rail_info *b = ((struct rail_info *) p2);
+	int ret;
+
+	ret = psmi_subnet128_cmp(a->subnet, b->subnet);
+	if (ret == 0) {
+		if (a->addr_index < b->addr_index)
+			return -1;
+		else if (a->addr_index > b->addr_index)
+			return 1;
+	}
+	return ret;
+}
+
+// Multirail enabled, autoselect one or more NICs for this process
+// multirail_mode is PSM3_MULTIRAIL selection (1=all NICs, 2=NUMA local NICs)
+static psm2_error_t
+psm3_ep_multirail_autoselect(int multirail_mode,
+				struct multirail_config *multirail_config)
+{
+	uint32_t num_units = 0;
+	psmi_subnet128_t subnet;
+	unsigned i, j, k, count = 0;
+	int ret;
+	psm2_error_t err = PSM2_OK;
+	struct rail_info rail_info[PSMI_MAX_RAILS];
+	int multirail_within_socket_used = 0;
+	int node_id = -1, found = 0;
+
+	if (multirail_mode == 2)
+		multirail_within_socket_used = 1;
+
+
+	if ((err = psm3_ep_num_devunits(&num_units))) {
+		return err;
+	}
+
+	if (num_units > PSMI_MAX_RAILS) {
+		_HFI_INFO
+			("Found %d units, max %d units are supported, using first %d\n",
+			num_units, PSMI_MAX_RAILS, PSMI_MAX_RAILS);
+		num_units = PSMI_MAX_RAILS;
+	}
+
+	if (multirail_mode == 3)
+		return psm3_ep_multirail_autoselect_cpu_centric(num_units, multirail_config);
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+	if (multirail_mode == 4)
+		return psm3_ep_multirail_autoselect_gpu_centric(num_units, multirail_config);
+#endif
+
+	/*
+	 * PSM3_MULTIRAIL=2 functionality:
+	 * - Try to find at least one NIC in the same root
+	 *   complex. If none found, continue to run and
+	 *   use the remaining NICs in the system.
+	 * - If we do find at least one NIC in same root complex, we
+	 *   go ahead and add to list.
+	 */
+	if (multirail_within_socket_used) {
+		node_id = psm3_get_current_proc_location();
+		for (i = 0; i < num_units; i++) {
+			if (psmi_hal_get_unit_active(i) <= 0)
+				continue;
+			int node_id_i;
+
+			if (!psmi_hal_get_node_id(i, &node_id_i)) {
+				if (node_id_i == node_id) {
+					found = 1;
+					break;
+				}
+			}
+		}
+	}
+/*
+ * Get all the ports and addr_index with a valid lid and gid, one port per
+ * unit, but up to PSM3_ADDR_PER_NIC addresses.
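+ * For illustration (hypothetical values): with psm3_addr_per_nic == 2, a
+ * unit whose first usable port reports a valid lid for addr_index 0 and 1
+ * contributes two rail_info entries, both on that port.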
If we are using the NUMA selection
+ * algorithm and found at least 1 NUMA local NIC above, limit the list to
+ * NUMA local NICs, otherwise list all NICs
+ */
+	for (i = 0; i < num_units; i++) {
+		int node_id_i;
+
+		if (!psmi_hal_get_node_id(i, &node_id_i))
+		{
+			if (multirail_within_socket_used &&
+				found && (node_id_i != node_id))
+				continue;
+		}
+
+		for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) {
+			int got_port = 0;
+			for (k = 0; k < psm3_addr_per_nic; k++) {
+				ret = psmi_hal_get_port_lid(i, j, k);
+				if (ret <= 0)
+					continue;
+				ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL);
+				if (ret == -1)
+					continue;
+
+				rail_info[count].subnet = subnet;
+				rail_info[count].unit = i;
+				rail_info[count].port = j;
+				rail_info[count].addr_index = k;
+				got_port = 1;
+				count++;
+			}
+			if (got_port) // one port per unit
+				break;
+		}
+	}
+
+/*
+ * Sort all the ports within rail_info from small to big.
+ * This is for multiple fabrics, and we use the fabric with the
+ * smallest subnet to make the master connection.
+ */
+	qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc);
+
+	for (i = 0; i < count; i++) {
+		multirail_config->units[i] = rail_info[i].unit;
+		multirail_config->ports[i] = rail_info[i].port;
+		multirail_config->addr_indexes[i] = rail_info[i].addr_index;
+	}
+	multirail_config->num_rails = count;
+	return PSM2_OK;
+}
+
+// process PSM3_MULTIRAIL and PSM3_MULTIRAIL_MAP and return the
+// list of unit/port/addr_index in multirail_config.
+// When multirail_config->num_rails is returned as 0, multirail is not enabled
+// and other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be
+// used by the caller to select a single NIC for the process.
+// This can return num_rails==1 if exactly 1 NIC is to be used by this process,
+// or num_rails>1 if this process is to stripe data across multiple NICs,
+// in which case the 1st NIC in multirail_config should be used as the
+// primary NIC for job communications setup.
+psm2_error_t
+psm3_ep_multirail(struct multirail_config *multirail_config)
+{
+	int ret;
+	union psmi_envvar_val env_multirail;
+	union psmi_envvar_val env_multirail_map;
+	int map_index;
+
+	psm3_getenv_range("PSM3_MULTIRAIL",
+		"Control use of multiple NICs",
+		"-1: No PSM3 NIC autoselection (middleware selects 1 NIC per process).\n"
+		" 0: (default) Middleware may select NICs or use PSM3 'autoselect_one'\n"
+		"    interface. 'autoselect_one' interface will pick 1 NIC per process\n"
+		"    based on PSM3_NIC_SELECTION_ALG.\n"
+		" 1: Enable multirail, each process uses all available NICs. Only 'autoselect'\n"
+		"    interface presented to middleware.\n"
+		" 2: Enable multirail, each process uses all NUMA local NICs. Only 'autoselect'\n"
+		"    interface presented to middleware. If no NUMA local NICs found for a given\n"
+		"    process, PSM3 will use all available NICs for that process.\n"
+		" 3: Enable multirail, each process selects only ideally located NICs with\n"
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+		"    consideration of NIC, CPU"
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+		" sub-NUMA"
+#endif
+		" and GPU locations with priority given\n"
+		"    to CPU locality. 
Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process and all NICs are equal\n" + " distance to the GPU, PSM3 will use all available NICs for that process.\n" +#else + " consideration of NIC and CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " locations.\n" + " Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process, PSM3 will use all\n" + " available NICs for that process.\n" +#endif +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + " 4: Enable multirail, each process selects only ideally located NICs with\n" + " consideration of NIC, GPU, and CPU" +#ifdef PSM3_HAVE_CPU_SUBNUMA + " sub-NUMA" +#endif + " locations with priority given\n" + " to GPU locality. Only 'autoselect' interface presented to middleware.\n" + " If no NUMA local NICs are found for a given process, PSM3 will use all\n" + " available NICs of equal distance to the GPU for that process." +#endif + , + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + (union psmi_envvar_val)-1, (union psmi_envvar_val)4, +#else + (union psmi_envvar_val)-1, (union psmi_envvar_val)3, +#endif + NULL, NULL, &env_multirail); + if (env_multirail.e_int <= 0) { + // will pick 1 NIC per process + multirail_config->num_rails = 0; + return PSM2_OK; + } + + if (env_multirail.e_int == 1 || env_multirail.e_int == 2) { + // TBD - move this code to a separate function + // for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. + // We treat invalid input, such as bad syntax or selection of an unusable + // port (down/missing/etc), as a fatal error instead of attempting to run + // on the default PSM3_MULTIRAIL_MAP config. This helps avoid + // inconsistent NIC selections, especially for down ports, which may + // cause confusing behaviors or errors. + // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then + // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select + // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) 
to select + if (env_multirail.e_int == 1) { + map_index = psm3_get_mylocalrank(); + } else if (env_multirail.e_int == 2) { + map_index = psm3_get_current_proc_location(); + if (map_index < 0) { + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Unable to get NUMA location of current process\n"); + } + } else { + psmi_assert(0); + } + ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", + "Explicit NIC selections for each rail", + "Specified as:\n" + " rail,rail,...;rail,rail,...\n" +#if 0 + "Where rail can be: unit:port-addr_index or unit\n" +#else + "Where rail can be: unit-addr_index or unit\n" +#endif + "unit can be device name or unit number\n" +#if 0 + "where :port is optional (default of 1)\n" +#endif + "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" + "When addr_index is omitted, it defaults to 'all'\n" + "When more than 1 set of rails is present (each set is separated by ;),\n" + "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" + " 1 - use local rank number to select\n" + " 2 - use local CPU NUMA to select\n" + "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + psm3_parse_check_multirail_map, &map_index, &env_multirail_map); + if (ret < 0) { // syntax error in input, ret error instead of using default + psmi_assert(0); // should not get here since specified FLAG_FATAL + multirail_config->num_rails = 0; + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", + env_multirail_map.e_str); + } + if (! ret) { + // valid input + if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, + multirail_config) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + return PSM2_OK; + } + } + + // multirail enabled, automatically select 1 or more NICs + return psm3_ep_multirail_autoselect(env_multirail.e_int, multirail_config); +} + +// potential job start hwloc initialization. To avoid overhead +// when hwloc is not needed, we defer to the 1st actual need for hwloc +void +psm3_hwloc_topology_init() +{ +} + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +// deferred hwloc initialization. Caller must hold psm3_creation_lock +static void psm3_deferred_hwloc_topology_init() +{ + unsigned version; + Dl_info info_hwloc; + const char *location; + + // only try once + if (psm3_hwloc_topology_initialized || psm3_hwloc_topology_init_failed) + return; + +#define SHOW_HWLOC_VERSION(ver) (ver)>>16, ((ver) >> 8) & 0xff, (ver) & 0xff + version = hwloc_get_api_version(); + location = dladdr(hwloc_topology_init, &info_hwloc) ? 
+ info_hwloc.dli_fname : "hwloc path not available"; + if ((version >> 16) != (HWLOC_API_VERSION >> 16)) { + _HFI_ERROR("PSM3 was compiled for hwloc API %u.%u.%u but found library API %u.%u.%u at %s.\n" + "You may need to point LD_LIBRARY_PATH to the right hwloc library.\n" + "Disabling some NIC selection affinity features\n", + SHOW_HWLOC_VERSION(HWLOC_API_VERSION), SHOW_HWLOC_VERSION(version), + location); + psm3_hwloc_topology_init_failed = 1; + return; + } + // HWLOC_VERSION string mentioned in docs, but not defined in headers + psm3_print_identify("%s %s hwloc runtime API %u.%u.%u at %s, built against API %u.%u.%u\n", + psm3_get_mylabel(), psm3_ident_tag, + SHOW_HWLOC_VERSION(version), location, + SHOW_HWLOC_VERSION(HWLOC_API_VERSION)); + + hwloc_topology_init(&psm3_hwloc_topology); + // detection configuration, need all PCI devices and CPU sub-numa + // HWLOC_API_VERSION is rev X.Y.Z as (X<<16)+(Y<<8)+Z + // significant API changes from 1.0 to 2.0, including ABI changes +#if HWLOC_API_VERSION < 0x20000 + hwloc_topology_set_flags(psm3_hwloc_topology, + HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_IO_BRIDGES); +#else + hwloc_topology_set_io_types_filter(psm3_hwloc_topology, + HWLOC_TYPE_FILTER_KEEP_ALL); +#endif + hwloc_topology_load(psm3_hwloc_topology); + psm3_hwloc_topology_initialized = 1; +} +#endif /* defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) */ + +void +psm3_hwloc_topology_destroy() +{ +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) + if (psm3_hwloc_topology_initialized) { + psm3_hwloc_topology_initialized = 0; + hwloc_topology_destroy(psm3_hwloc_topology); + } +#endif +} + +#if defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) +/* Get the next PCI device in the system. + * + * return the first PCI device if prev is NULL. + * looping on this allows iterating through all PCIe devices + * device=any PCIe component (root controller, bridge, switch, device, etc) + */ +static inline hwloc_obj_t +get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev) +{ + return hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PCI_DEVICE, prev); +} + +/* Find the PCI device hwloc object matching the PCI bus id + * given domain, bus, device and func PCI bus id. + */ +static hwloc_obj_t +get_pcidev_by_busid(hwloc_topology_t topology, + const struct pci_addr *addr) +{ + hwloc_obj_t obj = NULL; + while ((obj = get_next_pcidev(topology, obj)) != NULL) { + if (obj->attr->pcidev.domain == addr->domain + && obj->attr->pcidev.bus == addr->bus + && obj->attr->pcidev.dev == addr->dev + && obj->attr->pcidev.func == addr->func) + return obj; + } + return NULL; +} +#endif /* defined(PSM_HAVE_GPU_CENTRIC_AFFINITY) || defined(PSM3_HAVE_CPU_SUBNUMA) */ + +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +// compare two hwloc objects for equality +// 1 on match, 0 on mismatch +static int equal_hwlocobj(const hwloc_obj_t obj1, const hwloc_obj_t obj2) +{ + return (obj1->type == obj2->type + && obj1->depth == obj2->depth + && obj1->logical_index == obj2->logical_index); +} + +// compute distance in between objects (PCIe devices). 
+// If the devices are on different PCIe controllers and/or different CPU sockets
+// returns INT_MAX
+static int get_distance_to_common_ancestor(const hwloc_obj_t obj1, const hwloc_obj_t obj2)
+{
+	int d1 = 0;
+	int d2 = 0;
+	hwloc_obj_t temp1 = obj1;
+
+	while (temp1) {
+		hwloc_obj_t temp2 = obj2;
+		d2 = 0;
+
+		while (temp2) {
+			/* common ancestor found */
+			if (equal_hwlocobj(temp1, temp2)) {
+				return d1 + d2;
+			}
+			temp2 = temp2->parent;
+			d2++;
+		}
+		temp1 = temp1->parent;
+		d1++;
+	}
+
+	/* No common ancestor found, return INT_MAX as the distance */
+	return INT_MAX;
+}
+
+// compute distance in PCIe hops between devices.
+// If the devices are on different PCIe controllers and/or different CPU sockets
+// returns INT_MAX
+static int psm3_get_distance_between_pcis(const struct pci_addr *pci_addr_1,
+					const struct pci_addr *pci_addr_2)
+{
+	hwloc_obj_t obj1 = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr_1);
+	hwloc_obj_t obj2 = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr_2);
+	return get_distance_to_common_ancestor(obj1, obj2);
+}
+#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */
+
+#ifdef PSM3_HAVE_CPU_SUBNUMA
+// find ancestor of a device, namely the PCIe controller in the CPU socket
+static hwloc_obj_t psm3_get_non_io_ancestor_obj(
+					const struct pci_addr *pci_addr)
+{
+	hwloc_obj_t obj = get_pcidev_by_busid(psm3_hwloc_topology, pci_addr);
+	if (! obj)
+		return NULL;
+	return hwloc_get_non_io_ancestor_obj(psm3_hwloc_topology, obj);
+}
+#endif /* PSM3_HAVE_CPU_SUBNUMA */
diff --git a/prov/psm3/psm3/psm_nic_select.h b/prov/psm3/psm3/psm_nic_select.h
new file mode 100644
index 00000000000..cfd23ea1081
--- /dev/null
+++ b/prov/psm3/psm3/psm_nic_select.h
@@ -0,0 +1,116 @@
+/*
+
+  This file is provided under a dual BSD/GPLv2 license.  When using or
+  redistributing this file, you may do so under either license.
+
+  GPL LICENSE SUMMARY
+
+  Copyright(c) 2024 Intel Corporation.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+
+  This program is distributed in the hope that it will be useful, but
+  WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  General Public License for more details.
+
+  Contact Information:
+  Intel Corporation, www.intel.com
+
+  BSD LICENSE
+
+  Copyright(c) 2024 Intel Corporation.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Intel Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED.
  IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */
+
+#ifndef _PSMI_IN_USER_H
+#error psm_nic_select.h not meant to be included directly, include psm_user.h instead
+#endif
+
+#ifndef _PSM_NIC_SELECT_H
+#define _PSM_NIC_SELECT_H
+
+// PSM3_NIC_SELECTION_ALG choices
+/*
+ * round robin contexts across HFIs, then
+ * ports; this is the default.
+ * This option spreads the HFI selection within the local socket.
+ * If it is preferred to spread the job over the entire set of
+ * HFIs within the system, see ALG_ACROSS_ALL below.
+ */
+#define PSMI_UNIT_SEL_ALG_ACROSS    PSM_HAL_ALG_ACROSS
+
+#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL
+
+/*
+ * use all contexts on an HFI (round robin
+ * active ports within), then next HFI
+ */
+#define PSMI_UNIT_SEL_ALG_WITHIN    PSM_HAL_ALG_WITHIN
+
+#define PSMI_UNIT_SEL_ALG_CPU_CENTRIC PSM_HAL_ALG_CPU_CENTRIC
+#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY
+#define PSMI_UNIT_SEL_ALG_GPU_CENTRIC PSM_HAL_ALG_GPU_CENTRIC
+#endif
+
+struct multirail_config {
+	int num_rails;
+	uint32_t units[PSMI_MAX_RAILS];
+	uint16_t ports[PSMI_MAX_RAILS];
+	int addr_indexes[PSMI_MAX_RAILS];
+};
+
+// return set of units to consider and which to start at.
+// caller will use 1st active unit which can be opened.
+// caller will wrap around so it's valid for start >= end.
+// Note: When using multiple rails per PSM process, higher level code will
+// walk through desired units and unit_param will specify a specific unit.
+// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search
+psm2_error_t
+psm3_compute_start_and_end_unit(long unit_param, long addr_index,
+				int nunitsactive, int nunits,
+				psm2_uuid_t const job_key,
+				long *unit_start, long *unit_end);
+
+psm2_error_t
+psm3_ep_multirail(struct multirail_config *multirail_config);
+
+// decrement any NIC refcounts which may have been
+// incremented by psm3_compute_start_and_end_unit
+void psm3_dec_nic_refcount(int unit_id);
+
+// manage hwloc topology discovery. These are no-ops when PSM_USE_HWLOC
+// is not defined.
+void psm3_hwloc_topology_init();
+void psm3_hwloc_topology_destroy();
+
+#endif /* _PSM_NIC_SELECT_H */
diff --git a/prov/psm3/psm3/psm_oneapi_ze.c b/prov/psm3/psm3/psm_oneapi_ze.c
index 568581ad84b..2090fb68326 100644
--- a/prov/psm3/psm3/psm_oneapi_ze.c
+++ b/prov/psm3/psm3/psm_oneapi_ze.c
@@ -70,6 +70,7 @@ int psm3_num_ze_dev_fds;
 #endif
 int psm3_oneapi_immed_sync_copy;
 int psm3_oneapi_immed_async_copy;
+unsigned psm3_oneapi_parallel_dtod_copy_thresh;
 const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) {
 #define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES)
@@ -203,6 +204,72 @@ void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size)
 	}
 }
 
+// synchronous GPU memcpy DTOD (xeLink)
+void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size)
+{
+	struct ze_dev_ctxt *ctxt;
+
+	psmi_assert(size > 0);
+	ctxt = psmi_oneapi_dev_ctxt_get(dstptr);
+	if (!ctxt) {
+		_HFI_ERROR("dst %p src %p not GPU buf for copying\n",
+				dstptr, srcptr);
+		return;
+	}
+	if (size <= psm3_oneapi_parallel_dtod_copy_thresh) {
+		if (psm3_oneapi_immed_sync_copy) {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl,
+				dstptr, srcptr, size, NULL, 0, NULL);
+		} else {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl,
+				dstptr, srcptr, size, NULL, 0, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq,
+				1, &ctxt->cl, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX);
+		}
+	} else {
+		// for large DTOD copies, start 2 parallel commands
+		// then wait for both
+		size_t size0 = ROUNDUP64P2(size/2, 64*1024);
+		size_t size1 = size - size0;
+
+		if (psm3_oneapi_immed_sync_copy) {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
+				dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL);
+
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
+				(void*)((uintptr_t)dstptr+size0),
+				(void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1,
+				0, NULL);
+		} else {
+			PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
+				dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0,
+				1, &ctxt->async_cl0, NULL);
+
+			PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
+				(void*)((uintptr_t)dstptr+size0),
+				(void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1,
+				0, NULL);
+			PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1);
+			PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1,
+				1, &ctxt->async_cl1, NULL);
+		}
+		// 2nd copy may be slightly smaller so wait for it first so we
+		// can potentially hide its Reset latency while the 1st copy completes
+		PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX);
+		PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1);
+
+		PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX);
+		PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0);
+	}
+}
+
 // for pipelined async GPU memcpy
 // *p_cq is left as NULL when psm3_oneapi_immed_async_copy enabled
 void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt,
diff --git a/prov/psm3/psm3/psm_perf.c
b/prov/psm3/psm3/psm_perf.c index 6b30ca60eeb..5e2f6c4f169 100644 --- a/prov/psm3/psm3/psm_perf.c +++ b/prov/psm3/psm3/psm_perf.c @@ -207,7 +207,7 @@ static void psmi_rdpmc_perf_framework_init() * * Read the current value of a running performance counter. */ -unsigned long long rdpmc_read(struct rdpmc_ctx *ctx) +unsigned long long psm3_rdpmc_read(struct rdpmc_ctx *ctx) { static __thread int rdpmc_perf_initialized = 0; diff --git a/prov/psm3/psm3/psm_perf.h b/prov/psm3/psm3/psm_perf.h index db51ceb2fa7..8fdea147fca 100644 --- a/prov/psm3/psm3/psm_perf.h +++ b/prov/psm3/psm3/psm_perf.h @@ -87,7 +87,7 @@ extern char global_rdpmc_slot_name[RDPMC_PERF_MAX_SLOT_NUMBER][RDPMC_PERF_MAX_SL extern unsigned int global_rdpmc_type; extern unsigned int global_rdpmc_config; -extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); +extern unsigned long long psm3_rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_INIT() \ { \ @@ -111,12 +111,12 @@ extern unsigned long long rdpmc_read(struct rdpmc_ctx *ctx); #define RDPMC_PERF_BEGIN(slot_number) \ { \ - global_rdpmc_begin[(slot_number)] = rdpmc_read(&global_rdpmc_ctx); \ + global_rdpmc_begin[(slot_number)] = psm3_rdpmc_read(&global_rdpmc_ctx); \ } #define RDPMC_PERF_END(slot_number) \ { \ - global_rdpmc_summ[(slot_number)] += (rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ + global_rdpmc_summ[(slot_number)] += (psm3_rdpmc_read(&global_rdpmc_ctx) - global_rdpmc_begin[(slot_number)]); \ global_rdpmc_number[(slot_number)]++; \ } diff --git a/prov/psm3/psm3/psm_stats.c b/prov/psm3/psm3/psm_stats.c index 400a8e8c55e..4ae33fe9a85 100644 --- a/prov/psm3/psm3/psm_stats.c +++ b/prov/psm3/psm3/psm_stats.c @@ -641,30 +641,54 @@ psm2_error_t psm3_stats_initialize(void) { union psmi_envvar_val env_stats_freq; + union psmi_envvar_val env_stats_prefix; union psmi_envvar_val env_stats_help; union psmi_envvar_val env_statsmask; - int got_stats_freq; - int got_stats_help; - int got_statsmask; + int noenv_stats_freq; // env var not specified, used default + int noenv_stats_prefix; // env var not specified, used default + int noenv_stats_help; // env var not specified, used default + int noenv_statsmask; // env var not specified, used default psmi_assert(! 
perf_stats_initialized);
 
-	got_stats_freq = psm3_getenv("PSM3_PRINT_STATS",
-			"Prints performance stats every n seconds to file "
-			"./psm3-perf-stat-[hostname]-pid-[pid] when set to -1 stats are "
-			"printed only once on 1st ep close",
-			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
-			(union psmi_envvar_val) 0, &env_stats_freq);
-	print_stats_freq = env_stats_freq.e_uint;
-
-	got_stats_help = psm3_getenv("PSM3_PRINT_STATS_HELP",
+	noenv_stats_freq = (0 < psm3_getenv_range("PSM3_PRINT_STATS",
+			"Prints performance stats every n seconds",
+			"  0 - disable output\n"
+			"  -1 - only output once at end of job on 1st ep close\n"
+			"  >=1 - output every n seconds\n"
+			"  val: - limit output to rank 0 (for val of -1 or >=1)\n"
+			"  val:pattern - limit output to processes whose label matches\n    "
+#ifdef FNM_EXTMATCH
+			"extended "
+#endif
+			"glob pattern (for val of -1 or >=1)\n"
+			"Output goes to file ${PSM3_PRINT_STATS_PREFIX}psm3-perf-stat-[hostname]-pid-[pid]",
+			PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV,
+			PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT,
+			(union psmi_envvar_val)"0",
+			(union psmi_envvar_val)-1, (union psmi_envvar_val)INT_MAX,
+			NULL, NULL, &env_stats_freq));
+	(void)psm3_parse_val_pattern_int(env_stats_freq.e_str, 0,
+			&print_stats_freq,
+			PSMI_ENVVAR_FLAG_NOABBREV, -1, INT_MAX);
+
+	noenv_stats_prefix = (0 < psm3_getenv_range("PSM3_PRINT_STATS_PREFIX",
+			"Prefix for filename for performance stats output",
+			"May be used to add a prefix possibly including directory for output",
+			PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV,
+			PSMI_ENVVAR_TYPE_STR,
+			(union psmi_envvar_val)"./",
+			(union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL,
+			NULL, NULL, &env_stats_prefix));
+
+	noenv_stats_help = (0 < psm3_getenv("PSM3_PRINT_STATS_HELP",
 			"Prints performance stats help text on rank 0 to file "
-			"./psm3-perf-stat-help-[hostname]-pid-[pid]",
+			"${PSM3_PRINT_STATS_PREFIX}psm3-perf-stat-help-[hostname]-pid-[pid]",
 			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
-			(union psmi_envvar_val) 0, &env_stats_help);
+			(union psmi_envvar_val) 0, &env_stats_help));
 	print_stats_help = env_stats_help.e_uint && (psm3_get_myrank() == 0);
 
-	got_statsmask = psm3_getenv("PSM3_PRINT_STATSMASK",
+	noenv_statsmask = (0 < psm3_getenv("PSM3_PRINT_STATSMASK",
 			"Mask of statistic types to print: "
 			"MQ=1, RCVTHREAD=0x100, IPS=0x200"
#if defined(PSM_HAVE_REG_MR)
@@ -681,21 +705,21 @@ psm3_stats_initialize(void)
#endif
			". 0x100000 causes zero values to also be shown",
			PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT_FLAGS,
-			(union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_statsmask);
+			(union psmi_envvar_val) PSMI_STATSTYPE_ALL, &env_statsmask));
 	print_statsmask = env_statsmask.e_uint;
 
 	stats_start = time(NULL);
 
 	snprintf(perf_file_name, sizeof(perf_file_name),
-			"./psm3-perf-stat-%s-pid-%d",
-			psm3_gethostname(), getpid());
+			"%spsm3-perf-stat-%s-pid-%d",
+			env_stats_prefix.e_str, psm3_gethostname(), getpid());
 
 	if (print_stats_help) {
 		// a few options, such as CUDA, ONEAPI_ZE, RDMA affect what is
 		// included in help, so use a unique filename per job
 		snprintf(perf_help_file_name, sizeof(perf_help_file_name),
-				"./psm3-perf-stat-help-%s-pid-%d",
-				psm3_gethostname(), getpid());
+				"%spsm3-perf-stat-help-%s-pid-%d",
+				env_stats_prefix.e_str, psm3_gethostname(), getpid());
 		perf_help_fd = fopen(perf_help_file_name, "w");
 		if (!perf_help_fd)
 			_HFI_ERROR("Failed to create fd for performance logging help: %s: %s\n",
@@ -706,13 +730,19 @@ psm3_stats_initialize(void)
 	print_job_info_help();
 	print_basic_job_info();
 
-	if (got_stats_freq)
+	// if got a valid value or an invalid value, psm3_getenv will have
+	// stashed it and print_basic_job_info will have put in stats file
+	// otherwise we want to always report the STATS variable settings
+	if (noenv_stats_freq)
 		psm3_stats_print_env_val("PSM3_PRINT_STATS",
 				PSMI_ENVVAR_TYPE_UINT, env_stats_freq);
-	if (got_stats_help)
+	if (noenv_stats_prefix)
+		psm3_stats_print_env_val("PSM3_PRINT_STATS_PREFIX",
+				PSMI_ENVVAR_TYPE_STR, env_stats_prefix);
+	if (noenv_stats_help)
 		psm3_stats_print_env_val("PSM3_PRINT_STATS_HELP",
 				PSMI_ENVVAR_TYPE_UINT, env_stats_help);
-	if (got_statsmask)
+	if (noenv_statsmask)
 		psm3_stats_print_env_val("PSM3_PRINT_STATSMASK",
 				PSMI_ENVVAR_TYPE_UINT_FLAGS, env_statsmask);
diff --git a/prov/psm3/psm3/psm_sysbuf.c b/prov/psm3/psm3/psm_sysbuf.c
index f9bee0be199..698507e8528 100644
--- a/prov/psm3/psm3/psm_sysbuf.c
+++ b/prov/psm3/psm3/psm_sysbuf.c
@@ -77,11 +77,46 @@ struct psmi_mem_block_ctrl {
 void psm3_mq_sysbuf_init(psm2_mq_t mq)
 {
 	int i;
+	// sysbuf is used for unexpected eager messages in nic, shm and self.
+	// for self, unexpected is a courtesy to bad apps; app should always post
+	// recv before send when sending to self.
+	// for nic, eager is only messages below rendezvous threshold.
+	// In TCP and CPU jobs threshold can be larger.  TCP allows up to 256K.
+	// Typical verbs rendezvous threshold is 8000-64K bytes, with GPU
+	// tending to use a lower threshold as GPU copies are expensive.
+	// for shm, GPU messages use rendezvous anytime GPU supports Scale-Up
+	// GPU to GPU comms, such as xeLink or nvLink.
+	// A message which exceeds the largest block_size[] will have a temporary
+	// sysbuf allocated and freed.  For CPU this is ok as malloc is not
+	// terribly expensive.  However for GPU, the subsequent copy will pay
+	// a GPU DMA registration cost in CUDA or Level Zero, so it is best to
+	// avoid temporary buffers.  Fortunately GPU apps tend to have fewer
+	// processes per node and hence more available CPU memory to hold the
+	// buffers.
+	//
+	// So for GPU jobs, we allow a few larger block sizes just in case
+	// rendezvous threshold is set high or TCP is being used with a large
+	// eager message size (aka PSM3_MTU).
+	// replenishing_rate is how many we add to pool at a time; there is
+	// no upper bound to the pool.
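+	// Worked example (illustration only, not behavior added by this patch):
+	// with the GPU tables below, a 200000 byte unexpected eager message
+	// falls into the 262144 byte pool and its buffer is recycled on free;
+	// in a CPU-only build the largest sized pool is 8192 bytes, so the same
+	// message lands in the (uint32_t)-1 catch-all and uses a transient
+	// malloc/free sysbuf instead.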
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	uint32_t gpu_block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, 65536, 262144, (uint32_t)-1};
+	uint32_t gpu_replenishing_rate[] = {128, 64, 32, 16, 8, 4, 2, 2, 0};
+	uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1};
+	uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0, 0, 0};
+#else
 	uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1};
 	uint32_t replenishing_rate[] = {128, 64, 32, 16, 8, 4, 0};
+#endif
 
 	if (mq->mem_ctrl_is_init)
 		return;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+	if (PSMI_IS_GPU_ENABLED) {
+		memcpy(block_sizes, gpu_block_sizes, sizeof(block_sizes));
+		memcpy(replenishing_rate, gpu_replenishing_rate, sizeof(replenishing_rate));
+	}
+#endif
 	mq->mem_ctrl_is_init = 1;
 
 	for (i=0; i < MM_NUM_OF_POOLS; i++) {
@@ -125,9 +160,35 @@ void psm3_mq_sysbuf_fini(psm2_mq_t mq)  // free all buffers that is currently no
 	for (i=0; i < MM_NUM_OF_POOLS; i++) {
 		while ((block = mq->handler_index[i].free_list) != NULL) {
 			mq->handler_index[i].free_list = block->next;
+#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER)
+			if (PSMI_IS_GPU_ENABLED && cu_ctxt) {
+				/* ignore NOT_REGISTERED in case cuda initialized late */
+				/* ignore other errors as context could be destroyed before this */
+				CUresult cudaerr;
+				//PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+				//		cuMemHostUnregister, block);
+				psmi_count_cuMemHostUnregister++;
+				cudaerr = psmi_cuMemHostUnregister(block);
+				if (cudaerr) {
+					const char *pStr = NULL;
+					psmi_count_cuGetErrorString++;
+					psmi_cuGetErrorString(cudaerr, &pStr);
+					_HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n",
+						cudaerr, pStr?pStr:"Unknown");
+				}
+			}
+#endif
#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT)
-			if (PSMI_IS_GPU_ENABLED)
-				PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block);
+			if (PSMI_IS_GPU_ENABLED) {
+				ze_result_t result;
+				//PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block);
+				psmi_count_zexDriverReleaseImportedPointer++;
+				result = psmi_zexDriverReleaseImportedPointer(ze_driver,
+								block);
+				if (result != ZE_RESULT_SUCCESS) {
+					_HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result));
+				}
+			}
#endif
 			psmi_free(block);
 		}
@@ -168,6 +229,13 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
 		new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
 
 		if (new_block) {
+#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER)
+			// for transient buffers, no use Importing, adds cost for
+			// CPU copy, just pay GPU cost on the copy, we use once & free
+			//if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt())
+			//	PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz,
+			//			CU_MEMHOSTALLOC_PORTABLE);
+#endif
#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT)
 			// for transient buffers, no use Importing, adds cost for
 			// CPU copy, just pay GPU cost on the copy, we use once & free
@@ -189,6 +257,14 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size)
 		new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz);
 
 		if (new_block) {
+#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER)
+			// By registering memory with CUDA, we make
+			// cuMemcpy* run faster for copies between
+			// GPU and this sysbuf
+			if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt())
+				PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz,
+						CU_MEMHOSTALLOC_PORTABLE);
+#endif
#if defined(PSM_ONEAPI)
&& !defined(PSM3_NO_ONEAPI_IMPORT) // By registering memory with Level Zero, we make // zeCommandListAppendMemoryCopy run faster for copies between @@ -233,11 +309,21 @@ void psm3_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { +#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) + // for transient buffers, no use Importing, adds cost for + // CPU copy, just pay GPU cost on the copy, we use once & free + //if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + // /* ignore NOT_REGISTERED in case cuda initialized late */ + // CUresult cudaerr; + // PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, block_to_free); + //} +#endif #if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); + // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block_to_free); #endif psmi_free(block_to_free); } else { diff --git a/prov/psm3/psm3/psm_sysbuf.h b/prov/psm3/psm3/psm_sysbuf.h index 90945d520ed..31ff116d088 100644 --- a/prov/psm3/psm3/psm_sysbuf.h +++ b/prov/psm3/psm3/psm_sysbuf.h @@ -58,7 +58,11 @@ #include "psm_user.h" +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define MM_NUM_OF_POOLS 9 +#else #define MM_NUM_OF_POOLS 7 +#endif typedef struct psmi_mem_ctrl { struct psmi_mem_block_ctrl *free_list; diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 38e9b8d9310..18c58d9934d 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -60,6 +60,13 @@ extern "C" { #endif +#if defined(PSM_CUDA) +// if defined, do not use cuMemHostRegister for malloced pipeline +// copy bounce buffers +// otherwise, use cuMemHostRegister when malloc buffer +//#define PSM3_NO_CUDA_REGISTER +#endif + #if defined(PSM_ONEAPI) // if defined, use malloc for pipeline copy bounce buffers // otherwise, use zeMemAllocHost @@ -116,6 +123,10 @@ extern "C" { #endif /* RNDV_MOD */ +#if (defined(PSM_CUDA) || defined(PSM_ONEAPI)) && defined(PSM_USE_HWLOC) +#define PSM_HAVE_GPU_CENTRIC_AFFINITY +#endif + #include "psm_config.h" #include #include @@ -166,6 +177,7 @@ typedef void *psmi_hal_hw_context; #include "psm_help.h" #include "psm_error.h" +#include "psm_nic_select.h" #include "psm_context.h" #include "psm_utils.h" #include "psm_timer.h" @@ -208,6 +220,7 @@ extern int psm3_opened_endpoint_count; extern int psm3_affinity_shared_file_opened; extern uint64_t *psm3_shared_affinity_ptr; +extern uint64_t *psm3_shared_affinity_nic_refcount_ptr; extern char *psm3_affinity_shm_name; extern sem_t *psm3_sem_affinity_shm_rw; @@ -378,6 +391,8 @@ extern uint32_t gpudirect_rdma_send_limit; extern uint32_t gpudirect_rdma_recv_limit; extern uint32_t gpu_thresh_rndv; +#define MAX_ZE_DEVICES 8 + struct ips_gpu_hostbuf { STAILQ_ENTRY(ips_gpu_hostbuf) req_next; STAILQ_ENTRY(ips_gpu_hostbuf) next; @@ -390,8 +405,9 @@ struct ips_gpu_hostbuf { CUevent copy_status; #elif defined(PSM_ONEAPI) ze_event_pool_handle_t event_pool; - ze_command_list_handle_t command_list; + ze_command_list_handle_t command_lists[MAX_ZE_DEVICES]; ze_event_handle_t copy_status; + int cur_dev_inx; #endif psm2_mq_req_t req; void* host_buf; @@ -413,8 +429,6 @@ extern void *psmi_cuda_lib; #ifdef PSM_ONEAPI -#define MAX_ZE_DEVICES 8 - int psmi_oneapi_ze_initialize(void); psm2_error_t psm3_ze_init_fds(void); 
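+// Usage sketch (an inference for review purposes, not something this patch
+// defines): psm3_ze_init_fds() is expected to open the per-device Level Zero
+// fds once at job init, and psm3_ze_get_dev_fds() to return that fd array
+// with its count stored in *nfds.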
int *psm3_ze_get_dev_fds(int *nfds); @@ -428,11 +442,22 @@ extern int psm3_num_ze_dev_fds; struct ze_dev_ctxt { ze_device_handle_t dev; + int dev_index; /* Index in ze_devices[] */ uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ uint32_t index; /* Cmdqueue index within the CmdQGrp */ uint32_t num_queues; /* Number of queues in the CmdQGrp */ + // for most sync copies ze_command_queue_handle_t cq; // NULL if psm3_oneapi_immed_sync_copy ze_command_list_handle_t cl; + // fields below are only used for large DTOD sync copy so can do 2 + // parallel async copies then wait for both + ze_event_handle_t copy_status0; + ze_event_handle_t copy_status1; + ze_command_list_handle_t async_cl0; + ze_command_list_handle_t async_cl1; + ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_immed_sync_copy + ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_immed_sync_copy + ze_event_pool_handle_t event_pool; }; extern ze_api_version_t zel_api_version; @@ -444,6 +469,7 @@ extern int num_ze_devices; extern struct ze_dev_ctxt *cur_ze_dev; extern int psm3_oneapi_immed_sync_copy; extern int psm3_oneapi_immed_async_copy; +extern unsigned psm3_oneapi_parallel_dtod_copy_thresh; const char* psmi_oneapi_ze_result_to_string(const ze_result_t result); void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, @@ -467,6 +493,7 @@ extern int psm3_oneapi_ze_using_zemem_alloc; extern void psm3_oneapi_ze_can_use_zemem(); void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size); +void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size); static inline int device_support_gpudirect() @@ -501,6 +528,8 @@ extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); extern CUresult (*psmi_cuMemFreeHost)(void* p); +extern CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); +extern CUresult (*psmi_cuMemHostUnregister)(void* p); extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); @@ -527,6 +556,7 @@ extern ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDr extern ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); #endif extern ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +extern ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); #ifndef PSM3_NO_ONEAPI_IMPORT extern ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); #endif @@ -591,6 +621,8 @@ extern uint64_t psmi_count_cuEventRecord; extern uint64_t psmi_count_cuEventSynchronize; extern uint64_t psmi_count_cuMemHostAlloc; extern uint64_t psmi_count_cuMemFreeHost; +extern uint64_t psmi_count_cuMemHostRegister; +extern uint64_t psmi_count_cuMemHostUnregister; extern uint64_t psmi_count_cuMemcpy; extern uint64_t psmi_count_cuMemcpyDtoD; extern uint64_t psmi_count_cuMemcpyDtoH; @@ -617,6 +649,7 @@ extern uint64_t psmi_count_zexDriverImportExternalPointer; extern uint64_t psmi_count_zexDriverReleaseImportedPointer; #endif extern 
uint64_t psmi_count_zeDeviceGet;
+extern uint64_t psmi_count_zeDevicePciGetPropertiesExt;
#ifndef PSM3_NO_ONEAPI_IMPORT
 extern uint64_t psmi_count_zeDriverGetExtensionFunctionAddress;
#endif
@@ -679,6 +712,20 @@ static int check_set_cuda_ctxt(void)
 	return 0;
 }
 
+/* Make sure we have a real GPU job.  Set cu_ctxt if available */
+PSMI_ALWAYS_INLINE(
+int check_have_cuda_ctxt(void))
+{
+	if (! cu_ctxt) {
+		if (unlikely(check_set_cuda_ctxt())) {
+			psm3_handle_error(PSMI_EP_NORETURN,
+				PSM2_INTERNAL_ERR, "Failed to set/synchronize"
+				" CUDA context.\n");
+		}
+	}
+	return (cu_ctxt != NULL);
+}
+
 #define PSMI_CUDA_CALL(func, args...) do { \
 		CUresult cudaerr; \
@@ -688,19 +735,18 @@ static int check_set_cuda_ctxt(void)
 				" CUDA context.\n"); \
 		} \
 		psmi_count_##func++; \
-		cudaerr = psmi_##func(args); \
+		cudaerr = (CUresult)psmi_##func(args); \
 		if (cudaerr != CUDA_SUCCESS) { \
 			const char *pStr = NULL; \
 			psmi_count_cuGetErrorString++; \
 			psmi_cuGetErrorString(cudaerr, &pStr); \
 			_HFI_ERROR( \
 				"CUDA failure: %s() (at %s:%d)" \
-				"returned %d: %s\n", \
+				" returned %d: %s\n", \
 				#func, __FILE__, __LINE__, cudaerr, \
 				pStr?pStr:"Unknown"); \
-			psm3_handle_error( \
-				PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
-				"Error returned from CUDA function.\n");\
+			psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Error returned from CUDA function %s.\n", #func);\
 		} \
 	} while (0)
 #endif // PSM_CUDA
@@ -712,12 +758,12 @@
 		psmi_count_##func++; \
 		result = psmi_##func(args); \
 		if(result != ZE_RESULT_SUCCESS) { \
-			_HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d) " \
-				"returned %d(%s)\n", \
-				#func, __FILE__, __LINE__, result, psmi_oneapi_ze_result_to_string(result)); \
-			psm3_handle_error( PSMI_EP_NORETURN, \
-				PSM2_INTERNAL_ERR, \
-				"Error returned from OneAPI Level Zero function %s.\n", STRINGIFY(func)); \
+			_HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \
+				" returned 0x%x: %s\n", \
+				#func, __FILE__, __LINE__, result, \
+				psmi_oneapi_ze_result_to_string(result)); \
+			psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \
+				"Error returned from OneAPI Level Zero function %s.\n", #func); \
 		} \
 	} while (0)
@@ -755,7 +801,7 @@ _psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt))
 	if (result == ZE_RESULT_SUCCESS &&
 		(mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) {
 		ret = 1;
-		_HFI_VDBG("ptr %p type %d dev %p ze_device %p\n",
+		_HFI_VDBG("ptr %p type %d dev %p cur_ze_dev %p\n",
 			ptr, mem_props.type, dev, cur_ze_dev->dev);
 		/*
 		 * Check if the gpu device has changed.
@@ -782,6 +828,7 @@ _psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt)) break; } } + _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, num_ze_devices-1, dev); } } @@ -947,19 +994,18 @@ int gpu_p2p_supported()) "before psm3_ep_open call \n"); \ _HFI_ERROR( \ "CUDA failure: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function %s.\n", #func);\ } else if (cudaerr == except_err) { \ const char *pStr = NULL; \ psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_DBG( \ "CUDA non-zero return value: %s() (at %s:%d)" \ - "returned %d: %s\n", \ + " returned %d: %s\n", \ #func, __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ } \ @@ -974,12 +1020,11 @@ int gpu_p2p_supported()) psmi_count_cuGetErrorString++; \ psmi_cuGetErrorString(cudaerr, &pStr); \ _HFI_ERROR( \ - "CUDA failure: %s() returned %d: %s\n", \ - "cuEventQuery", cudaerr, \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + "cuEventQuery", __FILE__, __LINE__, cudaerr, \ pStr?pStr:"Unknown"); \ - psm3_handle_error( \ - PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function.\n");\ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function cuEventQuery.\n");\ } \ } while (0) @@ -1063,13 +1108,12 @@ int _psm3_oneapi_ze_memcpy_done(const struct ips_gpu_hostbuf *ghb) } else if (result == ZE_RESULT_NOT_READY) { return 0; } else { - _HFI_ERROR( "OneAPI LZ failure: %s() returned %d(%s)\n", - __FUNCTION__, result, + _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", + "zeEventQueryStatus", __FILE__, __LINE__, result, psmi_oneapi_ze_result_to_string(result)); - psm3_handle_error( PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Error returned from OneAPI LZ function %s.\n", - __FUNCTION__); + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Error returned from OneAPI Level Zero function %s.\n", + "zeEventQueryStatus"); } return 0; } @@ -1219,16 +1263,13 @@ _psmi_is_gdr_copy_enabled()) PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ protoexp->cudastream_recv); \ } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ do { \ if (proto->cudastream_send == NULL) { \ PSMI_CUDA_CALL(cuStreamCreate, \ &proto->cudastream_send, \ CU_STREAM_NON_BLOCKING); \ } \ - if (ghb->host_buf == NULL && bufsz) { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } \ if (ghb->copy_status == NULL) { \ PSMI_CUDA_CALL(cuEventCreate, \ &ghb->copy_status, CU_EVENT_DEFAULT); \ @@ -1246,13 +1287,6 @@ _psmi_is_gdr_copy_enabled()) ghb->copy_status = NULL; \ ghb->host_buf = NULL; \ } while (0) -// TBD, create of Event here could be omitted and let HTOD/DTOH_START create it -#define PSM3_GPU_HOSTBUF_FORCE_INIT(ghb, bufsz) \ - do { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ } while (0) @@ -1278,6 +1312,10 @@ _psmi_is_gdr_copy_enabled()) PSMI_CUDA_CALL(cuMemHostAlloc, (void **)(ret_ptr), \ (size),CU_MEMHOSTALLOC_PORTABLE); \ } while (0) +#define PSM3_GPU_HOST_FREE(ptr) \ + do { \ + PSMI_CUDA_CALL(cuMemFreeHost, (void *)ptr); \ + } while (0) // 
HOST_ALLOC memory treated as CPU memory for Verbs MRs #define PSM3_GPU_ADDR_SEND_MR(mqreq) \ ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) @@ -1295,24 +1333,40 @@ _psmi_is_gdr_copy_enabled()) #elif defined(PSM_ONEAPI) #define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ do { \ - protoexp->cq_recv = NULL; \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + protoexp->cq_recvs[i] = NULL; \ } while (0) #define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ do { \ - proto->cq_send = NULL; \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + proto->cq_sends[i] = NULL; \ } while (0) #define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ do { \ - if (protoexp->cq_recv) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - protoexp->cq_recv); \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (protoexp->cq_recvs[i]) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ + protoexp->cq_recvs[i]); \ + protoexp->cq_recvs[i] = NULL; \ + } \ } \ } while (0) #define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ do { \ - if (proto->cq_send) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - proto->cq_send); \ + int i; \ + \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (proto->cq_sends[i]) { \ + PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ + proto->cq_sends[i]); \ + proto->cq_sends[i] = NULL; \ + } \ } \ } while (0) @@ -1330,13 +1384,14 @@ _psmi_is_gdr_copy_enabled()) .index = 0 \ }; \ struct ze_dev_ctxt *ctxt; \ + int inx; \ \ ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ if (!ctxt) \ psm3_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ - "%s HTOD: no dev ctxt\n", \ - __FUNCTION__); \ + "%s HTOD: unknown GPU device for addr %p\n", \ + __FUNCTION__, ghb->gpu_buf);\ if (ghb->event_pool == NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ @@ -1347,23 +1402,26 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - if (! ghb->command_list) { \ + inx = ctxt->dev_index; \ + if (! ghb->command_lists[inx]) { \ psmi_oneapi_async_cmd_create(ctxt, \ - &protoexp->cq_recv, &ghb->command_list);\ + &protoexp->cq_recvs[inx], \ + &ghb->command_lists[inx]); \ } \ + ghb->cur_dev_inx = inx; \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_list, \ + ghb->command_lists[inx], \ ghb->gpu_buf, ghb->host_buf, len, \ ghb->copy_status, 0, NULL); \ if (! 
psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_list); \ + ghb->command_lists[inx]); \ PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - protoexp->cq_recv, 1, \ - &ghb->command_list, NULL); \ + protoexp->cq_recvs[inx], 1, \ + &ghb->command_lists[inx], NULL); \ } \ } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len, bufsz) \ +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ do { \ ze_event_pool_desc_t pool_desc = { \ .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ @@ -1377,13 +1435,14 @@ _psmi_is_gdr_copy_enabled()) .index = 0 \ }; \ struct ze_dev_ctxt *ctxt; \ + int inx; \ \ ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ if (!ctxt) \ psm3_handle_error(PSMI_EP_NORETURN, \ PSM2_INTERNAL_ERR, \ - "%s DTOH: no dev ctxt\n", \ - __FUNCTION__); \ + "%s DTOH: unknown GPU device for addr %p\n", \ + __FUNCTION__, ghb->gpu_buf);\ if (ghb->event_pool == NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ ze_context, &pool_desc, 0, NULL, \ @@ -1394,68 +1453,50 @@ _psmi_is_gdr_copy_enabled()) ghb->event_pool, &event_desc, \ &ghb->copy_status); \ } \ - if (ghb->host_buf == NULL && bufsz) { \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } \ - if (! ghb->command_list) { \ + inx = ctxt->dev_index; \ + if (! ghb->command_lists[inx]) { \ psmi_oneapi_async_cmd_create(ctxt, \ - &proto->cq_send, &ghb->command_list);\ + &proto->cq_sends[inx], \ + &ghb->command_lists[inx]); \ } \ + ghb->cur_dev_inx = inx; \ PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_list, \ + ghb->command_lists[inx], \ ghb->host_buf, ghb->gpu_buf, len, \ ghb->copy_status, 0, NULL); \ if (! psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_list); \ + ghb->command_lists[inx]); \ PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - proto->cq_send, 1, \ - &ghb->command_list, NULL); \ + proto->cq_sends[inx], 1, \ + &ghb->command_lists[inx], NULL); \ } \ } while (0) #define PSM3_GPU_MEMCPY_DONE(ghb) \ _psm3_oneapi_ze_memcpy_done(ghb) #define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ do { \ + int i; \ + \ ghb->event_pool = NULL; \ ghb->copy_status = NULL; \ - ghb->command_list = NULL; \ + for (i = 0; i < MAX_ZE_DEVICES; i++) \ + ghb->command_lists[i] = NULL; \ ghb->host_buf = NULL; \ } while (0) -// TBD, create of Event and command list here could be omitted and let -// HTOD/DTOH_START create it -#define PSM3_GPU_HOSTBUF_FORCE_INIT(ghb, bufsz) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - PSM3_GPU_HOST_ALLOC(&ghb->host_buf, bufsz); \ - } while (0) #define PSM3_GPU_HOSTBUF_RESET(ghb) \ do { \ if (! 
psm3_oneapi_immed_async_copy) { \ PSMI_ONEAPI_ZE_CALL(zeCommandListReset, \ - ghb->command_list); \ + ghb->command_lists[ghb->cur_dev_inx]);\ } \ PSMI_ONEAPI_ZE_CALL(zeEventHostReset, \ ghb->copy_status); \ } while (0) #define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ do { \ + int i; \ + \ if (ghb->copy_status != NULL) { \ PSMI_ONEAPI_ZE_CALL(zeEventDestroy, \ ghb->copy_status); \ @@ -1467,13 +1508,17 @@ _psmi_is_gdr_copy_enabled()) PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, \ ghb->event_pool); \ } \ - if (ghb->command_list != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, \ - ghb->command_list); \ + for (i = 0; i < MAX_ZE_DEVICES; i++) { \ + if (ghb->command_lists[i]) { \ + PSMI_ONEAPI_ZE_CALL( \ + zeCommandListDestroy, \ + ghb->command_lists[i]); \ + ghb->command_lists[i] = NULL; \ + } \ } \ } while (0) #define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) + do { psmi_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len); } while(0) #define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) #define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ @@ -1506,6 +1551,7 @@ _psmi_is_gdr_copy_enabled()) ( (tidrecvc)->is_ptr_gpu_backed \ || ((mqreq)->gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc)) #endif /* PSM3_USE_ONEAPI_MALLOC */ +#define PSM3_GPU_HOST_FREE(ptr) PSM3_ONEAPI_ZE_HOST_FREE(ptr) #define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { /* not needed for OneAPI ZE */ } while (0) #define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index e99b950e1bf..c2525fa935c 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -2550,14 +2550,12 @@ unsigned psmi_parse_gpudirect_rdma_send_limit(int force) /* Default send threshold for Gpu-direct set to UINT_MAX * (always use GPUDIRECT) */ - psm3_getenv("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", - "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", + psm3_getenv_range("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", + "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, -#ifdef PSM_ONEAPI - (union psmi_envvar_val)(1024*1024), &envval); -#else - (union psmi_envvar_val)UINT_MAX, &envval); -#endif + (union psmi_envvar_val)UINT_MAX, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, + NULL, NULL, &envval); saved = envval.e_uint; done: @@ -2584,10 +2582,16 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) /* Default receive threshold for Gpu-direct set to UINT_MAX * (always use GPUDIRECT) */ - psm3_getenv("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", - "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", + psm3_getenv_range("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", + "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)UINT_MAX, &envval); +#ifdef PSM_CUDA + (union psmi_envvar_val)UINT_MAX, +#elif defined(PSM_ONEAPI) + (union psmi_envvar_val)1, +#endif + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, + NULL, NULL, &envval); saved = envval.e_uint; done: @@ -2611,10 +2615,11 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) // RV defaults are sufficient for default PSM parameters // but for HALs with RDMA, if user adjusts ep->hfi_num_send_rdma or 
- // mq->hfi_base_window_rv they also need to increase the cache size. + // mq->ips_gpu_window_rv they also need to increase the cache size. // psm3_verbs_alloc_mr_cache will verify cache size is sufficient. // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * - // chunk size (mq->hfi_base_window_rv after psmi_mq_initialize_params) + // chunk size (psm3_mq_max_window_rv(mq, 1) after + // psmi_mq_initialize_params) if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() ) { psm3_getenv("PSM3_RV_GPU_CACHE_SIZE", "kernel space GPU cache size" @@ -2665,23 +2670,28 @@ int psm3_parse_identify(void) { union psmi_envvar_val myenv; static int have_value; - static unsigned saved_identify; + static int saved_identify; // only parse once so doesn't appear in PSM3_VERBOSE_ENV multiple times if (have_value) return saved_identify; - psm3_getenv("PSM3_IDENTIFY", "Identify PSM version being run " - "(0 - disable, 1 - enable, 1: - limit output to rank 0, " - "1:pattern - limit output " - "to processes whose label matches " + psm3_getenv_range("PSM3_IDENTIFY", "Identify PSM version being run", + " 0 - disable\n" + " 1 - enable\n" + " 1: - limit output to rank 0\n" + " 1:pattern - limit output to processes whose label matches\n " #ifdef FNM_EXTMATCH "extended " #endif "glob pattern)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_VAL_PAT, - (union psmi_envvar_val)"0", &myenv); - (void)psm3_parse_val_pattern(myenv.e_str, 0, &saved_identify); + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)0, (union psmi_envvar_val)1, + NULL, NULL, &myenv); + (void)psm3_parse_val_pattern_int(myenv.e_str, 0, &saved_identify, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 1); have_value = 1; return saved_identify; @@ -2891,11 +2901,12 @@ void psm3_print_ep_identify(psm2_ep_t ep) (void)psmi_hal_get_port_speed(ep->unit_id, ep->portnum, &link_speed); psmi_hal_get_node_id(ep->unit_id, &node_id); - psm3_print_identify("%s %s NIC %u (%s) Port %u %"PRIu64" Mbps NUMA %d %s%s\n", + psm3_print_identify("%s %s NIC %u (%s) Port %u %"PRIu64" Mbps NUMA %d %s%s%s\n", psm3_get_mylabel(), psm3_ident_tag, ep->unit_id, ep->dev_name, ep->portnum, link_speed/(1000*1000), node_id, psm3_epid_fmt_addr(ep->epid, 0), + ep->addl_nic_info?ep->addl_nic_info:"", (! psm3_ep_device_is_enabled(ep, PTL_DEVID_AMSH) && (((struct ptl_ips *)(ep->ptl_ips.ptl))->proto.flags & IPS_PROTO_FLAG_LOOPBACK))?" 
loopback":""); @@ -3011,7 +3022,7 @@ void psm3_parse_multi_ep() #ifdef PSM_FI -unsigned psm3_faultinj_enabled = 0; +int psm3_faultinj_enabled = 0; int psm3_faultinj_verbose = 0; char *psm3_faultinj_outfile = NULL; int psm3_faultinj_sec_rail = 0; @@ -3025,21 +3036,25 @@ void psm3_parse_faultinj() { union psmi_envvar_val env_fi; - psm3_getenv("PSM3_FI", "PSM Fault Injection " - "(0 - disable, 1 - enable, " - "2 - enable but default each injector to 0 rate " - "#: - limit to rank 0, " - "#:pattern - limit " - "to processes whose label matches " + psm3_getenv_range("PSM3_FI", "PSM Fault Injection", + " 0 - disable\n" + " 1 - enable\n" + " 2 - enable but default each injector to 0 rate\n" + " #: - limit to rank 0\n" + " #:pattern - limit to processes whose label matches\n " #ifdef FNM_EXTMATCH "extended " #endif - "glob pattern) " - "mode 2 can be useful to generate full stats help " - "when PSM3_PRINT_STATS_HELP enabled", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR_VAL_PAT, - (union psmi_envvar_val)"0", &env_fi); - (void)psm3_parse_val_pattern(env_fi.e_str, 0, &psm3_faultinj_enabled); + "glob pattern\n" + "mode 2 can be useful to generate help for all injectors\n" + "when PSM3_PRINT_STATS_HELP=1 or PSM3_VERBOSE_ENV=3:", + PSMI_ENVVAR_LEVEL_HIDDEN|PSMI_ENVVAR_FLAG_NOABBREV, + PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT, + (union psmi_envvar_val)"0", + (union psmi_envvar_val)0, (union psmi_envvar_val)2, + NULL, NULL, &env_fi); + (void)psm3_parse_val_pattern_int(env_fi.e_str, 0, + &psm3_faultinj_enabled, PSMI_ENVVAR_FLAG_NOABBREV, 0, 2); if (psm3_faultinj_enabled) { char *def = NULL; @@ -3143,6 +3158,52 @@ void psm3_faultinj_fini() return; } +/* parse fault injection controls + * format is num:denom:initial_seed + * denom must be >= num and > 0 + * Either field can be omitted in which case default (input fvals) is used + * for given field. + * 0 - successfully parsed, fvals updated + * -1 - str empty, fvals unchanged + * -2 - syntax error, fvals may have been changed + */ +static int parse_faultinj_control(const char *str, + size_t errstr_size, char errstr[], + int fvals[3]) +{ + psmi_assert(fvals); + int ret = psm3_parse_str_tuples(str, 3, fvals); + if (ret < 0) + return ret; + if (! fvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " denom must be non-zero"); + return -2; + } + if (fvals[0] < 0 || fvals[1] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values for num and denom not allowed"); + return -2; + } + if (fvals[0] > fvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " num (%d) must be <= denom (%d)", fvals[0], fvals[1]); + return -2; + } + return 0; +} + +static int parse_check_faultinj_control(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set fvals to result, use a copy to protect input of defaults + int fvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_faultinj_control(val.e_str, errstr_size, errstr, fvals); +} + + /* * Intended to be used only once, not in the critical path */ @@ -3186,27 +3247,34 @@ struct psm3_faultinj_spec *psm3_faultinj_getspec(const char *spec_name, * error condition. 
*/ { - int fvals[3] = { num, denom, (int)getpid() }; + int fvals[3] = { fi->num, fi->denom, fi->initial_seed }; union psmi_envvar_val env_fi; char fvals_str[128]; char fname[128]; char fdesc[300]; + int ret; snprintf(fvals_str, sizeof(fvals_str), "%d:%d:%d", fi->num, fi->denom, fi->initial_seed); snprintf(fname, sizeof(fname), "PSM3_FI_%s", spec_name); - snprintf(fdesc, sizeof(fdesc), "Fault Injection - %s <%s>", - help, fvals_str); - - if (!psm3_getenv(fname, fdesc, PSMI_ENVVAR_LEVEL_HIDDEN, - PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val)fvals_str, &env_fi)) { + snprintf(fdesc, sizeof(fdesc), "Fault Injection - %s", help); + + ret = psm3_getenv_range(fname, fdesc, + "Specified as num:denom:seed, where num/denom is approx probability\nand seed seeds the random number generator", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR_TUPLES, + (union psmi_envvar_val)fvals_str, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_faultinj_control, fvals, &env_fi); + if (ret == 0) { /* not using default values */ - (void)psm3_parse_str_tuples(env_fi.e_str, 3, fvals); + if (parse_faultinj_control(env_fi.e_str, 0, NULL, fvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } fi->num = fvals[0]; fi->denom = fvals[1]; fi->initial_seed = fvals[2]; - } else if (psm3_faultinj_enabled == 2) { + } else if (ret == 1 && psm3_faultinj_enabled == 2) { // default unspecified injectors to off fi->num = 0; } diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index ab654cb451d..d39b49e6711 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -528,7 +528,7 @@ void psm3_parse_multi_ep(); * pri_reg_mr - priority register MR failure (ENOMEM) * gdrmmap - GPU gdrcopy pin and mmap failure */ -extern unsigned psm3_faultinj_enabled; /* use macro to test */ +extern int psm3_faultinj_enabled; /* use macro to test */ extern int psm3_faultinj_verbose; /* use IS_FAULT macro to test */ extern int psm3_faultinj_sec_rail;/* faults only on secondary rails or EPs */ diff --git a/prov/psm3/psm3/ptl.h b/prov/psm3/psm3/ptl.h index dcdba3a7c6d..44110636411 100644 --- a/prov/psm3/psm3/ptl.h +++ b/prov/psm3/psm3/ptl.h @@ -68,14 +68,6 @@ #include #include -/* We currently have 3 PTLs, 0 is reserved. */ -#define PTL_DEVID_IPS 1 -#define PTL_DEVID_AMSH 2 -#define PTL_DEVID_SELF 3 - -/* We can currently initialize up to 3 PTLs */ -#define PTL_MAX_INIT 3 - /* struct ptl is an incomplete type, and it serves as a generic or opaque container. It should remain an incomplete type in the entire psm source base. 
   concrete ptl types need to have a suffix such as ptl_self,
diff --git a/prov/psm3/psm3/ptl_am/am_config.h b/prov/psm3/psm3/ptl_am/am_config.h
index f436f471c25..79600601037 100644
--- a/prov/psm3/psm3/ptl_am/am_config.h
+++ b/prov/psm3/psm3/ptl_am/am_config.h
@@ -67,6 +67,14 @@
 #define AMSH_HAVE_CMA   0x1
 #define AMSH_HAVE_KASSIST 0x1
 
+#if defined(PSM_CUDA)
+/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem) */
+#define PSMI_MQ_GPU_RV_THRESH 127
+#elif defined(PSM_ONEAPI)
+/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem) */
+#define PSMI_MQ_GPU_RV_THRESH 127
+#endif
+
 /* Each block reserves some space at the beginning to store auxiliary data */
 #define AMSH_BLOCK_HEADER_SIZE 4096
diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c
index a8151240469..ac561c6d32f 100644
--- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c
+++ b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c
@@ -96,7 +96,7 @@ typedef struct {
 static psm2_error_t am_ze_memhandle_mpool_alloc(
 		am_ze_memhandle_cache_t cache, uint32_t memcache_size);
-void am_ze_memhandle_delete(void *buf_ptr);
+static void am_ze_memhandle_delete(void *buf_ptr);
 
 /*
  * Custom comparator
@@ -653,9 +653,9 @@ am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache,
 }
 
+#if defined(HAVE_DRM) || defined(HAVE_LIBDRM)
 void am_ze_memhandle_delete(void *buf_ptr)
 {
-#if defined(HAVE_DRM) || defined(HAVE_LIBDRM)
 	/* Release the reference to the buffer */
 	PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, buf_ptr);
 
@@ -679,8 +679,8 @@ void am_ze_memhandle_delete(void *buf_ptr)
 	 * GEM_CLOSE.
 	 */
 #endif
-#endif /* HAVE_DRM or HAVE_LIBDRM */
 }
+#endif /* HAVE_DRM or HAVE_LIBDRM */
 
 void am_ze_memhandle_release(am_ze_memhandle_cache_t cache,
diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
index 2cea9932454..020f3afb349 100644
--- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
+++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c
@@ -88,6 +88,9 @@
 #endif
 
 int psm3_shm_mq_rv_thresh = PSMI_MQ_RV_THRESH_NO_KASSIST;
+#if defined(PSM_CUDA) || defined(PSM_ONEAPI)
+int psm3_shm_mq_gpu_rv_thresh = PSMI_MQ_GPU_RV_THRESH;
+#endif
 
 // qcounts and qelemsz tunable via amsh_fifo_getconfig();
 static amsh_qinfo_t amsh_qcounts = {
@@ -371,6 +374,16 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen)
 	}
 	memset((void *) mapptr, 0, segsz); /* touch all of my pages */
+#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER)
+	if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt())
+		PSMI_CUDA_CALL(cuMemHostRegister, mapptr, segsz,
+				CU_MEMHOSTALLOC_PORTABLE);
+#endif
+#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT)
+	if (PSMI_IS_GPU_ENABLED)
+		PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver,
+				mapptr, segsz);
+#endif
 
 	/* Our own ep's info for ptl_am resides at the start of the shm object.
Other processes need some of this info to @@ -418,6 +431,37 @@ psm2_error_t psm3_epdir_extend(ptl_t *ptl_gen) psm2_error_t psm3_do_unmap(uintptr_t shmbase) { psm2_error_t err = PSM2_OK; +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, (void*)shmbase); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, + // (void *)shmbase); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + (void *)shmbase); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } +#endif if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, @@ -550,6 +594,16 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm // read every page in segment so faulted into our address space psm3_touch_mmap(dest_mapptr, segsz); +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) + PSMI_CUDA_CALL(cuMemHostRegister, dest_mapptr, segsz, + CU_MEMHOSTALLOC_PORTABLE); +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) + PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, + dest_mapptr, segsz); +#endif shmidx = -1; if ((ptl->max_ep_idx + 1) == ptl->am_ep_size) { @@ -711,6 +765,37 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) shm_unlink(ptl->amsh_keyname); psmi_free(ptl->amsh_keyname); +#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) + if (PSMI_IS_GPU_ENABLED && cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr; + //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, + // cuMemHostUnregister, (void*)shmbase); + psmi_count_cuMemHostUnregister++; + cudaerr = psmi_cuMemHostUnregister((void*)shmbase); + if (cudaerr) { + const char *pStr = NULL; + psmi_count_cuGetErrorString++; + psmi_cuGetErrorString(cudaerr, &pStr); + _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", + cudaerr, pStr?pStr:"Unknown"); + } + } +#endif +#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) + if (PSMI_IS_GPU_ENABLED) { + ze_result_t result; + //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, + // (void *)shmbase); + psmi_count_zexDriverReleaseImportedPointer++; + result = psmi_zexDriverReleaseImportedPointer(ze_driver, + (void *)shmbase); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); + } + } +#endif if (munmap((void *)shmbase, am_ctl_sizeof_block())) { err = 
psm3_handle_error(NULL, PSM2_SHMEM_SEGMENT_ERR, @@ -2382,7 +2467,8 @@ amsh_mq_send_inner_eager(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, args[2].u32w1 = tag->tag[2]; args[2].u32w0 = 0; - if (!flags_user && len <= AMLONG_MTU) { + psmi_assert(!(flags_user & PSM2_MQ_FLAG_SENDSYNC));// needs rndv + if (len <= AMLONG_MTU) { if (len <= 32) args[0].u32w0 = MQ_MSG_TINY; else @@ -2445,26 +2531,29 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, if (PSM3_IS_BUFFER_GPU_MEM(ubuf, len)) { gpu_mem = 1; - /* All sends from a gpu buffer use the rendezvous protocol if p2p is supported */ - if (ep_supports_p2p) { + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ + /* otherwise use eager for INJECT as caller is waiting */ + if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) + == PSM2_MQ_FLAG_INJECT) + goto do_eager; + + /* larger sends from a gpu buffer use the rendezvous protocol if p2p is supported */ + if (ep_supports_p2p && len > mq->shm_gpu_thresh_rv) { goto do_rendezvous; } - - /* - * Use eager messages if P2P is unsupported between endpoints. - * Potentially use rendezvous with blocking requests only. - */ - if (!is_blocking) - goto do_eager; - } + } else #endif + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ + /* otherwise use eager for INJECT as caller is waiting */ + if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) + == PSM2_MQ_FLAG_INJECT) + goto do_eager; + if (flags_user & PSM2_MQ_FLAG_SENDSYNC) goto do_rendezvous; if (len <= mq->shm_thresh_rv) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) do_eager: -#endif return amsh_mq_send_inner_eager(mq, req, epaddr, args, flags_user, flags_internal, tag, ubuf, len); do_rendezvous: @@ -2600,17 +2689,31 @@ int psm3_get_kassist_mode() return PSMI_KASSIST_OFF; #endif -#if !defined(PSM_CUDA) && !defined(PSM_ONEAPI) union psmi_envvar_val env_kassist; const char *PSM3_KASSIST_MODE_HELP = "PSM Shared memory kernel assist mode " "(cma-put, cma-get, none)"; +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + // GPU limits KASSIST choices to cma-get or none + const char *PSM3_KASSIST_MODE_GPU_HELP = "PSM Shared memory kernel assist mode " + "(cma-get, none)"; +#endif - if (!psm3_getenv("PSM3_KASSIST_MODE", PSM3_KASSIST_MODE_HELP, + if (!psm3_getenv("PSM3_KASSIST_MODE", +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + PSMI_IS_GPU_ENABLED? + PSM3_KASSIST_MODE_GPU_HELP:PSM3_KASSIST_MODE_HELP, +#else + PSM3_KASSIST_MODE_HELP, +#endif PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val) PSMI_KASSIST_MODE_DEFAULT_STRING, &env_kassist)) { char *s = env_kassist.e_str; - if (strcasecmp(s, "cma-put") == 0) + if ( +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + ! 
PSMI_IS_GPU_ENABLED && +#endif + strcasecmp(s, "cma-put") == 0) mode = PSMI_KASSIST_CMA_PUT; else if (strcasecmp(s, "cma-get") == 0) mode = PSMI_KASSIST_CMA_GET; @@ -2622,7 +2725,6 @@ int psm3_get_kassist_mode() mode = PSMI_KASSIST_CMA_GET; } } -#endif return mode; } @@ -3005,11 +3107,9 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val) CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) if ((err = am_cuda_memhandle_cache_alloc(&ptl->memhandle_cache, env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) goto fail; -#endif } } #endif @@ -3160,6 +3260,10 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) am_ze_memhandle_cache_free(ptl->memhandle_cache); #endif ptl->memhandle_cache = NULL; +#endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (PSMI_IS_GPU_ENABLED && ptl->gpu_bounce_buf) + PSM3_GPU_HOST_FREE(ptl->gpu_bounce_buf); #endif return PSM2_OK; fail: diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 56df72a6c13..203b9512c3a 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -468,6 +468,10 @@ struct ptl_am { #ifdef PSM_ONEAPI am_ze_memhandle_cache_t memhandle_cache; #endif +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) + void *gpu_bounce_buf; // for H to D +#endif } __attribute__((aligned(64))); #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index 62142f898a9..8a38d22ad4d 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -54,6 +54,7 @@ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ #include "psm_user.h" +#include "psm2_hal.h" #include "psm_mq_internal.h" #include "psm_am_internal.h" #include "cmarw.h" @@ -162,19 +163,32 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, * resides on the GPU */ if (req->is_buf_gpu_mem) { - void* gpu_ipc_bounce_buf = psmi_malloc(PSMI_EP_NONE, UNDEFINED, req->req_data.recv_msglen); - size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, - gpu_ipc_bounce_buf, req->req_data.recv_msglen); - psmi_assert_always(nbytes == req->req_data.recv_msglen); - PSM3_GPU_MEMCPY_HTOD(req->req_data.buf, gpu_ipc_bounce_buf, - req->req_data.recv_msglen); + size_t cnt = 0; + if (!ptl->gpu_bounce_buf) + PSM3_GPU_HOST_ALLOC(&ptl->gpu_bounce_buf, AMSH_GPU_BOUNCE_BUF_SZ); + while (cnt < req->req_data.recv_msglen) { + size_t nbytes = min(req->req_data.recv_msglen-cnt, + AMSH_GPU_BOUNCE_BUF_SZ); + size_t res = psm3_cma_get(pid, (void *)(req->rts_sbuf+cnt), + ptl->gpu_bounce_buf, nbytes); + void *buf; + psmi_assert_always(nbytes == res); + if (PSMI_USE_GDR_COPY_RECV(nbytes) + && NULL != (buf = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)req->req_data.buf+cnt, + nbytes, 1, ptl->ep))) + psm3_mq_mtucpy_host_mem(buf, ptl->gpu_bounce_buf, nbytes); + else + PSM3_GPU_MEMCPY_HTOD(req->req_data.buf+cnt, + ptl->gpu_bounce_buf, nbytes); + cnt+= nbytes; + } /* Cuda library has recent optimizations where they do * not guarantee synchronus nature for Host to Device * copies for msg sizes less than 64k. The event record * and synchronize calls are to guarentee completion. */ PSM3_GPU_SYNCHRONIZE_MEMCPY(); - psmi_free(gpu_ipc_bounce_buf); } else { /* cma can be done in handler context or not. 
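 * Either way the copy itself is a single psm3_cma_get() (Linux cross-memory
 * attach) read straight from the sender's address space, no bounce buffer.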
*/ size_t nbytes = psm3_cma_get(pid, (void *)req->rts_sbuf, diff --git a/prov/psm3/psm3/ptl_am/ptl_fwd.h b/prov/psm3/psm3/ptl_am/ptl_fwd.h index e7dcd060d22..85593aad847 100644 --- a/prov/psm3/psm3/ptl_am/ptl_fwd.h +++ b/prov/psm3/psm3/ptl_am/ptl_fwd.h @@ -60,5 +60,6 @@ extern struct ptl_ctl_init psm3_ptl_amsh; extern int psm3_shm_mq_rv_thresh; +extern int psm3_shm_mq_gpu_rv_thresh; #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 6e9b94f3a97..2bdd85a309c 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -137,7 +137,8 @@ struct ips_protoexp { #ifdef PSM_CUDA CUstream cudastream_recv; #elif defined(PSM_ONEAPI) - ze_command_queue_handle_t cq_recv; // NULL if psm3_oneapi_immed_async_copy + /* Will not be used if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t cq_recvs[MAX_ZE_DEVICES]; #endif }; @@ -201,6 +202,7 @@ struct ips_tid_send_desc { * would need to attach to a tidsendc would be 2 */ struct ips_gpu_hostbuf *gpu_hostbuf[2]; + struct ips_gpu_hostbuf *gpu_split_buf; /* Number of hostbufs attached */ uint8_t gpu_num_buf; #endif @@ -362,4 +364,11 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ptl_arg_t rdescid, uint32_t tidflow_genseq, ips_tid_session_list *tid_list, uint32_t tid_list_size); + +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +// buffers for GPU send copy pipeline +struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, + uint32_t nbytes, int allow_temp); +void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset); +#endif #endif /* #ifndef __IPS_EXPECTED_PROTO_H__ */ diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.c b/prov/psm3/psm3/ptl_ips/ips_path_rec.c index 3db38328818..de57f5317e9 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.c +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.c @@ -127,8 +127,12 @@ enum psm3_ibv_rate ips_link_speed_to_enum(uint64_t link_speed) return PSM3_IBV_RATE_300_GBPS; else if (link_speed <= 400*PSM3_GIGABIT) return PSM3_IBV_RATE_400_GBPS; - else + else if (link_speed <= 600*PSM3_GIGABIT) return PSM3_IBV_RATE_600_GBPS; + else if (link_speed <= 800*PSM3_GIGABIT) + return PSM3_IBV_RATE_800_GBPS; + else + return PSM3_IBV_RATE_1200_GBPS; } static uint64_t ips_enum_to_link_speed(enum psm3_ibv_rate rate) @@ -155,6 +159,8 @@ static uint64_t ips_enum_to_link_speed(enum psm3_ibv_rate rate) case PSM3_IBV_RATE_50_GBPS: return 50*PSM3_GIGABIT; case PSM3_IBV_RATE_400_GBPS: return 400*PSM3_GIGABIT; case PSM3_IBV_RATE_600_GBPS: return 600*PSM3_GIGABIT; + case PSM3_IBV_RATE_800_GBPS: return 800*PSM3_GIGABIT; + case PSM3_IBV_RATE_1200_GBPS: return 1200*PSM3_GIGABIT; default: return 100*PSM3_GIGABIT; } } @@ -458,6 +464,51 @@ ips_none_path_rec(struct ips_proto *proto, return err; } +/* parse error check timeouts for PSM3_ERRCHK_TIMEOUT or PSM3_ERRCHK_TIMEOUT_US + * format is min:max:factor + * all must be non-zero, min must be <= max + * Any field can be omitted, in which case the default (input tvals) is used + * for that field.
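+ * e.g. "10:100:2" sets min=10, max=100, factor=2; ":200:" overrides only max
+ * Returns: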
+ * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_errchk_timeout(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 3, tvals); + if (ret < 0) + return ret; + if (tvals[0] < 0 || tvals[1] < 0 || tvals[2] < 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Negative values not allowed"); + return -2; + } + if (tvals[0] == 0 || tvals[1] == 0 || tvals[2] == 0) { + if (errstr_size) + snprintf(errstr, errstr_size, " Zero values not allowed"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min (%d) must be <= max (%d)", tvals[0], tvals[1]); + return -2; + } + return 0; +} + +static int parse_check_errchk_timeout(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_errchk_timeout(val.e_str, errstr_size, errstr, tvals); +} + static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; @@ -478,17 +529,18 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) IPS_PROTO_ERRCHK_FACTOR_DEFAULT }; - if (!psm3_getenv("PSM3_ERRCHK_TIMEOUT", - "Errchk timeouts in mS ", + (void)psm3_getenv_range("PSM3_ERRCHK_TIMEOUT", + "Errchk timeouts in milliseconds ", + "Specified as min:max:factor where min and max is the range of timeouts\nand factor is the multiplier for growing timeout", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)PSM_TID_TIMEOUT_DEFAULT, - &env_to)) { - /* Not using default values, parse what we can */ - (void)psm3_parse_str_tuples(env_to.e_str, 3, tvals); - /* Adjust for max smaller than min, things would break */ - if (tvals[1] < tvals[0]) - tvals[1] = tvals[0]; + (union psmi_envvar_val)NULL, + (union psmi_envvar_val)NULL, + parse_check_errchk_timeout, tvals, &env_to); + if (parse_errchk_timeout(env_to.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } proto->epinfo.ep_timeout_ack = ms_2_cycles(tvals[0]); @@ -502,22 +554,26 @@ static psm2_error_t ips_none_path_rec_init(struct ips_proto *proto) * This allows values in units of microseconds and will override * any values specified in PSM3_ERRCHK_TIMEOUT */ - if (!psm3_getenv("PSM3_ERRCHK_TIMEOUT_US", - "Errchk timeouts in usec ", + int us_tvals[3] = { + IPS_PROTO_ERRCHK_MS_MIN_DEFAULT*1000, + IPS_PROTO_ERRCHK_MS_MAX_DEFAULT*1000, + IPS_PROTO_ERRCHK_FACTOR_DEFAULT + }; + if (1 > psm3_getenv_range("PSM3_ERRCHK_TIMEOUT_US", + "Errchk timeouts in microseconds ", + "Specified as min:max:factor where min and max is the range of timeouts\nand factor is the multiplier for growing timeout", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)PSM_TID_TIMEOUT_DEFAULT_US, - &env_to)) { - /* Not using default values, parse what we can */ - int us_tvals[3] = { - IPS_PROTO_ERRCHK_MS_MIN_DEFAULT*1000, - IPS_PROTO_ERRCHK_MS_MAX_DEFAULT*1000, - IPS_PROTO_ERRCHK_FACTOR_DEFAULT - }; - (void)psm3_parse_str_tuples(env_to.e_str, 3, us_tvals); - /* Adjust for max smaller than min, things would break */ - if (us_tvals[1] < us_tvals[0]) - us_tvals[1] = us_tvals[0]; + (union psmi_envvar_val)NULL, + (union 
psmi_envvar_val)NULL, + parse_check_errchk_timeout, us_tvals, &env_to)) { + // value specified (perhaps bad input), use + // what was returned (will be default if bad input) + if (parse_errchk_timeout(env_to.e_str, 0, NULL, us_tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } proto->epinfo.ep_timeout_ack = us_2_cycles(us_tvals[0]); proto->epinfo.ep_timeout_ack_max = us_2_cycles(us_tvals[1]); proto->epinfo.ep_timeout_ack_factor = us_tvals[2]; diff --git a/prov/psm3/psm3/ptl_ips/ips_path_rec.h b/prov/psm3/psm3/ptl_ips/ips_path_rec.h index ebca755e95a..17fa819a396 100644 --- a/prov/psm3/psm3/ptl_ips/ips_path_rec.h +++ b/prov/psm3/psm3/ptl_ips/ips_path_rec.h @@ -124,6 +124,8 @@ enum psm3_ibv_rate { PSM3_IBV_RATE_50_GBPS = 20, PSM3_IBV_RATE_400_GBPS = 21, PSM3_IBV_RATE_600_GBPS = 22, + PSM3_IBV_RATE_800_GBPS = 23, + PSM3_IBV_RATE_1200_GBPS = 24, }; static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index d4c723a430a..f6c9c215bcb 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -452,6 +452,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; if ((err = psm3_parse_mpool_env(proto->mq, 1, &rlim, &maxsz, &chunksz))) @@ -459,10 +461,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + max_elements = (maxsz*1024*1024) / psm3_mq_max_window_rv(proto->mq, 1); /* mpool requires max_elements to be power of 2. round down. 
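 * e.g. 1 << (31 - __builtin_clz(100)) == 64, the largest power of 2 <= 100.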
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); - proto->gpu_hostbuf_send_cfg.bufsz = proto->mq->hfi_base_window_rv; + /* need at least 3 buffers */ + max_elements = max(4, max_elements); + proto->gpu_hostbuf_send_cfg.bufsz = psm3_mq_max_window_rv(proto->mq, 1); proto->gpu_hostbuf_pool_send = psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), chunksz, max_elements, 0, @@ -476,6 +480,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, "Couldn't allocate GPU host send buffer pool"); goto fail; } + psm3_mpool_get_obj_info(proto->gpu_hostbuf_pool_send, + NULL, &pool_num_obj_max_total); /* use the same number of elements for the small pool */ proto->gpu_hostbuf_small_send_cfg.bufsz = GPU_SMALLHOSTBUF_SZ; @@ -492,6 +498,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, "Couldn't allocate GPU host small send buffer pool"); goto fail; } + psm3_mpool_get_obj_info(proto->gpu_hostbuf_pool_small_send, + NULL, &small_pool_num_obj_max_total); /* Configure the amount of prefetching */ union psmi_envvar_val env_prefetch_limit; @@ -502,6 +510,12 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, (union psmi_envvar_val)GPU_WINDOW_PREFETCH_DEFAULT, &env_prefetch_limit); proto->gpu_prefetch_limit = env_prefetch_limit.e_uint; + _HFI_DBG("GPU Send Copy Pipeline: %u of %u bytes (small), %u of %u bytes, prefetch %u\n", + small_pool_num_obj_max_total, + proto->gpu_hostbuf_small_send_cfg.bufsz, + pool_num_obj_max_total, + proto->gpu_hostbuf_send_cfg.bufsz, + proto->gpu_prefetch_limit); } #endif /* PSM_CUDA || PSM_ONEAPI */ @@ -530,7 +544,8 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // but can survive if it's smaller as we will delay transfer til avail if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { cache_pri_entries = HFI_TF_NFLOWS + proto->ep->hfi_num_send_rdma; - cache_pri_size = (uint64_t)cache_pri_entries * proto->mq->hfi_base_window_rv; + cache_pri_size = (uint64_t)cache_pri_entries * + psm3_mq_max_window_rv(proto->mq, 0); if (MR_CACHE_USER_CACHING(proto->ep->mr_cache_mode)) { // we attempt to cache, so can benefit from more than inflight // make enough room to have a good number of entries @@ -578,7 +593,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, default_cache_entries = max(default_cache_entries, ((uint64_t)env_mr_cache_size_mb.e_uint * (1024*1024)) - / max( proto->mq->hfi_base_window_rv/2, + / max(psm3_mq_max_window_rv(proto->mq, 0)/2, proto->mq->hfi_thresh_rv)); } else { // only send DMA, size based on smaller MRs @@ -2292,10 +2307,10 @@ ips_proto_register_stats(struct ips_proto *proto) "RDMA rendezvous message bytes received direct into a GPU buffer", &proto->strat_stats.rndv_rdma_gdr_recv_bytes), PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv", - "RDMA rendezvous messages received into via pipelined GPU copy", + "RDMA rendezvous messages received into a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_recv), PSMI_STATS_DECLU64("rndv_rdma_hbuf_recv_bytes", - "RDMA rendezvous message bytes received into via pipelined GPU copy", + "RDMA rendezvous message bytes received into a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_recv_bytes), #endif PSMI_STATS_DECLU64("rndv_rdma_cpu_send", @@ -2312,10 +2327,10 @@ ips_proto_register_stats(struct ips_proto *proto) "RDMA rendezvous message bytes sent from a GPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_gdr_send_bytes), PSMI_STATS_DECLU64("rndv_rdma_hbuf_send", - "RDMA rendezvous messages sent from a GPU buffer into via pipelined GPU copy", + "RDMA rendezvous 
messages sent from a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_send), PSMI_STATS_DECLU64("rndv_rdma_hbuf_send_bytes", - "RDMA rendezvous message bytes sent from a GPU buffer into via pipelined GPU copy", + "RDMA rendezvous message bytes sent from a GPU buffer via pipelined GPU copy", &proto->strat_stats.rndv_rdma_hbuf_send_bytes), #endif }; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index eccd6ce3d25..9c1b920f075 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -437,7 +437,8 @@ struct ips_proto { #ifdef PSM_CUDA CUstream cudastream_send; #elif defined(PSM_ONEAPI) - ze_command_queue_handle_t cq_send; // NULL if psm3_oneapi_immed_async_copy + /* Will not be used if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t cq_sends[MAX_ZE_DEVICES]; #endif #if defined(PSM_CUDA) || defined(PSM_ONEAPI) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index 057bdb74c5c..c39231b8679 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -260,10 +260,11 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) { - if (PSMI_IS_GPU_ENABLED && - !(proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { + if (PSMI_IS_GPU_ENABLED) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; + uint32_t pool_num_obj_max_total; + uint32_t small_pool_num_obj_max_total; if ((err = psm3_parse_mpool_env(protoexp->proto->mq, 1, &rlim, &maxsz, &chunksz))) @@ -271,11 +272,14 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, /* the maxsz is the amount in MB, not the number of entries, * since the element size depends on the window size */ - max_elements = (maxsz*1024*1024) / proto->mq->hfi_base_window_rv; + max_elements = (maxsz*1024*1024) / + psm3_mq_max_window_rv(proto->mq, 1); /* mpool requires max_elements to be power of 2. round down. 
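 * 31 - __builtin_clz(x) is the bit index of x's highest set bit, so the
 * shift below yields the largest power of 2 that is <= x.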
*/ max_elements = 1 << (31 - __builtin_clz(max_elements)); + /* need at least 2 buffers */ + max_elements = max(2, max_elements); protoexp->gpu_hostbuf_recv_cfg.bufsz = - proto->mq->hfi_base_window_rv; + psm3_mq_max_window_rv(proto->mq, 1); protoexp->gpu_hostbuf_pool_recv = psm3_mpool_create_for_gpu(sizeof(struct ips_gpu_hostbuf), @@ -290,6 +294,8 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, "Couldn't allocate GPU host receive buffer pool"); goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_recv, + NULL, &pool_num_obj_max_total); protoexp->gpu_hostbuf_small_recv_cfg.bufsz = GPU_SMALLHOSTBUF_SZ; @@ -306,6 +312,13 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, "Couldn't allocate GPU host small receive buffer pool"); goto fail; } + psm3_mpool_get_obj_info(protoexp->gpu_hostbuf_pool_small_recv, + NULL, &small_pool_num_obj_max_total); + _HFI_DBG("GPU Recv Copy Pipeline: %u of %u bytes (small), %u of %u bytes\n", + small_pool_num_obj_max_total, + protoexp->gpu_hostbuf_small_recv_cfg.bufsz, + pool_num_obj_max_total, + protoexp->gpu_hostbuf_recv_cfg.bufsz); PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp); STAILQ_INIT(&protoexp->gpupend_getreqsq); } else { @@ -460,7 +473,7 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, uint64_t nbytes; PSM2_LOG_MSG("entering"); - psmi_assert((req->mq->hfi_base_window_rv % PSMI_PAGESIZE) == 0); + psmi_assert((psm3_mq_get_window_rv(req) % PSMI_PAGESIZE) == 0); getreq = (struct ips_tid_get_request *) psm3_mpool_get(protoexp->tid_getreq_pool); @@ -519,8 +532,9 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, else #endif nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_PAGESIZE); - getreq->tidgr_rndv_winsz = - min(nbytes, req->mq->hfi_base_window_rv); + getreq->tidgr_rndv_winsz = psm3_mq_get_window_rv(req); + if (nbytes < getreq->tidgr_rndv_winsz) + getreq->tidgr_rndv_winsz = nbytes; _HFI_MMDBG("posting TID get request: nbytes=%"PRIu64" winsz=%u len=%u\n", nbytes, getreq->tidgr_rndv_winsz, getreq->tidgr_length); // we have now computed the size of each TID sequence (tidgr_rndv_winsz) @@ -635,12 +649,19 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) -static -void psmi_deallocate_chb(struct ips_gpu_hostbuf* chb) +void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset) { - PSM3_GPU_HOSTBUF_DESTROY(chb); - psmi_free(chb); - return; + if (chb->is_tempbuf) { + PSM3_GPU_HOSTBUF_DESTROY(chb); + psmi_free(chb); + } else { + chb->req = NULL; + chb->offset = 0; + chb->bytes_read = 0; + if (reset) + PSM3_GPU_HOSTBUF_RESET(chb); + psm3_mpool_put(chb); + } } #endif @@ -677,19 +698,13 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) STAILQ_REMOVE(&req->sendreq_prefetch, tidsendc->gpu_hostbuf[0], ips_gpu_hostbuf, req_next); - if (tidsendc->gpu_hostbuf[0]->is_tempbuf) - psmi_deallocate_chb(tidsendc->gpu_hostbuf[0]); - else { - tidsendc->gpu_hostbuf[0]->req = NULL; - tidsendc->gpu_hostbuf[0]->offset = 0; - tidsendc->gpu_hostbuf[0]->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(tidsendc->gpu_hostbuf[0]); - psm3_mpool_put(tidsendc->gpu_hostbuf[0]); - } + psm3_ips_deallocate_send_chb(tidsendc->gpu_hostbuf[0], 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } - } else - psmi_free(tidsendc->userbuf); + } else { + psm3_ips_deallocate_send_chb(tidsendc->gpu_split_buf, 0); + tidsendc->gpu_split_buf = NULL; + } } #endif /* Check if we can complete the send request. 
*/ @@ -1220,7 +1235,9 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, // For User RC conn_ref is context we set in rc_qp_create (*ipsaddr) // For Kernel RC, conn_ref is the conn handle (psm3_rv_conn_get_conn_handle) // maybe this should be an assert so don't add test in production code + // caller can't get qp_context (conn_ref) from rbuf_qp for SRQ if ((conn_type == RDMA_IMMED_USER_RC) + && ! proto->ep->verbs_ep.srq && (uint64_t)tidrecvc->ipsaddr != conn_ref) { // RDWA Write is not on expected RC QP from remote node _HFI_ERROR("RDMA Write on Wrong User QP 0x%"PRIx64", expect 0x%"PRIx64"\n", @@ -1304,19 +1321,41 @@ psmi_gpu_reclaim_hostbufs(struct ips_tid_get_request *getreq) } return PSM2_OK; } -static -struct ips_gpu_hostbuf* psmi_allocate_chb(uint32_t window_len) + +// allocate a chb control structure. The actual buffer and event needed for the +// DTOH async copy are allocated in chb's 1st use in PSM3_GPU_MEMCPY_DTOH_START +struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, + uint32_t nbytes, int allow_temp) { - struct ips_gpu_hostbuf* chb = (struct ips_gpu_hostbuf*) - psmi_calloc(PSMI_EP_NONE, + struct ips_gpu_hostbuf* chb = NULL; + unsigned bufsz; + + if (nbytes <= GPU_SMALLHOSTBUF_SZ) { + chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( + proto->gpu_hostbuf_pool_small_send); + bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; + } + if (chb == NULL) { + chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( + proto->gpu_hostbuf_pool_send); + bufsz = proto->gpu_hostbuf_send_cfg.bufsz; + } + + /* were any buffers available? If not force allocate */ + if (chb == NULL && allow_temp) { + chb = (struct ips_gpu_hostbuf*) psmi_calloc(PSMI_EP_NONE, UNDEFINED, 1, sizeof(struct ips_gpu_hostbuf)); - if_pf (chb == NULL) { - psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate cuda host buffers "); - return NULL; + if_pf (chb == NULL) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate GPU host bounce buffers "); + return NULL; + } + chb->is_tempbuf = 1; + bufsz = nbytes; } - PSM3_GPU_HOSTBUF_FORCE_INIT(chb, window_len); + if (chb && ! chb->host_buf) + PSM3_GPU_HOST_ALLOC(&chb->host_buf, bufsz); return chb; } @@ -1333,21 +1372,12 @@ void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, if (req->prefetch_send_msgoff < req->req_data.send_msglen) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, req->req_data.buf_len); - unsigned bufsz = 0; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } + chb = psm3_ips_allocate_send_chb(proto, window_len, 0); /* were any buffers available for the prefetcher? 
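 * If not, just return; the prefetcher is run again from the completion
 * path once psm3_ips_deallocate_send_chb() returns a buffer to the pool.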
*/ if (chb == NULL) return; @@ -1358,7 +1388,7 @@ void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, chb->gpu_buf = (uint8_t*)req->req_data.buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); return; @@ -1384,28 +1414,13 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, while (req->prefetch_send_msgoff < tsess_srcoff + tsess_length) { /* some data remains to be sent */ offset = req->prefetch_send_msgoff; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, req->req_data.buf_len); - unsigned bufsz = 0; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } - - /* were any buffers available? If not force allocate */ - if (chb == NULL) { - chb = psmi_allocate_chb(window_len); - psmi_assert(chb); - chb->is_tempbuf = 1; - } + /* if no buffers available, force allocate of a temp buf */ + chb = psm3_ips_allocate_send_chb(proto, window_len, 1); req->prefetch_send_msgoff += window_len; chb->offset = offset; chb->size = window_len; @@ -1413,19 +1428,24 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, chb->gpu_buf = (uint8_t*)req->req_data.buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); if (type == PSMI_GPU_PARTIAL_MATCH_FOUND) { + // caller matched 1st chb, but needed more prefetched + // see if we have what we need now if ((tsess_srcoff < chb->offset) && ((tsess_srcoff + tsess_length) > chb->offset)) { + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb tidsendc->gpu_hostbuf[0] = chb_prev; tidsendc->gpu_hostbuf[1] = chb; tidsendc->gpu_num_buf = 2; - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tsess_length); + tidsendc->gpu_split_buf = psm3_ips_allocate_send_chb(proto, + tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); @@ -1433,29 +1453,35 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, } } else { if (attached) { + // we attached one in prior loop, now have + // a second, should have what we need now + psmi_assert((tsess_srcoff + tsess_length) > chb->offset); + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb tidsendc->gpu_hostbuf[0] = chb_prev; tidsendc->gpu_hostbuf[1] = chb; tidsendc->gpu_num_buf = 2; - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tsess_length); + tidsendc->gpu_split_buf = psm3_ips_allocate_send_chb(proto, + tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start); - attached = 0; 
return; } if ((tsess_srcoff > chb->offset) && (tsess_srcoff < (chb->offset + chb->size)) && ((tsess_srcoff + tsess_length) > (chb->offset + chb->size))) { + // we prefetched one, but need another chb_prev = chb; attached = 1; - chb = NULL; continue; } else if ((chb->offset <= tsess_srcoff) && ((tsess_srcoff + tsess_length) <= (chb->offset+chb->size))) { + // we prefetched one and have what we need tidsendc->gpu_hostbuf[0] = chb; tidsendc->gpu_hostbuf[1] = NULL; tidsendc->gpu_num_buf = 1; @@ -1466,8 +1492,7 @@ void psmi_attach_chb_to_tidsendc(struct ips_protoexp *protoexp, (void *)((uintptr_t)tidsendc->userbuf + tsess_unaligned_start ); return; - } else - chb = NULL; + } } } } @@ -1575,11 +1600,11 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_num_buf = 0; if (req->gpu_hostbuf_used) { /* To get a match: - * 1. Tid list offset + length is contained within a chb - * 2. Tid list offset + length is contained within - * the prefetched offset of this req. - * 3. Tid list offset + length is partially prefetched - * within one chb. (A partial match) + * 1. FULL - Tid list offset + length is contained within a chb + * 2. SPLIT - Tid list offset + length is contained within + * the prefetched offset of this req. (2 chb) + * 3. PARTIAL - Tid list offset + length is partially prefetched + * within one chb. */ STAILQ_FOREACH(chb, &req->sendreq_prefetch, req_next) { rc = psmi_find_match_in_prefeteched_chb(chb, @@ -1600,10 +1625,13 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_hostbuf[1] = NULL; tidsendc->gpu_num_buf = 1; } else if (rc == PSMI_GPU_SPLIT_MATCH_FOUND){ - void *buffer = psmi_malloc(PSMI_EP_NONE, UNDEFINED, - tid_list->tsess_length); + // will collect the 2 prefetched chb's for this + // RDMA Write send into a single CPU temp buffer + // do alloc now, hoping to hide it behind GPU async copy to chb + tidsendc->gpu_split_buf =psm3_ips_allocate_send_chb(protoexp->proto, + tid_list->tsess_length, 1); tidsendc->userbuf = - (void *)((uintptr_t) buffer); + (void *)((uintptr_t) tidsendc->gpu_split_buf->host_buf); tidsendc->buffer = (void *)((uintptr_t)tidsendc->userbuf ); @@ -1612,6 +1640,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, tidsendc->gpu_hostbuf[1] = chb_next; tidsendc->gpu_num_buf = 2; } else if (rc == PSMI_GPU_PARTIAL_MATCH_FOUND) { + // need to prefetch more psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, chb, @@ -1620,6 +1649,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, 0, rc); } else { + // no match, need to prefetch psmi_attach_chb_to_tidsendc(protoexp, req, tidsendc, NULL, @@ -1849,6 +1879,7 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) #if defined(PSM_CUDA) || defined(PSM_ONEAPI) struct ips_gpu_hostbuf *chb, *chb_next; uint32_t offset_in_chb, i; + // wait for async copies into needed prefetcher chb's to finish for (i = 0; i < tidsendc->gpu_num_buf; i++) { chb = tidsendc->gpu_hostbuf[i]; if (chb) { @@ -1864,8 +1895,9 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) chb = tidsendc->gpu_hostbuf[0]; chb_next = tidsendc->gpu_hostbuf[1]; offset_in_chb = tidsendc->tid_list.tsess_srcoff - chb->offset; - /* Copying data from multiple cuda - * host buffers into a bounce buffer. + /* Copying data from multiple prefetched + * host buffers into a single temp CPU bounce buffer. 
+ * so can issue a single RDMA Write from the temp bounce buffer */ memcpy(tidsendc->buffer, (void *)((uintptr_t)chb->host_buf + offset_in_chb), chb->size-offset_in_chb); @@ -1881,29 +1913,13 @@ psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) if(chb->bytes_read == chb->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb, ips_gpu_hostbuf, req_next); - if (chb->is_tempbuf) - psmi_deallocate_chb(chb); - else { - chb->req = NULL; - chb->offset = 0; - chb->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(chb); - psm3_mpool_put(chb); - } + psm3_ips_deallocate_send_chb(chb, 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } if(chb_next->bytes_read == chb_next->size) { STAILQ_REMOVE(&tidsendc->mqreq->sendreq_prefetch, chb_next, ips_gpu_hostbuf, req_next); - if (chb_next->is_tempbuf) - psmi_deallocate_chb(chb_next); - else{ - chb_next->req = NULL; - chb_next->offset = 0; - chb_next->bytes_read = 0; - PSM3_GPU_HOSTBUF_RESET(chb_next); - psm3_mpool_put(chb_next); - } + psm3_ips_deallocate_send_chb(chb_next, 1); psmi_gpu_run_prefetcher(protoexp, tidsendc); } /* Clean Up tidsendc ref's to split cuda hostbufs when no longer needed */ @@ -2190,8 +2206,10 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->stats.nReXmit = 0; tidrecvc->stats.nErrChkReceived = 0; - _HFI_EXP("alloc tidrecv=%d\n", - tidrecvc->rdescid._desc_idx); + _HFI_EXP("alloc tidrecv=%d srcoff=%u length=%u\n", + tidrecvc->rdescid._desc_idx, + tidrecvc->tid_list.tsess_srcoff, + tidrecvc->tid_list.tsess_length); tidrecvc->grantscb = grantscb; diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index b4582c6521d..cdcc480e89a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -158,8 +158,7 @@ int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) chb = STAILQ_FIRST(&req->sendreq_prefetch); STAILQ_REMOVE_HEAD(&req->sendreq_prefetch, req_next); - PSM3_GPU_HOSTBUF_RESET(chb); - psm3_mpool_put(chb); + psm3_ips_deallocate_send_chb(chb, 1); } } #endif @@ -508,24 +507,13 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, while ((offset < len) && (prefetch_lookahead < proto->gpu_prefetch_limit)) { chb = NULL; + psmi_assert(req->is_buf_gpu_mem); window_len = ips_gpu_next_window( - proto->mq->hfi_base_window_rv, + psm3_mq_get_window_rv(req), offset, len); - unsigned bufsz; - if (window_len <= GPU_SMALLHOSTBUF_SZ) { - chb = (struct ips_gpu_hostbuf *) - psm3_mpool_get( - proto->gpu_hostbuf_pool_small_send); - bufsz = proto->gpu_hostbuf_small_send_cfg.bufsz; - } - if (chb == NULL) { - chb = (struct ips_gpu_hostbuf *) - psm3_mpool_get( - proto->gpu_hostbuf_pool_send); - bufsz = proto->gpu_hostbuf_send_cfg.bufsz; - } + chb = psm3_ips_allocate_send_chb(proto, window_len, 0); /* any buffers available? */ if (chb == NULL) { @@ -540,7 +528,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, chb->gpu_buf = (uint8_t*)buf + offset; chb->bytes_read = 0; - PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len, bufsz); + PSM3_GPU_MEMCPY_DTOH_START(proto, chb, window_len); STAILQ_INSERT_TAIL(&req->sendreq_prefetch, chb, req_next); @@ -590,7 +578,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, && ips_epaddr_rdma_connected(ipsaddr) && !req->mr #if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && len > GPUDIRECT_THRESH_RV + && (!PSMI_IS_GPU_ENABLED || len > GPUDIRECT_THRESH_RV) && ! 
req->gpu_hostbuf_used #endif ) { @@ -625,9 +613,11 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, #if defined(PSM_CUDA) || defined(PSM_ONEAPI) static inline -int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len) +int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, + uint32_t flags_user) { if ( + !(flags_user & PSM2_MQ_FLAG_INJECT) && len > gpu_thresh_rndv){ return 1; } @@ -667,6 +657,8 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ipsaddr = (ips_epaddr_t *)mepaddr; } psmi_assert(ipsaddr->cstate_outgoing == CSTATE_ESTABLISHED); + // psmx3 layer never uses mq_isend for FI_INJECT + psmi_assert(! (flags_user & PSM2_MQ_FLAG_INJECT)); proto = ((psm2_epaddr_t) ipsaddr)->proto; @@ -681,7 +673,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user if (req->is_buf_gpu_mem) { gpu_mem = 1; PSM3_MARK_BUF_SYNCHRONOUS(ubuf); - if (psm3_is_needed_rendezvous(proto, len)) + if (psm3_is_needed_rendezvous(proto, len, 0)) goto do_rendezvous; } #endif @@ -1026,12 +1018,13 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); if (gpu_mem) { PSM3_MARK_BUF_SYNCHRONOUS(ubuf); - if (psm3_is_needed_rendezvous(proto, len)) + if (psm3_is_needed_rendezvous(proto, len, flags)) goto do_rendezvous; } #endif flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + /* SENDSYNC gets priority, assume not used for MPI_isend w/INJECT */ if (flags & PSM2_MQ_FLAG_SENDSYNC) { goto do_rendezvous; } else if (len <= mq->hfi_thresh_tiny) { @@ -1117,7 +1110,11 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } else { user_buffer = ubuf; #ifdef PSM_HAVE_REG_MR - if (len > proto->iovec_gpu_thresh_eager_blocking) { + if (len > proto->iovec_gpu_thresh_eager_blocking +#ifdef PSM_INJECT_NOSDMA + && !(flags & PSM2_MQ_FLAG_INJECT) +#endif + ) { scb->mr = psm3_verbs_reg_mr( proto->mr_cache, 0, (void*)user_buffer, len, IBV_ACCESS_IS_GPU_ADDR); @@ -1142,7 +1139,11 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, #endif // PSM_CUDA || PSM_ONEAPI { #ifdef PSM_HAVE_REG_MR - if (len > proto->iovec_thresh_eager_blocking) { + if (len > proto->iovec_thresh_eager_blocking +#ifdef PSM_INJECT_NOSDMA + && !(flags & PSM2_MQ_FLAG_INJECT) +#endif + ) { scb->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, (void*)user_buffer, len, 0); } else @@ -1240,6 +1241,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ubuf, len, tag->tag[0], tag->tag[1], tag->tag[2]); } else if (len <= mq->hfi_thresh_rv) { + // for FI_INJECT, eager data comes from the user buffer and needs an end to end ack psm2_mq_req_t req; /* Block until we can get a req */ diff --git a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c index f1cee4faffd..562721a0b37 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c +++ b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c @@ -264,11 +264,64 @@ void psm3_ips_ptl_rcvthread_transfer_ownership(ptl_t *from_ptl_gen, ptl_t *to_pt rcvc->ptl = to_ptl_gen; } +/* parse recv thread frequency for PSM3_RCVTHREAD_FREQ + * format is min_freq[:max_freq[:shift_freq]] + * Any field can be omitted, in which case the default (input tvals) is used + * for that field.
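+ * e.g. "10:100:1" polls between 10 and 100 times/sec, scaling the rate by 2^1;
+ * a min_freq or max_freq of 0 disables timeout polling (interrupt only)
+ * Returns: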
+ * 0 - successfully parsed, tvals updated + * -1 - str empty, tvals unchanged + * -2 - syntax error, tvals may have been changed + */ +static int parse_rcvthread_freq(const char *str, + size_t errstr_size, char errstr[], + int tvals[3]) +{ + psmi_assert(tvals); + int ret = psm3_parse_str_tuples(str, 3, tvals); + if (ret < 0) + return ret; + if (tvals[0] == 0 || tvals[1] == 0) { + // disables receiver thread, no other checks needed + return 0; + } + if (tvals[0] < 0 || tvals[0] > 1000) { + if (errstr_size) + snprintf(errstr, errstr_size, " min_freq must be 0 to 1000"); + return -2; + } + if (tvals[1] < 0 || tvals[1] > 1000) { + if (errstr_size) + snprintf(errstr, errstr_size, " max_freq must be 0 to 1000"); + return -2; + } + if (tvals[0] > tvals[1]) { + if (errstr_size) + snprintf(errstr, errstr_size, " min_freq (%d) must be <= max_freq (%d)", tvals[0], tvals[1]); + return -2; + } + if (tvals[2] < 0 || tvals[2] > 10) { + if (errstr_size) + snprintf(errstr, errstr_size, " shift_freq must be 0 to 10"); + return -2; + } + return 0; +} + +static int parse_check_rcvthread_freq(int type, + const union psmi_envvar_val val, void *ptr, + size_t errstr_size, char errstr[]) +{ + // parser will set tvals to result, use a copy to protect input of defaults + int tvals[3] = { ((int*)ptr)[0], ((int*)ptr)[1], ((int*)ptr)[2] }; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR_TUPLES); + return parse_rcvthread_freq(val.e_str, errstr_size, errstr, tvals); +} + + psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) { union psmi_envvar_val env_to; char rcv_freq[192]; - int no_timeout = 0; int tvals[3] = { RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT @@ -276,40 +329,19 @@ psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) snprintf(rcv_freq, sizeof(rcv_freq) - 1, "%d:%d:%d", RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, RCVTHREAD_TO_SHIFT); - rcv_freq[sizeof(rcv_freq) - 1] = '\0'; - if (!psm3_getenv("PSM3_RCVTHREAD_FREQ", + (void)psm3_getenv_range("PSM3_RCVTHREAD_FREQ", "Recv Thread frequency (per sec) ", + "Specified as min_freq[:max_freq[:shift_freq]]\nwhere min_freq and max_freq are polls per second\n(0 disables receiver thread)\nand 2^shift_freq is amount to multiply or divide frequency by", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, - (union psmi_envvar_val)rcv_freq, &env_to)) { - /* not using default values */ - (void)psm3_parse_str_tuples(env_to.e_str, 3, tvals); - int invalid = 0; - - if (tvals[0] == 0 || tvals[1] == 0) { - no_timeout = 1; - } else { - if (tvals[0] > 1000) - invalid = 1; - if (tvals[1] > 1000 || tvals[1] < tvals[0]) - invalid = 1; - if (tvals[2] > 10) - invalid = 1; - } - - if (invalid) { - _HFI_INFO - ("Overriding invalid request for RcvThread frequency" - " settings of %s to be <%d:%d:%d>\n", env_to.e_str, - RCVTHREAD_TO_MIN_FREQ, RCVTHREAD_TO_MAX_FREQ, - RCVTHREAD_TO_SHIFT); - tvals[0] = RCVTHREAD_TO_MIN_FREQ; - tvals[1] = RCVTHREAD_TO_MAX_FREQ; - tvals[2] = RCVTHREAD_TO_SHIFT; - } + (union psmi_envvar_val)rcv_freq, + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_rcvthread_freq, tvals, &env_to); + if (parse_rcvthread_freq(env_to.e_str, 0, NULL, tvals) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); } - - if (no_timeout) { + if (tvals[0] == 0 || tvals[1] == 0) { rcvc->last_timeout = -1; _HFI_PRDBG("PSM3_RCVTHREAD_FREQ set to only interrupt " "(no timeouts)\n"); diff --git a/prov/psm3/psm3/ptl_self/ptl.c b/prov/psm3/psm3/ptl_self/ptl.c index 35181f0f3ba..19231015d9b 100644 --- 
a/prov/psm3/psm3/ptl_self/ptl.c +++ b/prov/psm3/psm3/ptl_self/ptl.c @@ -80,6 +80,14 @@ ptl_handle_rtsmatch(psm2_mq_req_t recv_req, int was_posted) psm2_mq_req_t send_req = (psm2_mq_req_t) recv_req->ptl_req_ptr; if (recv_req->req_data.recv_msglen > 0) { +#ifdef PSM_DSA + if (psm3_use_dsa(recv_req->req_data.recv_msglen)) + psm3_dsa_memcpy(recv_req->req_data.buf, + send_req->req_data.buf, + recv_req->req_data.recv_msglen, 0, + &send_req->mq->stats.dsa_stats[0]); + else +#endif psm3_mq_mtucpy(recv_req->req_data.buf, send_req->req_data.buf, recv_req->req_data.recv_msglen); } diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index 219f8201fb1..2c697b1cf20 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -97,9 +97,14 @@ static uint32_t dsa_thresh; // copies > thresh will use DSA // per process (such as OneCCL workers or Intel MPI Multi-EP threading). // But expected counts for such are modest (2-4 for Intel MPI, 8-16 for OneCCL) #define DSA_MAX_QUEUES 32 + +// Default: 2 MB. +#define DSA_MAX_XFER_SIZE_DEFAULT (1 << 21) + // information parsed from PSM3_DSA_WQS static char *dsa_wq_filename[DSA_MAX_PROC][DSA_MAX_QUEUES]; static uint8_t dsa_wq_mode[DSA_MAX_PROC][DSA_MAX_QUEUES]; +static uint32_t dsa_wq_max_xfer_size[DSA_MAX_PROC][DSA_MAX_QUEUES]; static uint32_t dsa_num_wqs[DSA_MAX_PROC]; static uint32_t dsa_num_proc; @@ -108,6 +113,7 @@ struct dsa_wq { const char *wq_filename; // points into dsa_wq_filename void *wq_reg; // mmap memory uint32_t use_count; // how many threads assigned to this WQ + uint32_t max_xfer_size; // Maximum supported transfer size uint8_t dedicated; // is this a dedicated (1) or shared (0) WQ }; static struct dsa_wq dsa_wqs[DSA_MAX_QUEUES]; @@ -119,6 +125,7 @@ static psmi_spinlock_t dsa_wq_lock; // protects dsa_wq.use_count // Each thread is assigned a DSA WQ on 1st memcpy static __thread void *dsa_wq_reg = NULL; static __thread uint8_t dsa_wq_dedicated; +static __thread uint32_t dsa_wq_xfer_limit; // we keep completion record in thread local storage instead of stack // this way if a DSA completion times out and arrives late it still has a @@ -163,6 +170,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, uint32_t cpu_n; uint64_t start_cycles, end_cycles; uint64_t loops; + uint32_t dsa_chk_size; + uint32_t cpu_chk_size; + int t_chunks; + uint32_t dsa_copied_len = 0; + uint32_t cpu_copied_len = 0; + int copied_chunks = 0; + uint32_t dsa_cp_len; #if defined(PSM_CUDA) || defined(PSM_ONEAPI) if (n && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *) src))) { @@ -177,22 +191,31 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, return; } + /* + * Calculate the total chunks. + */ + t_chunks = (n + dsa_wq_xfer_limit - 1) / dsa_wq_xfer_limit; + // TBD - add some statistics for DSA vs CPU copy use // to maximize performance we do part of the copy with CPU while we // wait for DSA to copy the rest if (dsa_ratio) { cpu_n = n/dsa_ratio; + cpu_chk_size = cpu_n / t_chunks; // TBD - should we compute so DSA gets a full multiple of pages and CPU // does the rest? Should we start DSA on a page boundary? 
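// e.g. dsa_ratio=8, n=1MB: the CPU copies 128KB while DSA moves the other
// 896KB, both sides split into t_chunks pieces so no single DSA descriptor
// exceeds the WQ max_transfer_size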
// round down to page boundary //cpu_n = ROUNDDOWNP2(cpu_n, PSMI_PAGESIZE); // round to a multiple of 8 bytes at least - cpu_n = ROUNDDOWNP2(cpu_n, 8); + cpu_chk_size = ROUNDDOWNP2(cpu_chk_size, 8); + cpu_n = cpu_chk_size * t_chunks; } else { cpu_n = 0; + cpu_chk_size = 0; } dsa_n = n - cpu_n; + dsa_chk_size = (dsa_n + t_chunks - 1)/t_chunks; dsa_src = (void*)((uintptr_t)src + cpu_n); dsa_dest = (void*)((uintptr_t)dest + cpu_n); psmi_assert(dsa_n); @@ -200,6 +223,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // comp ptr must be 32 byte aligned comp = (struct dsa_completion_record *)(((uintptr_t)&dsa_comp[0] + 0x1f) & ~0x1f); + +restart: comp->status = 0; desc.opcode = DSA_OPCODE_MEMMOVE; /* set CRAV (comp address valid) and RCR (request comp) so get completion */ @@ -218,9 +243,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, // for overall server. Best to take the pain here as page faults should // be rare during steady state of most apps // desc.flags |= IDXD_OP_FLAG_BOF; - desc.xfer_size = dsa_n; - desc.src_addr = (uintptr_t)dsa_src; - desc.dst_addr = (uintptr_t)dsa_dest; + if (copied_chunks < (t_chunks - 1)) + dsa_cp_len = dsa_chk_size; + else + dsa_cp_len = dsa_n - dsa_copied_len; + desc.xfer_size = dsa_cp_len; + desc.src_addr = (uintptr_t)dsa_src + dsa_copied_len; + desc.dst_addr = (uintptr_t)dsa_dest + dsa_copied_len; desc.completion_addr = (uintptr_t)comp; // make sure completion status zeroing fully written before post to HW @@ -239,9 +268,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (get_cycles() > end_cycles) { _HFI_INFO("Disabling DSA: DSA SWQ Enqueue Timeout\n"); dsa_available = 0; - memcpy(dest, src, n); stats->dsa_error++; - return; + goto memcpy_exit; } } stats->dsa_swq_wait_ns += cycles_to_nanosecs(get_cycles() - start_cycles); @@ -252,11 +280,13 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (cpu_n) { // while DSA does it's thing, we copy rest via CPU - memcpy(dest, src, cpu_n); + memcpy((void *)((uintptr_t)dest + cpu_copied_len), + (void *)((uintptr_t)src + cpu_copied_len), cpu_chk_size); + cpu_copied_len += cpu_chk_size; } stats->dsa_copy++; - stats->dsa_copy_bytes += dsa_n; + stats->dsa_copy_bytes += dsa_cp_len; // wait for DSA to finish start_cycles = get_cycles(); @@ -269,8 +299,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, if (get_cycles() > end_cycles && comp->status == 0) { _HFI_INFO("Disabling DSA: DSA Hardware Timeout\n"); dsa_available = 0; - memcpy(dsa_dest, dsa_src, dsa_n); stats->dsa_error++; + goto memcpy_exit; return; } loops++; @@ -294,9 +324,22 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, stats->dsa_page_fault_rd++; _HFI_VDBG("DSA desc failed: page fault status %u\n", comp->status); } - memcpy(dsa_dest, dsa_src, dsa_n); - return; + goto memcpy_exit; } + /* Check loop status */ + dsa_copied_len += dsa_cp_len; + if (++copied_chunks < t_chunks) + goto restart; + + return; + +memcpy_exit: + memcpy((void *)((uintptr_t)dsa_dest + dsa_copied_len), + (void *)((uintptr_t)dsa_src + dsa_copied_len), + dsa_n - dsa_copied_len); + memcpy((void *)((uintptr_t)dest + cpu_copied_len), + (void *)((uintptr_t)src + cpu_copied_len), + cpu_n - cpu_copied_len); return; } @@ -378,6 +421,58 @@ static int psm3_dsa_mode(const char *wq_filename) return -1; } +// determine the max transfer size for a DSA WQ by reading the max_transfer_size +// file under DSA_DEVICES/wqX.Y/ +// where wqX.Y is last part of 
supplied wq_filename +// return the max_transfer_size. +// on error returns 0 and an _HFI_ERROR message has been output +static int psm3_dsa_max_xfer_size(const char *wq_filename) +{ + char wq_size_filename[PATH_MAX]; + const char *p; + char buf[20]; + int fd; + int res; + + p = strrchr(wq_filename, '/'); + if (p) + p++; // skip '/' + else + p = wq_filename; + res = snprintf(wq_size_filename, sizeof(wq_size_filename), + "%s/%s/max_transfer_size", DSA_DEVICES, p); + if (res < 0 || res > sizeof(wq_size_filename) - 1) { + _HFI_ERROR("Unable to determine DSA WQ max xfer size for %s\n", + wq_filename); + return 0; + } + fd = open(wq_size_filename, O_RDONLY); + if (fd < 0) { + _HFI_ERROR("Failed to open DSA WQ max xfer size: %s: %s\n", + wq_size_filename, strerror(errno)); + return 0; + } + res = read(fd, buf, sizeof(buf)-1); + if (res < 0) { + _HFI_ERROR("Failed to read DSA WQ max xfer size: %s: %s\n", + wq_size_filename, strerror(errno)); + close(fd); + return 0; + } + close(fd); + if (! res) { + _HFI_ERROR("Failed to read DSA WQ max xfer size: %s: empty file\n", + wq_size_filename); + return 0; + } + if (buf[res-1] == '\n') + buf[res-1] = '\0'; + else + buf[res] = '\0'; + _HFI_DBG("DSA WQ %s max xfer size %s\n", wq_filename, buf); + return (uint32_t)strtoul(buf, NULL, 0); +} + /* initialize DSA - call once per process */ /* Some invalid inputs and DSA initialization errors are treated as fatal errors * since if DSA gets initialized on some nodes, but not on others, the @@ -410,11 +505,11 @@ int psm3_dsa_init(void) if (! psm3_getenv("PSM3_DSA_WQS", "List of DSA WQ devices to use, one list per local process or per\n" "CPU socket:\n" - " wq0,wq2:wq4,wq6:,...\n" + " wq0,wq2;wq4,wq6;,...\n" "Each wq should be a shared workqueue DSA device or a unique\n" "dedicated workqueue DSA device,\n" " such as /dev/dsa/wq0.0\n" - "Colon separates the lists for different processes\n" + "Semicolon separates the lists for different processes\n" " default is '' in which case DSA is not used\n", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, (union psmi_envvar_val)"", &env_dsa_wq)) { @@ -430,10 +525,13 @@ int psm3_dsa_init(void) } s = temp; psmi_assert(*s); + // both : and ; are treated the same below, : is deprecated do { int mode; + uint32_t xfer_size; + new_proc = 0; - if (! *s) // trailing ',' or ':' on 2nd or later loop + if (! *s) // trailing ',' or ':' or ';' on 2nd or later loop break; if (proc >= DSA_MAX_PROC) { _HFI_ERROR("PSM3_DSA_WQS exceeds %u per node process limit: '%s'", @@ -441,9 +539,9 @@ int psm3_dsa_init(void) psmi_free(temp); goto fail; } - delim = strpbrk(s, ",:"); + delim = strpbrk(s, ",:;"); if (delim) { - new_proc = (*delim == ':'); + new_proc = (*delim == ':' || *delim == ';'); *delim = '\0'; } if (dsa_num_wqs[proc] > DSA_MAX_QUEUES) { @@ -460,6 +558,9 @@ int psm3_dsa_init(void) } if (mode) all_are_shared = 0; + xfer_size = psm3_dsa_max_xfer_size(s); + dsa_wq_max_xfer_size[proc][dsa_num_wqs[proc]] = xfer_size > 0 ? 
+ xfer_size : DSA_MAX_XFER_SIZE_DEFAULT; dsa_wq_mode[proc][dsa_num_wqs[proc]] = mode; dsa_wq_filename[proc][dsa_num_wqs[proc]] = psmi_strdup(PSMI_EP_NONE, s); dsa_num_wqs[proc]++; @@ -468,7 +569,7 @@ int psm3_dsa_init(void) s = delim+1; } while (delim); psmi_free(temp); - // new_proc means trailing :, ignore it + // new_proc means trailing : or ;, ignore it // otherwise, last we processed counts if (!new_proc && proc < DSA_MAX_PROC && dsa_num_wqs[proc]) proc++; @@ -580,6 +681,7 @@ int psm3_dsa_init(void) // key off having rw access to the DSA WQ to decide if DSA is available dsa_wqs[i].wq_filename = dsa_wq_filename[proc][i]; dsa_wqs[i].dedicated = dsa_wq_mode[proc][i]; + dsa_wqs[i].max_xfer_size = dsa_wq_max_xfer_size[proc][i]; if (! realpath(dsa_wqs[i].wq_filename, dsa_filename)) { _HFI_ERROR("Failed to resolve DSA WQ path %s\n", dsa_wqs[i].wq_filename); goto fail; @@ -658,6 +760,7 @@ static inline void psm3_dsa_pick_wq(void) found: dsa_wq_reg = dsa_wqs[sel].wq_reg; dsa_wq_dedicated = dsa_wqs[sel].dedicated; + dsa_wq_xfer_limit = dsa_wqs[sel].max_xfer_size; } diff --git a/prov/psm3/psm3/utils/utils_env.c b/prov/psm3/psm3/utils/utils_env.c index f8c2dbd8b96..55efb77bc2b 100644 --- a/prov/psm3/psm3/utils/utils_env.c +++ b/prov/psm3/psm3/utils/utils_env.c @@ -90,7 +90,8 @@ int psm3_env_initialize(void) // get verbosity level setting for env logging // if invalid syntax, will output warning when parse during psm3_getenv const char *verb_env = getenv("PSM3_VERBOSE_ENV"); - (void)psm3_parse_val_pattern(verb_env, 0, &verb_env_val); + (void)psm3_parse_val_pattern_uint(verb_env, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); if (verb_env_val) env_log_level = 0; // log at INFO level @@ -119,7 +120,7 @@ int psm3_env_initialize(void) c = fgetc(f); if (c != EOF) { // line too long, fgetc until read newline - _HFI_INFO("%s: Ignoring line too long: '%s' ...\n", + _HFI_ENV_ERROR("%s: Ignoring line too long: '%s' ...\n", PSM3_ENV_FILENAME, buf); while (c != (int)(unsigned char)'\n' && (c = fgetc(f)) != EOF) ; @@ -150,7 +151,7 @@ int psm3_env_initialize(void) j = strspn(&buf[i], "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"); if (buf[i+j] != '=') { // malformed assignment,skip - _HFI_INFO("%s: Ignoring malformed assignment: '%s'\n", + _HFI_ENV_ERROR("%s: Ignoring malformed assignment: '%s'\n", PSM3_ENV_FILENAME, buf); continue; } @@ -180,7 +181,8 @@ int psm3_env_initialize(void) // allow /etc/psm3.conf to set PSM3_VERBOSE_ENV when defaulted // if invalid syntax, will output warning when parse during psm3_getenv if (! 
verb_env && 0 == strcmp("PSM3_VERBOSE_ENV", var.name)) { - (void)psm3_parse_val_pattern(var.value, 0, &verb_env_val); + (void)psm3_parse_val_pattern_uint(var.value, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); if (verb_env_val) env_log_level = 0; // log at INFO level } @@ -189,7 +191,7 @@ int psm3_env_initialize(void) // this must be parsed in a constructor prior to this function, // so we ignore it here if (0 == strcmp(var.name, "PSM3_DISABLE_MMAP_MALLOC")) { - _HFI_INFO("WARNING: %s Ignoring %s\n", PSM3_ENV_FILENAME,var.name); + _HFI_ENV_ERROR("WARNING: %s Ignoring %s\n", PSM3_ENV_FILENAME,var.name); psmi_free(var.name); psmi_free(var.value); continue; @@ -252,7 +254,9 @@ void psm3_env_print_val(FILE *f, const char *name, int type, switch (type) { case PSMI_ENVVAR_TYPE_STR: case PSMI_ENVVAR_TYPE_STR_TUPLES: - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: fprintf(f, "%s=%s\n", name, val.e_str); break; case PSMI_ENVVAR_TYPE_INT: @@ -286,7 +290,9 @@ int psm3_env_snprint_val(char *buf, size_t size, const char *name, int type, switch (type) { case PSMI_ENVVAR_TYPE_STR: case PSMI_ENVVAR_TYPE_STR_TUPLES: - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: return snprintf(buf, size, "%s=%s\n", name, val.e_str); break; case PSMI_ENVVAR_TYPE_INT: @@ -332,20 +338,18 @@ char *psm3_env_get(const char *name) return NULL; } -/* _CONSUMED_ALL() is a macro which indicates if strtol() consumed all - of the input passed to it. */ -#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) - // don't document that 3 and 3: and 3:pattern can output hidden params const char *PSM3_VERBOSE_ENV_HELP = - "Enable verbose output of environment variables. 
" - "(0 - none, 1 - changed w/o help, 2 - user help, " - "#: - limit output to rank 0, #:pattern - limit output " - "to processes whose label matches " + "Enable verbose output of environment variables.\n" + " 0 - none\n" + " 1 - only output changed w/o help\n" + " 2 - output all with help,\n" + " #: - limit output to rank 0\n" + " #:pattern - limit output to processes whose label matches\n " #ifdef FNM_EXTMATCH - "extended " + "extended " #endif - "glob pattern"; + "glob pattern"; /* If PSM3_VERBOSE_ENV is set in the environment, we determine * what its verbose level is and print the environment at "INFO" @@ -362,25 +366,24 @@ static int psm3_getenv_is_verblevel(int printlevel) unsigned verb_env_val; if (env) psm3_stats_print_env("PSM3_VERBOSE_ENV", env); - int ret = psm3_parse_val_pattern(env, 0, &verb_env_val); + int ret = psm3_parse_val_pattern_uint(env, 0, &verb_env_val, + PSMI_ENVVAR_FLAG_NOABBREV, 0, 3); psmi_getenv_verblevel = verb_env_val; - if (psmi_getenv_verblevel < 0 || psmi_getenv_verblevel > 3) - psmi_getenv_verblevel = 2; if (psmi_getenv_verblevel > 0) nlevel = 0; /* output at INFO level */ if (ret == -2) - _HFI_ENVDBG(0, "Invalid value for %s ('%s') %-40s Using: %u\n", - "PSM3_VERBOSE_ENV", env, PSM3_VERBOSE_ENV_HELP, verb_env_val); + _HFI_ENVDBG(0, "Invalid value for %s ('%s') Using: %u\nHelp: %s\n", + "PSM3_VERBOSE_ENV", env, verb_env_val, PSM3_VERBOSE_ENV_HELP); else if (psmi_getenv_verblevel == 1) _HFI_ENVDBG(0, " %-25s => '%s' (default was '%s')\n", "PSM3_VERBOSE_ENV", env?env:"", "0"); else if (env && *env) - _HFI_ENVDBG(nlevel, " %-25s %-40s => '%s' (default was '%s')\n", - "PSM3_VERBOSE_ENV", PSM3_VERBOSE_ENV_HELP, env, "0"); + _HFI_ENVDBG(nlevel, " %-25s => '%s' (default was '%s')\nHelp: %s\n", + "PSM3_VERBOSE_ENV", env, "0", PSM3_VERBOSE_ENV_HELP); else /* defaulted */ _HFI_ENVDBG(nlevel, - " %-25s %-40s => '%s'\n", - "PSM3_VERBOSE_ENV", PSM3_VERBOSE_ENV_HELP, "0"); + " %-25s => '%s'\nHelp: %s\n", + "PSM3_VERBOSE_ENV", "0", PSM3_VERBOSE_ENV_HELP); } // printlevel is visibility of env (USER=1 or HIDDEN=2) // so at verbosity 1 and 2 output USER @@ -419,314 +422,647 @@ static int psm3_count_tuples(const char *str) return ret; } -int -MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, - int type, union psmi_envvar_val defval, +/* _CONSUMED_ALL indicates if strtol() (and friends) consumed all of the input + * passed to it. CHAR_PTR is the output char pointer from strtol + */ +#define _CONSUMED_ALL(CHAR_PTR) (((CHAR_PTR) != NULL) && (*(CHAR_PTR) == 0)) + +/* convert a string to a signed number with basic bounds checking + * returns 0 - valid value and *val updated + * -1 -> empty string, *val unchanged + * -2 -> parse or range error, *val unchanged + */ +static int convert_str_signed(const char *str, long long *val, + long long min, long long max) +{ + char *ep; + long long temp; + + psmi_assert(val != NULL); + if (! str || ! *str) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + errno = 0; + temp = strtoll(str, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + errno = 0; + temp = strtoll(str, &ep, 16); + if (! 
_CONSUMED_ALL(ep)) + return -2; + } + if (errno || temp < min || temp > max) + return -2; + + *val = temp; + return 0; +} + +/* convert a string to an unsigned number with basic bounds checking + * returns 0 - valid value and *val updated + * -1 -> empty string, *val unchanged + * -2 -> parse or range error, *val unchanged + */ +static int convert_str_unsigned(const char *str, unsigned long long *val, + unsigned long long min, unsigned long long max) +{ + char *ep; + unsigned long long temp; + + psmi_assert(val != NULL); + if (! str || ! *str) + return -1; + /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ + errno = 0; + temp = strtoull(str, &ep, 10); + if (! _CONSUMED_ALL(ep)) { + errno = 0; + temp = strtoull(str, &ep, 16); + if (! _CONSUMED_ALL(ep)) + return -2; + } + if (errno || temp < min || temp > max) + return -2; + + *val = temp; + return 0; +} +#undef _CONSUMED_ALL + +// returns: +// 0 - valid value input +// 1 - variable not set, used default +// -1 - invalid value for variable or invalid syntax, used default +int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, + const char *help, unsigned level_flags, + int type, union psmi_envvar_val defval, union psmi_envvar_val min, + union psmi_envvar_val max, psm3_getenv_check_t check, void *ptr, union psmi_envvar_val *newval) { - int used_default = 0; + int ret = 0; union psmi_envvar_val tval; char *env = psm3_env_get(name); + unsigned level = level_flags & PSMI_ENVVAR_LEVEL_MASK; + char rangestr[80] = ""; // for help + char errstr[512] = ""; // additional info for invalid values + char statserrstr[700] = ""; // add'l info for stats file when invalid input + +#define FORMAT_RANGESTR(FIELD, fmt) \ + do { \ + if ((level_flags & PSMI_ENVVAR_FLAG_NOMIN)) { \ + if ((level_flags & PSMI_ENVVAR_FLAG_NOMAX)) \ + rangestr[0] = '\0'; \ + else \ + snprintf(rangestr, sizeof(rangestr)," Max allowed " fmt "%s",\ + max.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'max')");\ + } else if ((level_flags & PSMI_ENVVAR_FLAG_NOMAX)) { \ + snprintf(rangestr, sizeof(rangestr)," Min allowed " fmt "%s", \ + min.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'min')");\ + } else { \ + snprintf(rangestr, sizeof(rangestr)," Valid range " fmt "%s" \ + " to " fmt "%s", \ + min.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'min')",\ + max.FIELD, \ + (level_flags & PSMI_ENVVAR_FLAG_NOABBREV)?"":" (or 'max')");\ + } \ + } while (0) + +#define _GETENV_CHECK(tval) \ + do { \ + if (check) { \ + if ((*check)(type, tval, ptr, sizeof(errstr), errstr)) { \ + tval = defval; \ + ret = -1; \ + /* errstr now has additional error information */ \ + } \ + } \ + } while (0); /* for verblevel 1 we only output non-default values with no help * for verblevel>1 we promote to info (verblevel=2 promotes USER, * verblevel=3 promotes HIDDEN) and show help. * for verblevel< 1 we don't promote anything and show help */ -#define _GETENV_PRINT(env, used_default, fmt, val, defval) \ +#define _GETENV_PRINT(env, ret, fmt, val, defval) \ do { \ (void)psm3_getenv_is_verblevel(level); \ - if (env && *env && used_default) \ - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: " fmt "\n", \ - name, env, descr, val); \ - else if (used_default && psmi_getenv_verblevel != 1) \ - GETENV_PRINTF(level, "%s%-25s %-40s => " fmt \ - "\n", level > 1 ? "*" : " ", name, \ - descr, val); \ - else if (! 
used_default && psmi_getenv_verblevel == 1) \ + if (ret < 0 && (level_flags & PSMI_ENVVAR_FLAG_FATAL)) { \ + _HFI_ENV_ERROR("Invalid value for %s ('%s')%s\nHelp: %s%s\n%s%s", \ + name, env, errstr, descr, rangestr,\ + help?help:"", help?"\n":""); \ + snprintf(statserrstr, sizeof(statserrstr), \ + "Invalid value ('%s')%s", env, errstr); \ + } else if (ret < 0) { \ + _HFI_ENV_ERROR("Invalid value for %s ('%s')%s Using: " fmt "\nHelp: %s%s\n%s%s", \ + name, env, errstr, val, descr, rangestr,\ + help?help:"", help?"\n":""); \ + snprintf(statserrstr, sizeof(statserrstr), \ + "Invalid value ('%s')%s Using: " fmt, \ + env, errstr, val); \ + } else if (ret > 0 && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(level, "%s%-25s => " fmt \ + "\nHelp: %s%s\n%s%s", level > 1 ? "*" : " ", name, \ + val, descr, rangestr, \ + help?help:"", help?"\n":"");\ + else if (ret == 0 && psmi_getenv_verblevel == 1) \ GETENV_PRINTF(1, "%s%-25s => " \ fmt " (default was " fmt ")\n", \ level > 1 ? "*" : " ", name, \ val, defval); \ - else if (! used_default && psmi_getenv_verblevel != 1) \ - GETENV_PRINTF(1, "%s%-25s %-40s => " \ - fmt " (default was " fmt ")\n", \ - level > 1 ? "*" : " ", name, descr, \ - val, defval); \ + else if (ret == 0 && psmi_getenv_verblevel != 1) \ + GETENV_PRINTF(1, "%s%-25s => " \ + fmt " (default was " fmt ")\nHelp: %s%s\n%s%s", \ + level > 1 ? "*" : " ", name, \ + val, defval, descr, rangestr, \ + help?help:"", help?"\n":""); \ } while (0) -#define _CONVERT_TO_NUM(DEST,TYPE,STRTOL) \ - do { \ - char *ep; \ - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ \ - DEST = (TYPE)STRTOL(env, &ep, 10); \ - if (! _CONSUMED_ALL(ep)) { \ - DEST = (TYPE)STRTOL(env, &ep, 16); \ - if (! _CONSUMED_ALL(ep)) { \ - used_default = 1; \ - tval = defval; \ - } \ - } \ +#define _CONVERT_TO_NUM(FIELD,TYPE,SIGNED,MIN,MAX) \ + do { \ + if (!(level_flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV))\ + && (!strcasecmp(env, "min") || !strcasecmp(env, "minimum")))\ + tval.FIELD = min.FIELD; \ + else if (!(level_flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV))\ + && (!strcasecmp(env, "max") || !strcasecmp(env, "maximum")))\ + tval.FIELD = max.FIELD; \ + else { \ + SIGNED long long temp; \ + if (convert_str_##SIGNED(env, &temp, MIN, MAX)) { \ + ret = -1; /* caller checked for empty, so must be invalid */ \ + tval = defval; \ + } else if ((temp < min.FIELD \ + && !(level_flags & PSMI_ENVVAR_FLAG_NOMIN)) \ + || (temp > max.FIELD \ + && !(level_flags & PSMI_ENVVAR_FLAG_NOMAX))) { \ + ret = -1; \ + tval = defval; \ + } else { \ + tval.FIELD = (TYPE)temp; \ + } \ + } \ } while (0) switch (type) { case PSMI_ENVVAR_TYPE_YESNO: - tval.e_int = psm3_parse_str_yesno(env); - if (tval.e_int < 0) { + if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; + } else { + switch (psm3_parse_str_yesno(env, &tval.e_int)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above + tval = defval; + ret = 1; + break; + case -2: // bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid input + _GETENV_CHECK(tval); + break; + } } - _GETENV_PRINT(env, used_default, "%s", tval.e_int ? "YES" : "NO", + _GETENV_PRINT(env, ret, "%s", tval.e_int ? "YES" : "NO", defval.e_int ?
"YES" : "NO"); break; case PSMI_ENVVAR_TYPE_STR: if (!env || *env == '\0') { tval = defval; - used_default = 1; - } else + ret = 1; + } else { tval.e_str = env; - _GETENV_PRINT(env, used_default, "'%s'", tval.e_str, defval.e_str); + _GETENV_CHECK(tval); + } + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_INT: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_int,int,strtol); + _CONVERT_TO_NUM(e_int,int,signed,INT_MIN,INT_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%d", tval.e_int, defval.e_int); + FORMAT_RANGESTR(e_int, "%d"); + _GETENV_PRINT(env, ret, "%d", tval.e_int, defval.e_int); break; case PSMI_ENVVAR_TYPE_UINT: case PSMI_ENVVAR_TYPE_UINT_FLAGS: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_int,unsigned int,strtoul); + _CONVERT_TO_NUM(e_uint,unsigned int,unsigned,0,UINT_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) - _GETENV_PRINT(env, used_default, "0x%x", tval.e_uint, + if (type == PSMI_ENVVAR_TYPE_UINT_FLAGS) { + FORMAT_RANGESTR(e_uint, "0x%x"); + _GETENV_PRINT(env, ret, "0x%x", tval.e_uint, defval.e_uint); - else - _GETENV_PRINT(env, used_default, "%u", tval.e_uint, + } else { + FORMAT_RANGESTR(e_uint, "%u"); + _GETENV_PRINT(env, ret, "%u", tval.e_uint, defval.e_uint); + } break; case PSMI_ENVVAR_TYPE_LONG: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_long,long,strtol); + _CONVERT_TO_NUM(e_long,long,signed,LONG_MIN,LONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%ld", tval.e_long, defval.e_long); + FORMAT_RANGESTR(e_long, "%ld"); + _GETENV_PRINT(env, ret, "%ld", tval.e_long, defval.e_long); break; case PSMI_ENVVAR_TYPE_ULONG_ULONG: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_ulonglong,unsigned long long,strtoull); + _CONVERT_TO_NUM(e_ulonglong,unsigned long long,unsigned,0,ULLONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - _GETENV_PRINT(env, used_default, "%llu", + FORMAT_RANGESTR(e_ulonglong, "%llu"); + _GETENV_PRINT(env, ret, "%llu", tval.e_ulonglong, defval.e_ulonglong); break; - case PSMI_ENVVAR_TYPE_STR_VAL_PAT: - { + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_val_pattern_int and check value returned + // caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { + int trash; + // we parse just for syntax check, caller must parse again + switch (psm3_parse_val_pattern_int(env, 0, &trash, level_flags, + (level_flags & PSMI_ENVVAR_FLAG_NOMIN)?INT_MIN:min.e_int, + (level_flags & PSMI_ENVVAR_FLAG_NOMAX)?INT_MAX:max.e_int)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above + tval = defval; + ret = 1; + break; + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid string + tval.e_str = env; + break; + } + } + FORMAT_RANGESTR(e_int, "%d"); + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); + break; + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_val_pattern_uint and check value returned + // 
caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { unsigned trash; // we parse just for syntax check, caller must parse again - if (psm3_parse_val_pattern(env, 0, &trash) < 0) { + switch (psm3_parse_val_pattern_uint(env, 0, &trash, level_flags, + (level_flags & PSMI_ENVVAR_FLAG_NOMIN)?0:min.e_uint, + (level_flags & PSMI_ENVVAR_FLAG_NOMAX)?UINT_MAX:max.e_uint)) { + case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above tval = defval; - used_default = 1; - } else + ret = 1; + break; + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; + break; + default: // valid string tval.e_str = env; - _GETENV_PRINT(env, used_default, "'%s'", tval.e_str, defval.e_str); + break; + } } + if (type == PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS) + FORMAT_RANGESTR(e_uint, "0x%x"); + else + FORMAT_RANGESTR(e_uint, "%u"); + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_STR_TUPLES: - { + if (!env || *env == '\0') { + tval = defval; + ret = 1; + } else if (check) { + // check will parse_str_tuples and check their values + // caller must parse again + tval.e_str = env; + _GETENV_CHECK(tval); + } else { // we parse just for syntax check, caller must parse again int vals[3]; int ntup = psm3_count_tuples(defval.e_str); psmi_assert_always(ntup > 0 && ntup <= 3); - // parse default into vals[] so can show what caller get - (void)psm3_parse_str_tuples(defval.e_str, ntup, vals); switch (psm3_parse_str_tuples(env, ntup, vals)) { case -1: // empty, use default + psmi_assert(0); // shouldn't happen, checked for empty above tval = defval; - used_default = 1; - _GETENV_PRINT(env, 1, "'%s'", tval.e_str, defval.e_str); + ret = 1; break; - case -2: // one or more fields with bad syntax, show what we have - tval.e_str = env; - // only 3 choices, so just bruteforce it - switch (ntup) { - case 1: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d\n", - name, env, descr, vals[0]); - break; - case 2: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d:%d\n", - name, env, descr, vals[0], vals[1]); - break; - case 3: - _HFI_INFO("Invalid value for %s ('%s') %-40s Using: %d:%d:%d\n", - name, env, descr, vals[0], vals[1], vals[2]); - break; - } + case -2: // one or more fields with bad syntax, use default + tval = defval; + ret = -1; break; default: // valid string tval.e_str = env; - _GETENV_PRINT(env, 0, "'%s'", tval.e_str, defval.e_str); break; } } + _GETENV_PRINT(env, ret, "'%s'", tval.e_str, defval.e_str); break; case PSMI_ENVVAR_TYPE_ULONG: case PSMI_ENVVAR_TYPE_ULONG_FLAGS: default: if (!env || *env == '\0') { tval = defval; - used_default = 1; + ret = 1; } else { - _CONVERT_TO_NUM(tval.e_ulong,unsigned long,strtoul); + _CONVERT_TO_NUM(e_ulong,unsigned long,unsigned,0,ULONG_MAX); + if (ret == 0) + _GETENV_CHECK(tval); } - if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) - _GETENV_PRINT(env, used_default, "0x%lx", tval.e_ulong, + if (type == PSMI_ENVVAR_TYPE_ULONG_FLAGS) { + FORMAT_RANGESTR(e_ulong, "0x%lx"); + _GETENV_PRINT(env, ret, "0x%lx", tval.e_ulong, defval.e_ulong); - else - _GETENV_PRINT(env, used_default, "%lu", tval.e_ulong, + } else { + FORMAT_RANGESTR(e_ulong, "%lu"); + _GETENV_PRINT(env, ret, "%lu", tval.e_ulong, defval.e_ulong); + } break; } +#undef FORMAT_RANGESTR +#undef _GETENV_CHECK #undef _GETENV_PRINT +#undef _CONVERT_TO_NUM *newval = tval; - if (! 
used_default) + switch (ret) { + case 0: // good input psm3_stats_print_env(name, env); + break; + case -1: // bad input, used default + // _GETENV_PRINT has set statserrstr + psm3_stats_print_env(name, statserrstr); + if (level_flags & PSMI_ENVVAR_FLAG_FATAL) { + // treat syntax or invalid input as fatal + psm3_handle_error(PSMI_EP_NORETURN, PSM2_PARAM_ERR, + "Invalid value for %s: '%s', can't proceed\n", + name, env); + } + break; + case 1: // no input, used default + // nothing special here + // as needed, psm3_stats_initialize will log the stats controls + break; + } + return ret; +} +MOCK_DEF_EPILOGUE(psm3_getenv_range); + +int +MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, + int type, union psmi_envvar_val defval, + union psmi_envvar_val *newval) +{ + switch (type) { + case PSMI_ENVVAR_TYPE_YESNO: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)1, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_STR: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, NULL, NULL, newval); + break; - return used_default; + case PSMI_ENVVAR_TYPE_INT: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)INT_MIN, (union psmi_envvar_val)INT_MAX, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_UINT: + case PSMI_ENVVAR_TYPE_UINT_FLAGS: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, newval); + break; + + case PSMI_ENVVAR_TYPE_LONG: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)LONG_MIN, (union psmi_envvar_val)LONG_MAX, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_ULONG_ULONG: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)ULLONG_MAX, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_INT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT: + case PSMI_ENVVAR_TYPE_STR_VAL_PAT_UINT_FLAGS: + case PSMI_ENVVAR_TYPE_STR_TUPLES: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, NULL, NULL, newval); + break; + case PSMI_ENVVAR_TYPE_ULONG: + case PSMI_ENVVAR_TYPE_ULONG_FLAGS: + default: + return psm3_getenv_range(name, descr, NULL, (unsigned)level|PSMI_ENVVAR_FLAG_NOMIN_NOMAX, type, defval, (union psmi_envvar_val)0, (union psmi_envvar_val)ULONG_MAX, NULL, NULL, newval); + break; + } } MOCK_DEF_EPILOGUE(psm3_getenv); /* * Parsing int parameters * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_int(const char *string, int *val) +int psm3_parse_str_int(const char *string, int *val, int min, int max) { - char *ep; - long ret; + int ret; + long long temp; - psmi_assert(val != NULL); - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtol(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtol(string, &ep, 16); - if (!
_CONSUMED_ALL(ep)) - return -2; - } - *val = ret; + if ((ret = convert_str_signed(string, &temp, min, max))) + return ret; + *val = (int)temp; return 0; } /* * Parsing uint parameters * 0 -> ok, *val updated - * -1 -> empty string - * -2 -> parse error + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_uint(const char *string, unsigned int *val) +int psm3_parse_str_uint(const char *string, unsigned int *val, + unsigned int min, unsigned int max) { - char *ep; - unsigned long ret; + int ret; + unsigned long long temp; - psmi_assert(val != NULL); - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtoul(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtoul(string, &ep, 16); - if (! _CONSUMED_ALL(ep)) - return -2; - } - *val = ret; + if ((ret = convert_str_unsigned(string, &temp, min, max))) + return ret; + *val = (unsigned int)temp; return 0; } /* * Parsing long parameters - * -1 -> empty string - * -2 -> parse error + * Returns: + * 0 -> ok, *val updated + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -long psm3_parse_str_long(const char *string) +int psm3_parse_str_long(const char *string, long *val, long min, long max) { - char *ep; - long ret; + int ret; + long long temp; - if (! string || ! *string) - return -1; - /* Avoid base 8 (octal) on purpose, so don't pass in 0 for radix */ - ret = strtol(string, &ep, 10); - if (! _CONSUMED_ALL(ep)) { - ret = strtol(string, &ep, 16); - if (! _CONSUMED_ALL(ep)) - return -2; - } - return ret; + if ((ret = convert_str_signed(string, &temp, min, max))) + return ret; + *val = (long)temp; + return 0; } /* * Parsing yesno parameters * allows: yes/no, true/false, on/off, 1/0 - * -1 -> empty string - * -2 -> parse error + * Returns: + * 0 -> ok, *val updated + * *val = 0 - no selected + * *val = 1 - yes selected + * -1 -> empty string, *val not updated + * -2 -> parse error, *val not updated */ -int psm3_parse_str_yesno(const char *string) +int psm3_parse_str_yesno(const char *string, int *val) { + psmi_assert(val != NULL); if (! string || ! 
*string) return -1; else if (string[0] == 'Y' || string[0] == 'y' || string[0] == 'T' || string[0] == 't' || ((string[0] == 'O' || string[0] == 'o') - && (string[1] == 'n' || string[1] == 'N'))) - return 1; - else if (string[0] == 'N' || string[0] == 'n' + && (string[1] == 'n' || string[1] == 'N'))) { + *val = 1; + } else if (string[0] == 'N' || string[0] == 'n' || string[0] == 'F' || string[0] == 'f' || ((string[0] == 'O' || string[0] == 'o') - && (string[1] == 'f' || string[1] == 'F'))) - return 0; - else { - char *ep; - unsigned long temp; - temp = strtoul(string, &ep, 0); - if (!_CONSUMED_ALL(ep)) { - return -2; - } else if (temp != 0) { - return 1; + && (string[1] == 'f' || string[1] == 'F'))) { + *val = 0; + } else { + unsigned long long temp; + if (convert_str_unsigned(string, &temp, 0, UINT_MAX)) + return -2; // already checked for empty, so must be invalid value + *val = (temp != 0); + } + return 0; +} + +/* parse int env of the form 'val' or 'val:' or 'val:pattern' + * for PSM3_PRINT_STATS + * Returns: + * 0 - parsed and matches current process, *val set to parsed val + * 0 - parsed and doesn't match current process, *val set to def + * -1 - nothing provided, *val set to def + * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control whether 'min', 'minimum', 'max' or 'maximum' + * are allowed as input, and indicate whether min and/or max were supplied. + */ +int psm3_parse_val_pattern_int(const char *env, int def, int *val, + unsigned flags, int min, int max) +{ + int ret = 0; + long long temp; + + psmi_assert(val != NULL); + if (!env || ! *env) { + *val = def; + ret = -1; + } else { + char *e = psmi_strdup(NULL, env); + char *p; + + if (flags & PSMI_ENVVAR_FLAG_NOMIN) + min = INT_MIN; + if (flags & PSMI_ENVVAR_FLAG_NOMAX) + max = INT_MAX; + + psmi_assert_always(e != NULL); + if (e == NULL) { // for klocwork + *val = def; + goto done; + } + p = strchr(e, ':'); + if (p) + *p = '\0'; + if (!(flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "min") || !strcasecmp(e, "minimum"))) + *val = min; + else if (!(flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "max") || !strcasecmp(e, "maximum"))) + *val = max; + else if (convert_str_signed(e, &temp, min, max)) { + *val = def; + ret = -2; } else { - return 0; + *val = (int)temp; } + if (ret == 0 && p) { + if (! *(p+1)) { // val: -> val:*:rank0 + if (psm3_get_myrank() != 0) + *val = def; +#ifdef FNM_EXTMATCH + } else if (0 != fnmatch(p+1, psm3_get_mylabel(), FNM_EXTMATCH )) { +#else + } else if (0 != fnmatch(p+1, psm3_get_mylabel(), 0 )) { +#endif + *val = def; + } + } + psmi_free(e); } +done: + return ret; } -/* parse env of the form 'val' or 'val:' or 'val:pattern' +/* parse unsigned env of the form 'val' or 'val:' or 'val:pattern' * for PSM3_VERBOSE_ENV, PSM3_TRACEMASK, PSM3_FI and PSM3_IDENTIFY + * Returns: * 0 - parsed and matches current process, *val set to parsed val * 0 - parsed and doesn't match current process, *val set to def * -1 - nothing provided, *val set to def * -2 - syntax error, *val set to def + * flags PSMI_ENVVAR_FLAG_NOMIN, PSMI_ENVVAR_FLAG_NOMAX and + * PSMI_ENVVAR_FLAG_NOABBREV control whether 'min', 'minimum', 'max' or 'maximum' + * are allowed as input, and indicate whether min and/or max were supplied.
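+ * Example (illustrative, not from this patch): as called above for
+ * PSM3_VERBOSE_ENV with def=0, flags=PSMI_ENVVAR_FLAG_NOABBREV, min=0, max=3:
+ *   '2'        -> *val=2 in every process
+ *   '2:'       -> *val=2 on rank 0 only, def elsewhere
+ *   '2:host1*' -> *val=2 only in processes whose label matches the glob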
*/ -int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) +int psm3_parse_val_pattern_uint(const char *env, unsigned def, unsigned *val, + unsigned flags, unsigned min, unsigned max) { int ret = 0; + unsigned long long temp; psmi_assert(val != NULL); if (!env || ! *env) { @@ -734,9 +1070,13 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) *val = def; ret = -1; } else { char *e = psmi_strdup(NULL, env); - char *ep; char *p; + if (flags & PSMI_ENVVAR_FLAG_NOMIN) + min = 0; + if (flags & PSMI_ENVVAR_FLAG_NOMAX) + max = UINT_MAX; + psmi_assert_always(e != NULL); if (e == NULL) { // for klocwork *val = def; @@ -745,11 +1085,19 @@ p = strchr(e, ':'); if (p) *p = '\0'; - *val = (int)strtoul(e, &ep, 0); - if (! _CONSUMED_ALL(ep)) { + if (!(flags & (PSMI_ENVVAR_FLAG_NOMIN|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "min") || !strcasecmp(e, "minimum"))) + *val = min; + else if (!(flags & (PSMI_ENVVAR_FLAG_NOMAX|PSMI_ENVVAR_FLAG_NOABBREV)) + && (!strcasecmp(e, "max") || !strcasecmp(e, "maximum"))) + *val = max; + else if (convert_str_unsigned(e, &temp, min, max)) { *val = def; ret = -2; - } else if (p) { + } else { + *val = (unsigned)temp; + } + if (ret == 0 && p) { if (! *(p+1)) { // val: -> val:*:rank0 if (psm3_get_myrank() != 0) *val = def; @@ -777,11 +1125,11 @@ int psm3_parse_val_pattern(const char *env, unsigned def, unsigned *val) * It's valid for less than ntup values to be supplied, any unsupplied * fields are not updated in vals[] * Returns: - * 0 - parsed with no errors, vals[] updated - * -1 - empty or NULL string, vals[] unchanged - * -2 - syntax error in one of more of the parameters - * parameters with syntax errors are unchanged, others without - * syntax errors are updated in vals[] + * 0 - parsed with no errors, vals[] updated + * -1 - empty or NULL string, vals[] unchanged + * -2 - syntax error in one or more of the parameters + * parameters with syntax errors are unchanged, others without + * syntax errors are updated in vals[] */ int psm3_parse_str_tuples(const char *string, int ntup, int *vals) { @@ -804,17 +1152,14 @@ int psm3_parse_str_tuples(const char *string, int ntup, int *vals) while (*e && *e != ':') e++; if (e > b) { /* something to parse */ - char *ep; int len = e - b; - long int l; + long long temp; strncpy(buf, b, len); buf[len] = '\0'; - l = strtol(buf, &ep, 0); - if (ep != buf) { /* successful conversion */ - vals[tup_i] = (int)l; - } else { + if (convert_str_signed(buf, &temp, INT_MIN, INT_MAX)) ret = -2; - } + else + vals[tup_i] = (int)temp; } if (*e == ':') e++; /* skip delimiter */ diff --git a/prov/psm3/psm3/utils/utils_mallopt.c b/prov/psm3/psm3/utils/utils_mallopt.c index a821281cb00..830c1bbd22b 100644 --- a/prov/psm3/psm3/utils/utils_mallopt.c +++ b/prov/psm3/psm3/utils/utils_mallopt.c @@ -82,7 +82,8 @@ static void init_mallopt_disable_mmap(void) { // since this occurs before psm3_init, we can't use psm3_env_get // default to NO (0) - if (psm3_parse_str_yesno(getenv("PSM3_DISABLE_MMAP_MALLOC")) > 0) { + int disable = 0; + if (!psm3_parse_str_yesno(getenv("PSM3_DISABLE_MMAP_MALLOC"), &disable) && disable) { if (mallopt(M_MMAP_MAX, 0) && mallopt(M_TRIM_THRESHOLD, -1)) { psm3_malloc_no_mmap = 1; } diff --git a/prov/psm3/src/psmx3_attr.c b/prov/psm3/src/psmx3_attr.c index 7c5a61a8031..fc3663f6133 100644 --- a/prov/psm3/src/psmx3_attr.c +++ b/prov/psm3/src/psmx3_attr.c @@ -272,17 +272,87 @@ static uint64_t psmx3_check_fi_hmem_cap(void) { int gpu = 0;
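+	/* note: FI_HMEM is advertised only when a GPU runtime is enabled via
+	 * PSM3_CUDA / PSM3_ONEAPI_ZE or PSM3_GPUDIRECT and HMEM peer-to-peer
+	 * is not disabled; the envs are read directly here because psm3 has
+	 * not necessarily parsed them yet */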
unsigned int gpudirect = 0; #ifdef PSM_CUDA - (void)psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &gpu); + (void)psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &gpu, INT_MIN, INT_MAX); #else /* PSM_ONEAPI */ - (void)psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &gpu); + (void)psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &gpu, + INT_MIN, INT_MAX); #endif - (void)psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect); + (void)psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 0, UINT_MAX); if ((gpu || gpudirect) && !ofi_hmem_p2p_disabled()) return FI_HMEM; #endif /* PSM_CUDA || PSM_ONEAPI */ return 0; } +static uint64_t get_max_inject_size(void) { + unsigned int thresh_rv; + unsigned int temp; + int have_shm = 1; + int have_nic = 1; + int devid_enabled[PTL_MAX_INIT]; + + // check PSM3_DEVICES to determine if PSM3 shm enabled + if ((PSM2_OK == psm3_parse_devices(devid_enabled))) { + have_shm = psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH); + have_nic = psm3_device_is_enabled(devid_enabled, PTL_DEVID_IPS); + } + + // figure out the smallest rendezvous threshold (GPU vs CPU ips vs shm) + // If middleware above is not using PSM3 for shm but leaves it in + // PSM3_DEVICES, this could be more restrictive than necessary, + // but it's safe. Note that PSM3_DEVICES can't be set per EP open. + // Also not yet sure which HAL will be selected so must pick most + // conservative ips (NIC) config + thresh_rv = 65536; // default in odd case of PSM3_DEVICES=self + + if (have_nic) { + temp = PSM_MQ_NIC_RNDV_THRESH; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_NIC_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + + if (have_shm) { + temp = MQ_SHM_THRESH_RNDV; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + if (psmx3_prov_info.caps & FI_HMEM) { + if (have_nic) { + // GPU ips rendezvous threshold + // sockets HAL avoids rendezvous, so this may be overly restrictive + temp = GPU_THRESH_RNDV; + // PSM3_CUDA_THRESH_RNDV deprecated, use PSM3_GPU_THRESH_RNDV if set + psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, + 0, UINT_MAX); + psm3_parse_str_uint(psm3_env_get("PSM3_GPU_THRESH_RNDV"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + + if (have_shm) { + // GPU shm rendezvous threshold + temp = MQ_SHM_GPU_THRESH_RNDV; + psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, + 0, UINT_MAX); + if (thresh_rv > temp) + thresh_rv = temp; + } + } +#endif + + // messages <= thresh_rv guaranteed to use eager, so thresh_rv + // is the max allowed inject_size.
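+	// Worked example (hypothetical values): with PSM3_MQ_RNDV_NIC_THRESH=8192
+	// and a shm rendezvous threshold of 16000, thresh_rv becomes 8192 and
+	// psmx3_update_prov_info() below caps inject_size at 8192.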
+ return thresh_rv; +} + /* * Possible provider variations: * @@ -496,6 +566,8 @@ void psmx3_update_prov_info(struct fi_info *info, struct psmx3_ep_name *dest_addr) { struct fi_info *p; + unsigned int max_inject_size; + unsigned int inject_size; for (p = info; p; p = p->next) { psmx3_dup_addr(p->addr_format, src_addr, @@ -506,6 +578,15 @@ void psmx3_update_prov_info(struct fi_info *info, psmx3_expand_default_unit(info); + max_inject_size = get_max_inject_size(); + if (psmx3_env.inject_size > max_inject_size) + inject_size = max_inject_size; + else + inject_size = psmx3_env.inject_size; + PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, + "Using inject_size=%u based on FI_PSM3_INJECT_SIZE=%u with max %u\n", + inject_size, psmx3_env.inject_size, max_inject_size); + for (p = info; p; p = p->next) { int unit = ((struct psmx3_ep_name *)p->src_addr)->unit; int port = ((struct psmx3_ep_name *)p->src_addr)->port; @@ -539,7 +620,7 @@ void psmx3_update_prov_info(struct fi_info *info, int addr_index = psmx3_domain_info.addr_index[unit]; args[0].unit = unit_id; - args[1].port = port; + args[1].port = port == PSMX3_DEFAULT_PORT ? 1 : port; args[2].addr_index = addr_index; args[3].length = sizeof(unit_name); @@ -571,7 +652,7 @@ void psmx3_update_prov_info(struct fi_info *info, int addr_index = psmx3_domain_info.addr_index[unit]; args[0].unit = unit_id; - args[1].port = port; + args[1].port = port == PSMX3_DEFAULT_PORT ? 1 : port; args[2].addr_index = addr_index; args[3].length = sizeof(fabric_name); @@ -591,7 +672,7 @@ void psmx3_update_prov_info(struct fi_info *info, } } - p->tx_attr->inject_size = psmx3_env.inject_size; + p->tx_attr->inject_size = inject_size; } } diff --git a/prov/psm3/src/psmx3_cq.c b/prov/psm3/src/psmx3_cq.c index f1a10349dce..b072eb230df 100644 --- a/prov/psm3/src/psmx3_cq.c +++ b/prov/psm3/src/psmx3_cq.c @@ -622,8 +622,10 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry data = PSMX3_GET_CQDATA(PSMX3_STATUS_TAG(req)); if (PSMX3_HAS_IMM(PSMX3_GET_FLAGS(PSMX3_STATUS_TAG(req)))) flags |= FI_REMOTE_CQ_DATA; - if (multi_recv_req->offset + PSMX3_STATUS_RCVLEN(req) + - multi_recv_req->min_buf_size > multi_recv_req->len) + len_remaining = multi_recv_req->len - multi_recv_req->offset - + PSMX3_STATUS_RCVLEN(req); + if (len_remaining < multi_recv_req->min_buf_size || + len_remaining == 0) flags |= FI_MULTI_RECV; /* buffer used up */ err = psmx3_cq_rx_complete( status_data->poll_cq, ep->recv_cq, ep->av, @@ -638,7 +640,8 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry /* repost multi-recv buffer */ multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); len_remaining = multi_recv_req->len - multi_recv_req->offset; - if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining >= multi_recv_req->min_buf_size && + len_remaining > 0) { if (len_remaining > PSMX3_MAX_MSG_SIZE) len_remaining = PSMX3_MAX_MSG_SIZE; err = psm3_mq_irecv2(ep->rx->psm2_mq, @@ -786,7 +789,8 @@ psmx3_mq_status_copy(struct psm2_mq_req_user *req, void *status_array, int entry multi_recv_req = PSMX3_CTXT_USER(fi_context); multi_recv_req->offset += PSMX3_STATUS_RCVLEN(req); len_remaining = multi_recv_req->len - multi_recv_req->offset; - if (len_remaining >= multi_recv_req->min_buf_size) { + if (len_remaining >= multi_recv_req->min_buf_size && + len_remaining > 0) { if (len_remaining > PSMX3_MAX_MSG_SIZE) len_remaining = PSMX3_MAX_MSG_SIZE; err = psm3_mq_irecv2(ep->rx->psm2_mq, diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index 
c263446fd64..c20035a84de 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -320,11 +320,11 @@ static int psmx3_check_multi_ep_cap(void) { uint64_t caps = PSM2_MULTI_EP_CAP; char *s = NULL; + int val = 1; /* if parses as empty (-1) or invalid (-2), use default of 1 */ s = psm3_env_get("PSM3_MULTI_EP"); - /* if parses as empty or invalid (-1), use default of 1 */ - /* psm3 below us will provide warning as needed when it parses it */ - if (psm3_get_capability_mask(caps) == caps && 0 != psm3_parse_str_yesno(s)) + /* psm3 below us will provide warning as needed when it parses it again */ + if (psm3_get_capability_mask(caps) == caps && (psm3_parse_str_yesno(s, &val) || val)) psmx3_env.multi_ep = 1; else psmx3_env.multi_ep = 0; @@ -438,7 +438,7 @@ static int psmx3_update_hfi_info(void) // if parses as empty or invalid (-1), use default of 0 */ // PSM3 below us will provide warning as needed when it parses it s = psm3_env_get("PSM3_MULTIRAIL"); - (void)psm3_parse_str_int(s, &multirail); + (void)psm3_parse_str_int(s, &multirail, INT_MIN, INT_MAX); psmx3_domain_info.num_reported_units = 0; psmx3_domain_info.num_active_units = 0; @@ -699,6 +699,7 @@ static void psmx3_update_nic_info(struct fi_info *info) } } +static int init_calls; static int psmx3_getinfo(uint32_t api_version, const char *node, const char *service, uint64_t flags, const struct fi_info *hints, struct fi_info **info) @@ -740,6 +741,8 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, goto err_out; } + init_calls += 1; + /* when available, default domain and fabric names are a superset * of all individual names, so we can do a substr search as a 1st level * filter @@ -872,6 +875,9 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, *info = prov_info; free(src_addr); free(dest_addr); + if (hints || init_calls >= 2) { + psm3_turn_off_init_cache(); + } return 0; err_out: diff --git a/prov/psm3/src/psmx3_msg.c b/prov/psm3/src/psmx3_msg.c index 3fe17a6bf73..519593def74 100644 --- a/prov/psm3/src/psmx3_msg.c +++ b/prov/psm3/src/psmx3_msg.c @@ -225,7 +225,7 @@ ssize_t psmx3_send_generic(struct fid_ep *ep, const void *buf, size_t len, return -FI_EMSGSIZE; err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -374,7 +374,7 @@ ssize_t psmx3_sendv_generic(struct fid_ep *ep, const struct iovec *iov, } err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, req->buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, req->buf, len); free(req); diff --git a/prov/psm3/src/psmx3_tagged.c b/prov/psm3/src/psmx3_tagged.c index 17caec29533..41475dc211c 100644 --- a/prov/psm3/src/psmx3_tagged.c +++ b/prov/psm3/src/psmx3_tagged.c @@ -551,7 +551,7 @@ ssize_t psmx3_tagged_send_generic(struct fid_ep *ep, return -FI_EMSGSIZE; err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - 0, &psm2_tag, buf, len); + PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -764,8 +764,8 @@ psmx3_tagged_inject_specialized(struct fid_ep *ep, const void *buf, else PSMX3_SET_TAG(psm2_tag, tag, ep_priv->sep_id, PSMX3_TYPE_TAGGED); - err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, 0, - &psm2_tag, buf, len); + err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, + PSM2_MQ_FLAG_INJECT, &psm2_tag, buf, len); if (err != PSM2_OK) return psmx3_errno(err); @@ -915,7 +915,7 @@ ssize_t psmx3_tagged_sendv_generic(struct fid_ep 
*ep, } err = psm3_mq_send2(ep_priv->tx->psm2_mq, psm2_epaddr, - send_flag, &psm2_tag, req->buf, len); + send_flag|PSM2_MQ_FLAG_INJECT, &psm2_tag, req->buf, len); free(req);
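A minimal sketch of calling the new psm3_getenv_range() added above (not part of the patch): the variable name PSM3_EXAMPLE_DEPTH, its default and bounds are hypothetical, and PSMI_ENVVAR_LEVEL_USER is assumed to be one of the existing level flags.

	union psmi_envvar_val depth;
	(void)psm3_getenv_range("PSM3_EXAMPLE_DEPTH",
		"Example queue depth",			/* descr */
		"Deeper queues use more memory",	/* help, new in this API */
		PSMI_ENVVAR_LEVEL_USER,			/* no NOMIN/NOMAX flags, so range is enforced */
		PSMI_ENVVAR_TYPE_UINT,
		(union psmi_envvar_val)256,		/* default when unset or invalid */
		(union psmi_envvar_val)1,		/* min; 'min'/'minimum' accepted unless NOABBREV */
		(union psmi_envvar_val)4096,		/* max; 'max'/'maximum' accepted unless NOABBREV */
		NULL, NULL,				/* optional check callback and its context pointer */
		&depth);
	/* returns 0 for valid input, 1 when defaulted, -1 for invalid input;
	 * with PSMI_ENVVAR_FLAG_FATAL in level_flags, invalid input aborts instead */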
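Similarly, a hypothetical use of the reworked tuple parser, showing that unsupplied fields keep their prior contents:

	int vals[3] = { 4, 8, 16 };	/* defaults for a 3-field tuple */
	/* "32::64" updates vals[0] and vals[2], skips the empty middle field
	 * and returns 0, leaving vals = { 32, 8, 64 }; a malformed field such
	 * as "32:x:64" returns -2 while still storing the valid fields */
	(void)psm3_parse_str_tuples("32::64", 3, vals);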