From 55f288d0df0c7d2593807412ede68ec867987d56 Mon Sep 17 00:00:00 2001 From: Adam Goldman Date: Mon, 3 Mar 2025 10:03:04 -0500 Subject: [PATCH] prov/psm3: update provider to sync with IEFS 12.0.0.0.36 Updates: - GPU HAL - Removed AVX requirement, now will only warn. - Improved PSM3_RDMA modes 2 & 3's performance. - Improved NIC selection algorithms Signed-off-by: Adam Goldman (cherry picked from commit f09b96d88fc8ff420ce01aad6d77a8b6a7ef8062) --- prov/psm3/COPYING | 1 + prov/psm3/Makefile.am | 5 +- prov/psm3/Makefile.include | 20 +- prov/psm3/VERSION | 2 +- prov/psm3/configure.ac | 31 +- prov/psm3/configure.m4 | 43 +- prov/psm3/debian/changelog | 2 +- prov/psm3/libpsm3-fi.spec.in | 2 +- prov/psm3/psm3/Makefile.include | 20 +- prov/psm3/psm3/gpu/psm_gpu_cuda.c | 2025 ++++++++++ prov/psm3/psm3/gpu/psm_gpu_hal.c | 422 ++ prov/psm3/psm3/gpu/psm_gpu_hal.h | 817 ++++ prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c | 3548 +++++++++++++++++ prov/psm3/psm3/hal_sockets/sockets_ep.c | 26 +- prov/psm3/psm3/hal_sockets/sockets_ep.h | 2 +- prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c | 24 +- prov/psm3/psm3/hal_sockets/sockets_hal.c | 68 +- prov/psm3/psm3/hal_sockets/sockets_hal.h | 4 +- .../psm3/hal_sockets/sockets_hal_inline_i.h | 14 +- prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c | 2 +- prov/psm3/psm3/hal_sockets/sockets_spio.c | 44 +- prov/psm3/psm3/hal_verbs/verbs_ep.c | 297 +- prov/psm3/psm3/hal_verbs/verbs_ep.h | 14 +- prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c | 27 +- prov/psm3/psm3/hal_verbs/verbs_hal.c | 49 +- prov/psm3/psm3/hal_verbs/verbs_hal.h | 4 +- prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h | 77 +- prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c | 4 +- prov/psm3/psm3/hal_verbs/verbs_spio.c | 84 +- prov/psm3/psm3/include/linux-i386/sysdep.h | 28 - prov/psm3/psm3/include/utils_debug.h | 6 +- prov/psm3/psm3/include/utils_env.h | 5 + prov/psm3/psm3/include/utils_user.h | 19 +- prov/psm3/psm3/psm.c | 1069 +---- prov/psm3/psm3/psm2.h | 26 +- prov/psm3/psm3/psm2_hal.c | 6 +- prov/psm3/psm3/psm2_hal.h | 26 +- prov/psm3/psm3/psm2_hal_inline_t.h | 8 +- prov/psm3/psm3/psm2_hal_loopback.c | 12 +- prov/psm3/psm3/psm2_mq.h | 8 +- prov/psm3/psm3/psm_config.h | 30 +- prov/psm3/psm3/psm_context.c | 102 +- prov/psm3/psm3/psm_ep.c | 57 +- prov/psm3/psm3/psm_ep.h | 20 +- prov/psm3/psm3/psm_help.h | 16 +- prov/psm3/psm3/psm_mpool.c | 20 +- prov/psm3/psm3/psm_mpool.h | 2 +- prov/psm3/psm3/psm_mq.c | 78 +- prov/psm3/psm3/psm_mq_internal.h | 41 +- prov/psm3/psm3/psm_mq_recv.c | 69 +- prov/psm3/psm3/psm_nic_select.c | 885 ++-- prov/psm3/psm3/psm_nic_select.h | 42 +- prov/psm3/psm3/psm_oneapi_ze.c | 1040 ----- prov/psm3/psm3/psm_rndv_mod.c | 402 +- prov/psm3/psm3/psm_rndv_mod.h | 30 +- prov/psm3/psm3/psm_sysbuf.c | 84 +- prov/psm3/psm3/psm_sysbuf.h | 2 +- prov/psm3/psm3/psm_user.h | 1213 +----- prov/psm3/psm3/psm_utils.c | 117 +- prov/psm3/psm3/psm_utils.h | 32 +- prov/psm3/psm3/psm_verbs_mr.c | 175 +- prov/psm3/psm3/psm_verbs_mr.h | 16 +- prov/psm3/psm3/ptl.h | 32 +- .../psm3/ptl_am/am_cuda_memhandle_cache.c | 515 --- .../psm3/ptl_am/am_cuda_memhandle_cache.h | 91 - .../psm3/ptl_am/am_oneapi_memhandle_cache.c | 696 ---- .../psm3/ptl_am/am_oneapi_memhandle_cache.h | 97 - prov/psm3/psm3/ptl_am/am_reqrep_shmem.c | 558 +-- prov/psm3/psm3/ptl_am/psm_am_internal.h | 54 +- prov/psm3/psm3/ptl_am/ptl.c | 133 +- prov/psm3/psm3/ptl_ips/ips_config.h | 6 +- prov/psm3/psm3/ptl_ips/ips_expected_proto.h | 17 +- prov/psm3/psm3/ptl_ips/ips_proto.c | 216 +- prov/psm3/psm3/ptl_ips/ips_proto.h | 36 +- prov/psm3/psm3/ptl_ips/ips_proto_connect.h | 24 
+- prov/psm3/psm3/ptl_ips/ips_proto_expected.c | 77 +- prov/psm3/psm3/ptl_ips/ips_proto_mq.c | 138 +- prov/psm3/psm3/ptl_ips/ips_proto_params.h | 7 +- prov/psm3/psm3/ptl_ips/ips_proto_recv.c | 3 + prov/psm3/psm3/ptl_ips/ips_scb.c | 4 +- prov/psm3/psm3/ptl_ips/ips_scb.h | 6 +- prov/psm3/psm3/ptl_ips/ptl.c | 17 +- prov/psm3/psm3/ptl_ips/ptl_rcvthread.c | 44 +- prov/psm3/psm3/ptl_self/ptl.c | 25 +- prov/psm3/psm3/utils/utils_debug.c | 2 +- prov/psm3/psm3/utils/utils_dsa.c | 4 +- prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c | 4 +- prov/psm3/src/psm3_revision.c.in | 4 - prov/psm3/src/psmx3.h | 1 + prov/psm3/src/psmx3_atomic.c | 36 +- prov/psm3/src/psmx3_attr.c | 36 +- prov/psm3/src/psmx3_ep.c | 50 + prov/psm3/src/psmx3_init.c | 23 +- prov/psm3/src/psmx3_rma.c | 84 +- prov/psm3/src/psmx3_wait.c | 75 +- 95 files changed, 8734 insertions(+), 7745 deletions(-) create mode 120000 prov/psm3/COPYING create mode 100644 prov/psm3/psm3/gpu/psm_gpu_cuda.c create mode 100644 prov/psm3/psm3/gpu/psm_gpu_hal.c create mode 100644 prov/psm3/psm3/gpu/psm_gpu_hal.h create mode 100644 prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c mode change 100755 => 100644 prov/psm3/psm3/hal_sockets/sockets_ep.c mode change 100755 => 100644 prov/psm3/psm3/hal_sockets/sockets_hal.h mode change 100755 => 100644 prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c delete mode 100644 prov/psm3/psm3/psm_oneapi_ze.c delete mode 100644 prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c delete mode 100644 prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h delete mode 100644 prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c delete mode 100644 prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h diff --git a/prov/psm3/COPYING b/prov/psm3/COPYING new file mode 120000 index 00000000000..7d29222e4ca --- /dev/null +++ b/prov/psm3/COPYING @@ -0,0 +1 @@ +../../COPYING \ No newline at end of file diff --git a/prov/psm3/Makefile.am b/prov/psm3/Makefile.am index cec9bddede3..80def139e48 100644 --- a/prov/psm3/Makefile.am +++ b/prov/psm3/Makefile.am @@ -1,6 +1,6 @@ # # Copyright (c) 2016 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2017-2018 Intel Corporation, Inc. All right reserved. +# Copyright (c) 2017-2024 Intel Corporation, Inc. All right reserved. # Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All rights reserved. 
# (C) Copyright 2020 Hewlett Packard Enterprise Development LP # @@ -97,6 +97,7 @@ common_srcs = \ shared/var.c \ shared/abi_1_0.c + if MACOS common_srcs += shared/osx/osd.c common_srcs += shared/unix/osd.c @@ -230,7 +231,7 @@ src_libpsm3_fi_la_LDFLAGS += -lpsm2 endif !HAVE_PSM3_SRC if !EMBEDDED -src_libpsm3_fi_la_LDFLAGS += -version-info 24:0:23 +src_libpsm3_fi_la_LDFLAGS += -version-info 25:0:24 endif prov_install_man_pages = man/man7/fi_psm3.7 diff --git a/prov/psm3/Makefile.include b/prov/psm3/Makefile.include index 9a7ef74370a..257af00e361 100644 --- a/prov/psm3/Makefile.include +++ b/prov/psm3/Makefile.include @@ -52,14 +52,11 @@ noinst_LTLIBRARIES += \ prov/psm3/psm3/libptl_self.la \ prov/psm3/psm3/libhal_verbs.la \ prov/psm3/psm3/libhal_sockets.la \ + prov/psm3/psm3/libgpu.la \ prov/psm3/psm3/libpsm3i.la prov_psm3_psm3_libptl_am_la_SOURCES = \ prov/psm3/psm3/ptl_am/am_config.h \ - prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c \ - prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h \ - prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c \ - prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h \ prov/psm3/psm3/ptl_am/am_reqrep.c \ prov/psm3/psm3/ptl_am/am_reqrep_shmem.c \ prov/psm3/psm3/ptl_am/cmarw.h \ @@ -191,6 +188,17 @@ prov_psm3_psm3_libhal_sockets_la_CPPFLAGS = \ prov_psm3_psm3_libhal_sockets_la_CFLAGS = \ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +prov_psm3_psm3_libgpu_la_SOURCES = \ + prov/psm3/psm3/gpu/psm_gpu_hal.c \ + prov/psm3/psm3/gpu/psm_gpu_hal.h \ + prov/psm3/psm3/gpu/psm_gpu_cuda.c \ + prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c +prov_psm3_psm3_libgpu_la_CPPFLAGS = \ + -I$(top_srcdir)/prov/psm3/psm3/gpu/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +prov_psm3_psm3_libgpu_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm.c \ prov/psm3/psm3/psm_am.c \ @@ -218,7 +226,6 @@ prov_psm3_psm3_libpsm3i_la_SOURCES = \ prov/psm3/psm3/psm_netutils.h \ prov/psm3/psm3/psm_nic_select.c \ prov/psm3/psm3/psm_nic_select.h \ - prov/psm3/psm3/psm_oneapi_ze.c \ prov/psm3/psm3/psm_perf.c \ prov/psm3/psm3/psm_perf.h \ prov/psm3/psm3/psm_rndv_mod.c \ @@ -263,6 +270,7 @@ prov_psm3_psm3_libpsm3i_la_LIBADD = \ prov/psm3/psm3/libptl_ips.la \ prov/psm3/psm3/libptl_self.la \ prov/psm3/psm3/libhal_verbs.la \ + prov/psm3/psm3/libgpu.la \ prov/psm3/psm3/libhal_sockets.la prov_psm3_psm3_libpsm3i_la_DEPENDENCIES = \ @@ -271,6 +279,7 @@ prov_psm3_psm3_libpsm3i_la_DEPENDENCIES = \ prov/psm3/psm3/libptl_ips.la \ prov/psm3/psm3/libptl_self.la \ prov/psm3/psm3/libhal_verbs.la \ + prov/psm3/psm3/libgpu.la \ prov/psm3/psm3/libhal_sockets.la # Mirror EXTRA_DIST to end of file @@ -288,6 +297,7 @@ chksum_srcs += \ $(prov_psm3_psm3_libutils_la_SOURCES) \ $(prov_psm3_psm3_libhal_verbs_la_SOURCES) \ $(prov_psm3_psm3_libhal_sockets_la_SOURCES) \ + $(prov_psm3_psm3_libgpu_la_SOURCES) \ $(prov_psm3_psm3_libpsm3i_la_SOURCES) \ $(prov_psm3_extra_dist) diff --git a/prov/psm3/VERSION b/prov/psm3/VERSION index 8cb63b0114c..a38fee63f9d 100644 --- a/prov/psm3/VERSION +++ b/prov/psm3/VERSION @@ -1 +1 @@ -3_7_0_0 +4_0_0_0 diff --git a/prov/psm3/configure.ac b/prov/psm3/configure.ac index 53569e8e510..18a02468985 100644 --- a/prov/psm3/configure.ac +++ b/prov/psm3/configure.ac @@ -143,7 +143,7 @@ AS_IF([test "x$enable_psm3_rc" = "xcheck"], AS_IF([test "x$enable_psm3_rc" = "xyes"], [ AS_IF([test "x$enable_psm3_verbs" = "xyes"], - [CPPFLAGS="$CPPFLAGS -DUSE_RC"], + [CPPFLAGS="$CPPFLAGS -DUSE_RC -DUSE_RDMA_READ"], [AC_MSG_ERROR([User RC QPs requires Verbs HAL 
active])]) ]) AS_IF([test "x$enable_psm3_src" = "xyes"], @@ -690,12 +690,14 @@ AS_IF([test "$have_oneapi_ze" = "1"], LIBS="$LIBS $ze_LIBS" dnl - Check for zeMemPutIpcHandle after ZE added to LIBS/*FLAGS + save_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS -lze_loader" AC_MSG_CHECKING([for zeMemPutIpcHandle support in level-zero]) AC_LINK_IFELSE( [AC_LANG_PROGRAM([[ #include ]],[[ - ze_context_handle_t hContext; + ze_context_handle_t hContext = NULL; ze_ipc_mem_handle_t handle; (void)zeMemPutIpcHandle(hContext, handle); ]]) @@ -703,8 +705,10 @@ AS_IF([test "$have_oneapi_ze" = "1"], AC_MSG_RESULT(yes) have_oneapi_zeMemPutIpcHandle=1 CPPFLAGS="$CPPFLAGS -DPSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE" + LDFLAGS="$save_LDFLAGS" ],[ AC_MSG_RESULT(no) + LDFLAGS="$save_LDFLAGS" ]) ]) @@ -910,12 +914,9 @@ AS_IF([test ! -z "$CC" && ( test "x${CC%% *}" = "xicc" || test "x${CC%% *}" = "x [ dnl ICC/ICX CFLAGS="$CFLAGS -Werror -xATOM_SSE4.2 -DPSM_AVX512 -fpic -fPIC -D_GNU_SOURCE -DPACK_STRUCT_STL=packed," LDFLAGS="$LDFLAGS -Wc,-static-intel" - PSM3_MARCH="avx2" ], [ dnl GCC/other - CFLAGS="$CFLAGS -Werror -mavx2 -fpic -fPIC -funwind-tables -Wformat -Wformat-security" - PSM3_MARCH="avx2" + CFLAGS="$CFLAGS -Werror -msse4.2 -fpic -fPIC -funwind-tables -Wformat -Wformat-security" ]) -AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction set]) AS_IF([test ! -z "$PSM_CPPFLAGS"], [CPPFLAGS="$CPPFLAGS $PSM_CPPFLAGS"], []) AS_IF([test ! -z "$PSM_CFLAGS"], [CFLAGS="$CFLAGS $PSM_CFLAGS"], []) @@ -936,12 +937,10 @@ AC_DEFINE([HAVE_XPMEM], 0, [Ignore HAVE_XPMEM]) dnl Provider-specific checks dnl FI_PROVIDER_INIT -AC_DEFINE([HAVE_BGQ], 0, [Ignore HAVE_BGQ]) -AC_DEFINE([HAVE_BGQ_DL], 0, [Ignore HAVE_BGQ_DL]) +AC_DEFINE([HAVE_CXI], 0, [Ignore HAVE_CXI]) +AC_DEFINE([HAVE_CXI_DL], 0, [Ignore HAVE_CXI_DL]) AC_DEFINE([HAVE_EFA], 0, [Ignore HAVE_EFA]) AC_DEFINE([HAVE_EFA_DL], 0, [Ignore HAVE_EFA_DL]) -AC_DEFINE([HAVE_GNI], 0, [Ignore HAVE_GNI]) -AC_DEFINE([HAVE_GNI_DL], 0, [Ignore HAVE_GNI_DL]) AC_DEFINE([HAVE_MRAIL], 0, [Ignore HAVE_MRAIL]) AC_DEFINE([HAVE_MRAIL_DL], 0, [Ignore HAVE_MRAIL_DL]) AC_DEFINE([HAVE_NET], 0, [Ignore HAVE_NET]) @@ -954,8 +953,6 @@ AC_DEFINE([HAVE_PSM2_DL], 0, [Ignore HAVE_PSM2_DL]) dnl FI_PROVIDER_SETUP([psm3]) AC_DEFINE([HAVE_OPX], 0, [Ignore HAVE_OPX]) AC_DEFINE([HAVE_OPX_DL], 0, [Ignore HAVE_OPX_DL]) -AC_DEFINE([HAVE_RSTREAM], 0, [Ignore HAVE_RSTREAM]) -AC_DEFINE([HAVE_RSTREAM_DL], 0, [Ignore HAVE_RSTREAM_DL]) AC_DEFINE([HAVE_RXD], 0, [Ignore HAVE_RXD]) AC_DEFINE([HAVE_RXD_DL], 0, [Ignore HAVE_RXD_DL]) AC_DEFINE([HAVE_RXM], 0, [Ignore HAVE_RXM]) @@ -974,8 +971,6 @@ AC_DEFINE([HAVE_UCX], 0, [Ignore HAVE_UCX]) AC_DEFINE([HAVE_UCX_DL], 0, [Ignore HAVE_UCX_DL]) AC_DEFINE([HAVE_UDP], 0, [Ignore HAVE_UDP]) AC_DEFINE([HAVE_UDP_DL], 0, [Ignore HAVE_UDP_DL]) -AC_DEFINE([HAVE_USNIC], 0, [Ignore HAVE_USNIC]) -AC_DEFINE([HAVE_USNIC_DL], 0, [Ignore HAVE_USNIC_DL]) AC_DEFINE([HAVE_VERBS], 0, [Ignore HAVE_VERBS]) AC_DEFINE([HAVE_VERBS_DL], 0, [Ignore HAVE_VERBS_DL]) dnl FI_PROVIDER_FINI @@ -991,8 +986,12 @@ AM_COND_IF([HAVE_PSM3_SRC], AS_IF([test -z "${PSM3_IEFS_VERSION}"], [PSM3_IEFS_VERSION="${PACKAGE_VERSION}$(whoami)"]) PSM3_IEFS_VERSION=$(echo "${PSM3_IEFS_VERSION}" | tr '.' 
'_') PSM3_GIT_HASH="$(git rev-parse HEAD)" - RPM_RELEASE=$(echo "${PSM3_IEFS_VERSION}" | cut -d'_' -f5) - RELEASE_VER=$(echo "${PSM3_IEFS_VERSION}" | cut -d'_' -f1-4 | sed 's/_/./g') + RPM_RELEASE=$(echo "${PSM3_IEFS_VERSION}" | tr -s '@<:@A-Z@:>@' '_' | cut -d'_' -f5) + RELEASE_VER=$(echo "${PSM3_IEFS_VERSION}" | tr -s '@<:@A-Z@:>@' '_' | cut -d'_' -f1-4 | sed 's/_/./g') + char=$(echo "${PSM3_IEFS_VERSION}" | tr -dc '@<:@A-Z@:>@' | tr '@<:@A-Z@:>@' '@<:@a-z@:>@') + AS_IF([test -n "$char"], [ + RPM_RELEASE="0${char}${RPM_RELEASE}" + ]) AS_IF([test x"${RELEASE_VER}" = x"${PACKAGE_VERSION}"], [], [ AC_MSG_NOTICE([Release Tag does not match VERSION file]) AC_MSG_NOTICE([${RELEASE_VER} != ${PACKAGE_VERSION}]) diff --git a/prov/psm3/configure.m4 b/prov/psm3/configure.m4 index 5c8c083f7dc..1fd157f7b58 100644 --- a/prov/psm3/configure.m4 +++ b/prov/psm3/configure.m4 @@ -20,7 +20,6 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ PSM3_HAL_INST="" PSM3_HAL_CNT=0 - PSM3_MARCH="" psm3_happy=1 AS_IF([test x"$enable_psm3" != x"no"], @@ -57,7 +56,7 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test "x$enable_psm3_rc" = "xyes"], [ AS_IF([test "x$enable_psm3_verbs" = "xyes"], - [psm3_CPPFLAGS="$psm3_CPPFLAGS -DUSE_RC"], + [psm3_CPPFLAGS="$psm3_CPPFLAGS -DUSE_RC -DUSE_RDMA_READ"], [AC_MSG_ERROR([User RC QPs requires Verbs HAL active])]) ]) @@ -121,39 +120,16 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ ],[ AC_MSG_RESULT([yes]) PSM3_ARCH_CFLAGS="-msse4.2" - PSM3_MARCH="sse4.2" ],[ psm3_happy=0 AC_MSG_RESULT([no]) - AC_MSG_NOTICE([psm3 requires minimum of avx instruction set to build]) + AC_MSG_NOTICE([psm3 requires minimum of sse4.2 instruction set to build]) ]) CFLAGS=$save_CFLAGS - AC_MSG_CHECKING([for -mavx support]) + AC_MSG_CHECKING([for -mavx2 support (recommended)]) save_CFLAGS=$CFLAGS - CFLAGS="$PSM3_STRIP_OPTFLAGS -mavx -O0" - AC_LINK_IFELSE( - [AC_LANG_PROGRAM( - [[#include ]], - [[unsigned long long _a[4] = {1ULL,2ULL,3ULL,4ULL}; - __m256i vA = _mm256_loadu_si256((__m256i *)_a); - __m256i vB; - _mm256_store_si256(&vB, vA); - return 0;]]) - ],[ - AC_MSG_RESULT([yes]) - PSM3_ARCH_CFLAGS="-mavx" - PSM3_MARCH="avx" - ],[ - psm3_happy=0 - AC_MSG_RESULT([no]) - AC_MSG_NOTICE([psm3 requires minimum of avx instruction set to build]) - ]) - CFLAGS=$save_CFLAGS - - AC_MSG_CHECKING([for -mavx2 support]) - save_CFLAGS=$CFLAGS - CFLAGS="$PSM3_STRIP_OPTFLAGS -mavx2 -O0" + CFLAGS="$PSM3_STRIP_OPTFLAGS -O0" AC_LINK_IFELSE( [AC_LANG_PROGRAM( [[#include ]], @@ -164,10 +140,9 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ return 0;]]) ],[ AC_MSG_RESULT([yes]) - PSM3_ARCH_CFLAGS="-mavx2" - PSM3_MARCH="avx2" ],[ AC_MSG_RESULT([no]) + AC_MSG_NOTICE([psm3 recommends minimum of avx2 instruction set for best performance]) ]) CFLAGS=$save_CFLAGS @@ -227,20 +202,24 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AS_IF([test "$have_oneapi_ze" = "1"], [ + save_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS -lze_loader" AC_MSG_CHECKING([for zeMemPutIpcHandle support in level-zero]) AC_LINK_IFELSE( [AC_LANG_PROGRAM([[ #include ]],[[ - ze_context_handle_t hContext; + ze_context_handle_t hContext = NULL; ze_ipc_mem_handle_t handle; (void)zeMemPutIpcHandle(hContext, handle); ]]) ],[ AC_MSG_RESULT(yes) psm3_CPPFLAGS="$psm3_CPPFLAGS -DPSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE" + LDFLAGS="$save_LDFLAGS" ],[ AC_MSG_RESULT(no) + LDFLAGS="$save_LDFLAGS" ]) ]) @@ -413,8 +392,6 @@ AC_DEFUN([FI_PSM3_CONFIGURE],[ AC_SUBST(psm3_LIBS) AC_SUBST(PSM3_HAL_CNT) AC_SUBST(PSM3_HAL_INST) - AC_DEFINE_UNQUOTED([PSM3_MARCH], ["$PSM3_MARCH"], [PSM3 built with instruction set]) - AC_SUBST(PSM3_MARCH) 
PSM3_IEFS_VERSION=m4_normalize(m4_esyscmd([cat prov/psm3/VERSION])) AC_SUBST(PSM3_IEFS_VERSION) diff --git a/prov/psm3/debian/changelog b/prov/psm3/debian/changelog index 52852ac0f5e..399de39bf55 100644 --- a/prov/psm3/debian/changelog +++ b/prov/psm3/debian/changelog @@ -1,4 +1,4 @@ -libpsm3-fi (11.7.0.0-110) unstable; urgency=medium +libpsm3-fi (12.0.0.0-36) unstable; urgency=medium * Initial release diff --git a/prov/psm3/libpsm3-fi.spec.in b/prov/psm3/libpsm3-fi.spec.in index b24d4c13a63..282a84e2b34 100644 --- a/prov/psm3/libpsm3-fi.spec.in +++ b/prov/psm3/libpsm3-fi.spec.in @@ -62,7 +62,7 @@ rm -rf %{buildroot} %files %defattr(-,root,root,-) %{_libdir}/libfabric/%{name}* -%doc README +%doc README COPYING %exclude %{_libdir}/libfabric/*.a %exclude %{_libdir}/libfabric/*.la %exclude %{_libdir}/pkgconfig diff --git a/prov/psm3/psm3/Makefile.include b/prov/psm3/psm3/Makefile.include index fd253089532..a8c87fd261a 100644 --- a/prov/psm3/psm3/Makefile.include +++ b/prov/psm3/psm3/Makefile.include @@ -17,14 +17,11 @@ noinst_LTLIBRARIES += \ psm3/libptl_self.la \ psm3/libhal_verbs.la \ psm3/libhal_sockets.la \ + psm3/libgpu.la \ psm3/libpsm3i.la psm3_libptl_am_la_SOURCES = \ psm3/ptl_am/am_config.h \ - psm3/ptl_am/am_cuda_memhandle_cache.c \ - psm3/ptl_am/am_cuda_memhandle_cache.h \ - psm3/ptl_am/am_oneapi_memhandle_cache.c \ - psm3/ptl_am/am_oneapi_memhandle_cache.h \ psm3/ptl_am/am_reqrep.c \ psm3/ptl_am/am_reqrep_shmem.c \ psm3/ptl_am/cmarw.h \ @@ -156,6 +153,17 @@ psm3_libhal_sockets_la_CPPFLAGS = \ psm3_libhal_sockets_la_CFLAGS = \ $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) +psm3_libgpu_la_SOURCES = \ + psm3/gpu/psm_gpu_hal.c \ + psm3/gpu/psm_gpu_hal.h \ + psm3/gpu/psm_gpu_cuda.c \ + psm3/gpu/psm_gpu_oneapi_ze.c +psm3_libgpu_la_CPPFLAGS = \ + -I$(top_srcdir)/psm3/gpu/ \ + $(AM_CPPFLAGS) $(psm3_CPPFLAGS) $(_psm3_cppflags) +psm3_libgpu_la_CFLAGS = \ + $(AM_CFLAGS) $(psm3_CFLAGS) $(_psm3_cflags) + psm3_libpsm3i_la_SOURCES = \ psm3/psm.c \ psm3/psm_am.c \ @@ -183,7 +191,6 @@ psm3_libpsm3i_la_SOURCES = \ psm3/psm_netutils.h \ psm3/psm_nic_select.c \ psm3/psm_nic_select.h \ - psm3/psm_oneapi_ze.c \ psm3/psm_perf.c \ psm3/psm_perf.h \ psm3/psm_rndv_mod.c \ @@ -228,6 +235,7 @@ psm3_libpsm3i_la_LIBADD = \ psm3/libptl_ips.la \ psm3/libptl_self.la \ psm3/libhal_verbs.la \ + psm3/libgpu.la \ psm3/libhal_sockets.la psm3_libpsm3i_la_DEPENDENCIES = \ @@ -236,6 +244,7 @@ psm3_libpsm3i_la_DEPENDENCIES = \ psm3/libptl_ips.la \ psm3/libptl_self.la \ psm3/libhal_verbs.la \ + psm3/libgpu.la \ psm3/libhal_sockets.la _psm3_extra_dist = \ @@ -252,5 +261,6 @@ chksum_srcs += \ $(psm3_libutils_la_SOURCES) \ $(psm3_libhal_verbs_la_SOURCES) \ $(psm3_libhal_sockets_la_SOURCES) \ + $(psm3_libgpu_la_SOURCES) \ $(psm3_libpsm3i_la_SOURCES) \ $(_psm3_extra_dist) diff --git a/prov/psm3/psm3/gpu/psm_gpu_cuda.c b/prov/psm3/psm3/gpu/psm_gpu_cuda.c new file mode 100644 index 00000000000..7b122134680 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_cuda.c @@ -0,0 +1,2025 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. 
+ + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. 
*/ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include +#include + +#include "psm_user.h" + +#ifdef PSM_CUDA +#include +#include +#include +#include + +#if CUDA_VERSION < 7000 +#error Please update CUDA driver, required minimum version is 7.0 +#endif + +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_am/psm_am_internal.h" +#include "ptl_ips/ips_proto.h" +#include "ptl_ips/ips_expected_proto.h" + +// cuCtxSetFlags(CU_CTX_SYNC_MEMOPS) was introduced in CUDA 12.1.0 +#define PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS (CUDA_VERSION >= 12010) + +// if defined, do not use cuMemHostRegister for malloced pipeline +// copy bounce buffers +// otherwise, use cuMemHostRegister when malloc buffer +//#define PSM3_NO_CUDA_REGISTER + +// default value for PSM3_GPU_THRESH_RNDV +#define PSM3_CUDA_GPU_THRESH_RNDV 8000 +// default value for PSM3_GPU_RNDV_NIC_WINDOW when using Cuda GPU +#define PSM3_CUDA_RNDV_NIC_WINDOW_DEFAULT "2097152" +// default value for PSM3_GPUDIRECT_RDMA_SEND_LIMIT +#define PSM3_CUDA_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT UINT_MAX +// default value for PSM3_GPUDIRECT_RDMA_RECV_LIMIT +#define PSM3_CUDA_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT UINT_MAX +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem +#define PSM3_CUDA_MQ_RNDV_SHM_GPU_THRESH 63 + +/* CUDA Driver Library */ +static void *psm3_cuda_lib; +static int psm3_cuda_lib_version; +/* CUDA Runtime (psm3_cudart) Library */ +static void *psm3_cudart_lib; +static int psm3_cuda_runtime_ver; + +/* This is a global cuda context + * stored to provide hints during a cuda failure + * due to a null cuda context. + */ +CUcontext psm3_cu_ctxt; + +#ifdef PSM_HAVE_RNDV_MOD +static int psm3_cuda_gpu_pin_check; // PSM3_GPU_PIN_CHECK +static uint64_t *psm3_cuda_gpu_bars; +static int psm3_cuda_num_gpu_bars = 0; +static uint64_t psm3_cuda_min_gpu_bar_size; + +static uint64_t psm3_cuda_get_nvidia_bar_addr(int domain, int bus, int slot); +#endif + +typedef enum { + PSM3_CPE_REJECT = 0, + PSM3_CPE_IGNORE = 1, + PSM3_CPE_OBEY = 2, +} psm3_cuda_permitted_enforcement_t; + +static psm3_cuda_permitted_enforcement_t psm3_cuda_permitted_enforcement = PSM3_CPE_IGNORE; + +typedef enum { + PSM3_CUDA_SYNC_CTX = 0, + PSM3_CUDA_SYNC_PTR = 1, + PSM3_CUDA_SYNC_PTR_RELAXED = 2, + PSM3_CUDA_SYNC_NONE = 3, +} psm3_cuda_sync_mode_t; + +static psm3_cuda_sync_mode_t psm3_cuda_sync_mode = PSM3_CUDA_SYNC_CTX; + +/* function pointers from dlopen access to cuda shared library */ +#define PSM3_CUDA_SYM_FP(name) PSM3_CONCAT(psm3_cuda_, name) +static CUresult (*PSM3_CUDA_SYM_FP(cuInit))(unsigned int Flags ); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxDetach))(CUcontext c); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxGetCurrent))(CUcontext *c); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxSetCurrent))(CUcontext c); +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxSetFlags))(unsigned int flags); +#endif +static CUresult (*PSM3_CUDA_SYM_FP(cuPointerGetAttribute))(void *data, CUpointer_attribute pa, CUdeviceptr p); +static CUresult (*PSM3_CUDA_SYM_FP(cuPointerGetAttributes))(unsigned int count, CUpointer_attribute *attrs, void **data, CUdeviceptr ptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuPointerSetAttribute))(void *data, CUpointer_attribute pa, CUdeviceptr p); +static CUresult (*PSM3_CUDA_SYM_FP(cuDeviceCanAccessPeer))(int *canAccessPeer, CUdevice dev, CUdevice peerDev); +static CUresult (*PSM3_CUDA_SYM_FP(cuDeviceGet))(CUdevice* device, int ordinal); +static 
CUresult (*PSM3_CUDA_SYM_FP(cuDeviceGetAttribute))(int* pi, CUdevice_attribute attrib, CUdevice dev); +static CUresult (*PSM3_CUDA_SYM_FP(cuDriverGetVersion))(int* driverVersion); +static CUresult (*PSM3_CUDA_SYM_FP(cuDeviceGetCount))(int* count); +static CUresult (*PSM3_CUDA_SYM_FP(cuStreamCreate))(CUstream* phStream, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuStreamDestroy))(CUstream phStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuStreamSynchronize))(CUstream phStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventCreate))(CUevent* phEvent, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventDestroy))(CUevent hEvent); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventQuery))(CUevent hEvent); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventRecord))(CUevent hEvent, CUstream hStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuEventSynchronize))(CUevent hEvent); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemRetainAllocationHandle))(CUmemGenericAllocationHandle *h, CUdeviceptr p); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemRelease))(CUmemGenericAllocationHandle h); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemHostAlloc))(void** pp, size_t bytesize, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemFreeHost))(void* p); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemHostRegister))(void* p, size_t bytesize, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemHostUnregister))(void* p); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpy))(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyDtoD))(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyDtoH))(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyHtoD))(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyDtoHAsync))(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemcpyHtoDAsync))(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); +static CUresult (*PSM3_CUDA_SYM_FP(cuIpcGetMemHandle))(CUipcMemHandle* pHandle, CUdeviceptr dptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuIpcOpenMemHandle))(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); +static CUresult (*PSM3_CUDA_SYM_FP(cuIpcCloseMemHandle))(CUdeviceptr dptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuMemGetAddressRange))(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); +static CUresult (*PSM3_CUDA_SYM_FP(cuDevicePrimaryCtxGetState))(CUdevice dev, unsigned int* flags, int* active); +static CUresult (*PSM3_CUDA_SYM_FP(cuDevicePrimaryCtxRetain))(CUcontext* pctx, CUdevice dev); +static CUresult (*PSM3_CUDA_SYM_FP(cuCtxGetDevice))(CUdevice* device); +static CUresult (*PSM3_CUDA_SYM_FP(cuDevicePrimaryCtxRelease))(CUdevice device); +static CUresult (*PSM3_CUDA_SYM_FP(cuGetErrorString))(CUresult error, const char **pStr); +static cudaError_t (*PSM3_CUDA_SYM_FP(cudaRuntimeGetVersion))(int* runtimeVersion); + +/* statistics counting each cuda call PSM3 makes */ +#define PSM3_CUDA_SYM_COUNT(name) PSM3_CONCAT(psm3_cuda_count_, name) +static uint64_t PSM3_CUDA_SYM_COUNT(cuInit); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxDetach); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxGetCurrent); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxSetCurrent); +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxSetFlags); +#endif +static uint64_t 
PSM3_CUDA_SYM_COUNT(cuPointerGetAttribute); +static uint64_t PSM3_CUDA_SYM_COUNT(cuPointerGetAttributes); +static uint64_t PSM3_CUDA_SYM_COUNT(cuPointerSetAttribute); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceCanAccessPeer); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceGet); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceGetAttribute); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDriverGetVersion); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDeviceGetCount); +static uint64_t PSM3_CUDA_SYM_COUNT(cuStreamCreate); +static uint64_t PSM3_CUDA_SYM_COUNT(cuStreamDestroy); +static uint64_t PSM3_CUDA_SYM_COUNT(cuStreamSynchronize); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventCreate); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventDestroy); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventQuery); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventRecord); +static uint64_t PSM3_CUDA_SYM_COUNT(cuEventSynchronize); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemRetainAllocationHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemRelease); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemHostAlloc); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemFreeHost); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemHostRegister); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemHostUnregister); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpy); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyDtoD); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyDtoH); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyHtoD); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyDtoHAsync); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemcpyHtoDAsync); +static uint64_t PSM3_CUDA_SYM_COUNT(cuIpcGetMemHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuIpcOpenMemHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuIpcCloseMemHandle); +static uint64_t PSM3_CUDA_SYM_COUNT(cuMemGetAddressRange); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDevicePrimaryCtxGetState); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDevicePrimaryCtxRetain); +static uint64_t PSM3_CUDA_SYM_COUNT(cuCtxGetDevice); +static uint64_t PSM3_CUDA_SYM_COUNT(cuDevicePrimaryCtxRelease); +static uint64_t PSM3_CUDA_SYM_COUNT(cuGetErrorString); +static uint64_t PSM3_CUDA_SYM_COUNT(cudaRuntimeGetVersion); + +/* Set the context-level SYNC_MEMOPS flag (as opposed to the pointer + * attribute) + */ +static void psm3_cuda_ctx_set_sync_memops(void) +{ +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + if (psm3_cuda_sync_mode != PSM3_CUDA_SYNC_CTX) + return; + CUresult err = PSM3_CUDA_SYM_FP(cuCtxSetFlags)(CU_CTX_SYNC_MEMOPS); + if_pf (err != CUDA_SUCCESS) + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Failed to set CUDA context flag: SYNC_MEMOPS\n"); +#endif +} + +static int psm3_cuda_check_set_cuda_ctxt(void) +{ + CUresult err; + CUcontext tmpctxt = {0}; + + if (unlikely(!PSM3_CUDA_SYM_FP(cuCtxGetCurrent) || !PSM3_CUDA_SYM_FP(cuCtxSetCurrent))) + return 0; + + err = PSM3_CUDA_SYM_FP(cuCtxGetCurrent)(&tmpctxt); + if (likely(!err)) { + if (unlikely(!tmpctxt && psm3_cu_ctxt)) { + err = PSM3_CUDA_SYM_FP(cuCtxSetCurrent)(psm3_cu_ctxt); + if (likely(!err)) + psm3_cuda_ctx_set_sync_memops(); + return !!err; + } else if (unlikely(tmpctxt && !psm3_cu_ctxt)) { + psm3_cu_ctxt = tmpctxt; + psm3_cuda_ctx_set_sync_memops(); + } + } + return 0; +} + +/* Make sure have a real GPU job. Set psm3_cu_ctxt if available */ +PSMI_ALWAYS_INLINE( +int psm3_cuda_check_have_cuda_ctxt(void)) +{ + if (! 
psm3_cu_ctxt) { + if (unlikely(psm3_cuda_check_set_cuda_ctxt())) { + psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, "Failed to set/synchronize" + " CUDA context.\n"); + } + } + return (psm3_cu_ctxt != NULL); +} + +/** + * execute the specified function and return the result without error handling + */ +#define PSM3_CUDA_EXEC_ASSUME_CONTEXT(func, args...) \ + ({ \ + PSM3_CONCAT(psm3_cuda_count_, func)++; \ + (CUresult)PSM3_CONCAT(psm3_cuda_, func)(args); \ + }) + +#define PSM3_CUDA_EXEC(func, args...) \ + ({ \ + if (unlikely(psm3_cuda_check_set_cuda_ctxt())) { \ + psm3_handle_error( \ + PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + "Failed to set/synchronize CUDA context.\n"); \ + } \ + PSM3_CUDA_EXEC_ASSUME_CONTEXT(func, args); \ + }) + +/** + * apply boilerplate non-fatal error handling to the indicated error + */ +#define PSM3_CUDA_ERROR(func, cudaerr, log_level) \ + do { \ + const char *pStr = NULL; \ + PSM3_CUDA_SYM_COUNT(cuGetErrorString)++; \ + PSM3_CUDA_SYM_FP(cuGetErrorString)(cudaerr, &pStr); \ + _HFI_##log_level( \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + #func, __FILE__, __LINE__, cudaerr, \ + pStr ? pStr : "Unknown"); \ + } while (0) + +/** + * check for errors, do necessary boilerplate, then fail hard + */ +#define PSM3_CUDA_CHECK(func, cudaerr) \ + do { \ + if (cudaerr != CUDA_SUCCESS) { \ + PSM3_CUDA_ERROR(func, cudaerr, ERROR); \ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function %s.\n", #func); \ + } \ + } while (0) + +/** + * execute the CUDA function and handle any errors with failure + */ +#define PSM3_CUDA_CALL(func, args...) \ + do { \ + CUresult cudaerr = PSM3_CUDA_EXEC(func, args); \ + PSM3_CUDA_CHECK(func, cudaerr); \ + } while (0) + +/** + * Similar to PSM3_CUDA_CALL() except does not error out + * if func(args) returns CUDA_SUCCESS or except_err + * + * Invoker must provide 'CUresult cudaerr' in invoked scope + * so invoker can inspect whether cudaerr == CUDA_SUCCESS or + * cudaerr == except_err after expanded code is executed. + * + * As except_err is an allowed value, message is printed at + * DBG level. + */ +#define PSM3_CUDA_CALL_EXCEPT(except_err, func, args...) 
\ + ({ \ + CUresult cudaerr; \ + do { \ + cudaerr = PSM3_CUDA_EXEC(func, args); \ + if (cudaerr == except_err) { \ + PSM3_CUDA_ERROR(func, cudaerr, ERROR); \ + break; \ + } \ + PSM3_CUDA_CHECK(func, cudaerr); \ + } while (0); \ + cudaerr; \ + }) + +#define PSM3_CUDA_CHECK_EVENT(event, cudaerr) do { \ + PSM3_CUDA_SYM_COUNT(cuEventQuery)++; \ + cudaerr = PSM3_CUDA_SYM_FP(cuEventQuery)(event); \ + if ((cudaerr != CUDA_SUCCESS) && (cudaerr != CUDA_ERROR_NOT_READY)) { \ + const char *pStr = NULL; \ + PSM3_CUDA_SYM_COUNT(cuGetErrorString)++; \ + PSM3_CUDA_SYM_FP(cuGetErrorString)(cudaerr, &pStr); \ + _HFI_ERROR( \ + "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ + "cuEventQuery", __FILE__, __LINE__, cudaerr, \ + pStr?pStr:"Unknown"); \ + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from CUDA function cuEventQuery.\n");\ + } \ + } while (0) + +// resolve a cuda shared library symbol +#define PSM3_CUDA_DLSYM(psm3_cuda_lib,func) do { \ + PSM3_CONCAT(psm3_cuda_, func) = dlsym(psm3_cuda_lib, STRINGIFY(func)); \ + if (!PSM3_CONCAT(psm3_cuda_, func)) { \ + psm3_handle_error(PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + " Unable to resolve %s symbol" \ + " in CUDA libraries.\n",STRINGIFY(func)); \ + } \ +} while (0) + +static int psm3_cuda_lib_load() +{ + psm2_error_t err = PSM2_OK; + char *dlerr; + + PSM2_LOG_MSG("entering"); + _HFI_DBG("Loading CUDA library.\n"); + + psm3_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); + if (!psm3_cuda_lib) { + dlerr = dlerror(); + _HFI_ERROR("Unable to open libcuda.so.1. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + + PSM3_CUDA_SYM_FP(cuDriverGetVersion) = dlsym(psm3_cuda_lib, "cuDriverGetVersion"); + + if (!PSM3_CUDA_SYM_FP(cuDriverGetVersion)) { + _HFI_ERROR + ("Unable to resolve symbols in CUDA libraries.\n"); + goto fail; + } + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuGetErrorString);// for PSM3_CUDA_CALL + + PSM3_CUDA_CALL(cuDriverGetVersion, &psm3_cuda_lib_version); + if (psm3_cuda_lib_version < 7000) { + _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); + goto fail; + } + + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuInit); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxGetCurrent); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxDetach); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxSetCurrent); +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxSetFlags); +#endif + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuPointerGetAttribute); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuPointerGetAttributes); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuPointerSetAttribute); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceCanAccessPeer); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceGetAttribute); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceGet); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDeviceGetCount); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuStreamCreate); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuStreamDestroy); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuStreamSynchronize); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventCreate); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventDestroy); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventQuery); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventRecord); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuEventSynchronize); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemRetainAllocationHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemRelease); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemHostAlloc); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemFreeHost); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemHostRegister); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemHostUnregister); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpy); + 
PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyDtoD); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyDtoH); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyHtoD); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyDtoHAsync); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemcpyHtoDAsync); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuIpcGetMemHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuIpcOpenMemHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuIpcCloseMemHandle); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuMemGetAddressRange); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDevicePrimaryCtxGetState); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDevicePrimaryCtxRetain); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuDevicePrimaryCtxRelease); + PSM3_CUDA_DLSYM(psm3_cuda_lib, cuCtxGetDevice); + + /* CUDA Runtime */ + psm3_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); + if (!psm3_cudart_lib) { + dlerr = dlerror(); + _HFI_ERROR("Unable to open libcudart.so. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + PSM3_CUDA_DLSYM(psm3_cudart_lib, cudaRuntimeGetVersion); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + if (psm3_cuda_lib) + dlclose(psm3_cuda_lib); + if (psm3_cudart_lib) + dlclose(psm3_cudart_lib); + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); + return err; +} + +static void psm3_cuda_stats_register() +{ +#define PSM3_CUDA_COUNT_DECLU64(func) \ + PSMI_STATS_DECLU64(#func, NULL, &PSM3_CONCAT(psm3_cuda_count_, func)) + + struct psmi_stats_entry entries[] = { + PSM3_CUDA_COUNT_DECLU64(cuInit), + PSM3_CUDA_COUNT_DECLU64(cuCtxDetach), + PSM3_CUDA_COUNT_DECLU64(cuCtxGetCurrent), + PSM3_CUDA_COUNT_DECLU64(cuCtxSetCurrent), +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + PSM3_CUDA_COUNT_DECLU64(cuCtxSetFlags), +#endif + PSM3_CUDA_COUNT_DECLU64(cuPointerGetAttribute), + PSM3_CUDA_COUNT_DECLU64(cuPointerGetAttributes), + PSM3_CUDA_COUNT_DECLU64(cuPointerSetAttribute), + PSM3_CUDA_COUNT_DECLU64(cuDeviceCanAccessPeer), + PSM3_CUDA_COUNT_DECLU64(cuDeviceGet), + PSM3_CUDA_COUNT_DECLU64(cuDeviceGetAttribute), + PSM3_CUDA_COUNT_DECLU64(cuDriverGetVersion), + PSM3_CUDA_COUNT_DECLU64(cuDeviceGetCount), + PSM3_CUDA_COUNT_DECLU64(cuStreamCreate), + PSM3_CUDA_COUNT_DECLU64(cuStreamDestroy), + PSM3_CUDA_COUNT_DECLU64(cuStreamSynchronize), + PSM3_CUDA_COUNT_DECLU64(cuEventCreate), + PSM3_CUDA_COUNT_DECLU64(cuEventDestroy), + PSM3_CUDA_COUNT_DECLU64(cuEventQuery), + PSM3_CUDA_COUNT_DECLU64(cuEventRecord), + PSM3_CUDA_COUNT_DECLU64(cuEventSynchronize), + PSM3_CUDA_COUNT_DECLU64(cuMemRetainAllocationHandle), + PSM3_CUDA_COUNT_DECLU64(cuMemRelease), + PSM3_CUDA_COUNT_DECLU64(cuMemHostAlloc), + PSM3_CUDA_COUNT_DECLU64(cuMemFreeHost), + PSM3_CUDA_COUNT_DECLU64(cuMemHostRegister), + PSM3_CUDA_COUNT_DECLU64(cuMemHostUnregister), + PSM3_CUDA_COUNT_DECLU64(cuMemcpy), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyDtoD), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyDtoH), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyHtoD), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyDtoHAsync), + PSM3_CUDA_COUNT_DECLU64(cuMemcpyHtoDAsync), + PSM3_CUDA_COUNT_DECLU64(cuIpcGetMemHandle), + PSM3_CUDA_COUNT_DECLU64(cuIpcOpenMemHandle), + PSM3_CUDA_COUNT_DECLU64(cuIpcCloseMemHandle), + PSM3_CUDA_COUNT_DECLU64(cuMemGetAddressRange), + PSM3_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxGetState), + PSM3_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRetain), + PSM3_CUDA_COUNT_DECLU64(cuCtxGetDevice), + PSM3_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRelease), + PSM3_CUDA_COUNT_DECLU64(cuGetErrorString), + PSM3_CUDA_COUNT_DECLU64(cudaRuntimeGetVersion), + }; +#undef PSM3_CUDA_COUNT_DECLU64 + + psm3_stats_register_type("PSM_Cuda_call_statistics", + 
"Count of CUDA calls per API entry point for the whole process.\n" + "When using an NVIDIA GPU, PSM3 may call lower level CUDA " + "APIs to access or transfer application buffers in GPU memory.", + PSMI_STATSTYPE_GPU, + entries, PSMI_HOWMANY(entries), NULL, + &PSM3_CUDA_SYM_COUNT(cuInit), NULL); /* context must != NULL */ +} + +#ifdef PSM_HAVE_RNDV_MOD +static void psm3_cuda_get_bars(void) +{ + int num_devices, dev; + union psmi_envvar_val env; + + psm3_getenv("PSM3_GPU_PIN_CHECK", + "Enable sanity check of physical addresses mapped into GPU BAR space (Enabled by default)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_cuda_gpu_pin_check = env.e_int; + + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + psm3_cuda_gpu_bars = psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_devices, sizeof(psm3_cuda_gpu_bars[0])); + if (! psm3_cuda_gpu_bars) + return; // psmi_calloc will have exited for Out of Memory + + if (psm3_cuda_gpu_pin_check) + psm3_cuda_num_gpu_bars = num_devices; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + int domain, bus, slot; + + PSM3_CUDA_CALL(cuDeviceGet, &device, dev); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &domain, + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, + device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &bus, + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, + device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &slot, + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, + device); + psm3_cuda_gpu_bars[dev] = psm3_cuda_get_nvidia_bar_addr(domain, bus, slot); + } +} +#endif /* PSM_HAVE_RNDV_MOD */ + +static void psm3_cuda_init_env_cpe(void) +{ + union psmi_envvar_val val; + + int ret = psm3_getenv_range("PSM3_CUDA_PERMITTED_ENFORCEMENT", + "Enforcement policy for the CUDA_PERMITTED endpoint flag\n", + " 0: REJECT attempts to modify as an error\n" + " 1: IGNORE attempts to modify, feigning success (default)\n" + " 2: OBEY by restricting CUDA usage", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)PSM3_CPE_IGNORE, + (union psmi_envvar_val)PSM3_CPE_REJECT, + (union psmi_envvar_val)PSM3_CPE_OBEY, + NULL, + NULL, + &val); + + if (!ret) + psm3_cuda_permitted_enforcement = (psm3_cuda_permitted_enforcement_t)val.e_uint; +} + +static void psm3_cuda_init_env_sync(void) +{ + union psmi_envvar_val val; + + int ret = psm3_getenv_range("PSM3_CUDA_SYNC", + "Policy for setting synchroniation attributes.\n", + " 0 CTX: attempt to set context-level SYNC_MEMOPS on CUDA 12.1 or better\n" + " otherwise, set pointer-level SYNC_MEMOPS\n" + " 1 PTR: always set pointer-level SYNC_MEMOPS\n" + " 2 PTR_RELAXED: always set pointer-level SYNC_MEMOPS,\n" + " but ignore 801 (not supported, expected for VMM allocs)\n" + " 3 NONE: never set SYNC_MEMOPS\n", + PSMI_ENVVAR_LEVEL_HIDDEN, + PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)PSM3_CUDA_SYNC_CTX, + (union psmi_envvar_val)PSM3_CUDA_SYNC_CTX, + (union psmi_envvar_val)PSM3_CUDA_SYNC_NONE, + NULL, + NULL, + &val); + + if (!ret) + psm3_cuda_sync_mode = (psm3_cuda_sync_mode_t)val.e_uint; +} + +static psm2_error_t psm3_cuda_initialize(void) +{ + psm2_error_t err = PSM2_OK; + + PSM2_LOG_MSG("entering"); + _HFI_DBG("Enabling CUDA support.\n"); + + psm3_cuda_stats_register(); + + err = psm3_cuda_lib_load(); + if (err != PSM2_OK) + goto fail; + + PSM3_CUDA_CALL(cuInit, 0); + + PSM3_CUDA_CALL(cudaRuntimeGetVersion, &psm3_cuda_runtime_ver); + +#ifdef PSM_HAVE_RNDV_MOD + psm3_cuda_get_bars(); +#endif + if (! 
psm3_gpu_thresh_rndv) // sockets HAL could set new default + psm3_gpu_thresh_rndv = PSM3_CUDA_GPU_THRESH_RNDV; + psm3_gpu_rndv_nic_window_default = PSM3_CUDA_RNDV_NIC_WINDOW_DEFAULT; + psm3_gpu_gpudirect_rdma_send_limit_default = PSM3_CUDA_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT; + psm3_gpu_gpudirect_rdma_recv_limit_default = PSM3_CUDA_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT; + psm3_gpu_mq_rndv_shm_gpu_thresh_default = PSM3_CUDA_MQ_RNDV_SHM_GPU_THRESH; + + psm3_cuda_init_env_cpe(); + psm3_cuda_init_env_sync(); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM3 CUDA support.\n"); + return err; +} + +static void psm3_cuda_finalize(void) +{ + psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &PSM3_CUDA_SYM_COUNT(cuInit)); +} + +static void psm3_cuda_ep_open(void) +{ + // nothing to do +} + +static void psm3_cuda_ep_close(void) +{ + // nothing to do +} + +static void psm3_cuda_identify(char *accel_vers, size_t size) +{ + char cudart_ver[64] = "unknown"; + if (psm3_cuda_runtime_ver) + snprintf(cudart_ver, sizeof(cudart_ver), "%d.%d", + psm3_cuda_runtime_ver / 1000, (psm3_cuda_runtime_ver % 1000) / 10); + snprintf(accel_vers, size, "%s %s CUDA Runtime %s built against interface %d.%d\n", + psm3_get_mylabel(), psm3_ident_tag, + cudart_ver, CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10); +} + +static int psm3_cuda_p2p_supported() +{ + static int p2p_supported = -1; // -1 indicates "unset" + if (likely(p2p_supported > -1)) + return p2p_supported; + + p2p_supported = 0; + + /* Check which devices the current device has p2p access to. */ + CUdevice current_device; + CUcontext current_context; + int num_devices, dev_idx; + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + + if (num_devices > 1) { + PSM3_CUDA_CALL(cuCtxGetCurrent, ¤t_context); + if (current_context == NULL) { + _HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n"); + return 0; + } + PSM3_CUDA_CALL(cuCtxGetDevice, ¤t_device); + } + + for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { + CUdevice device; + PSM3_CUDA_CALL(cuDeviceGet, &device, dev_idx); + + if (num_devices > 1 && device != current_device) { + int canAccessPeer = 0; + PSM3_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, + current_device, device); + + if (canAccessPeer != 1) + _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev_idx); + else + p2p_supported |= (1 << dev_idx); + } else { + /* Always support p2p on the same GPU */ + psm3_my_gpu_device = dev_idx; + p2p_supported |= (1 << dev_idx); + } + } + + _HFI_DBG("returning (0x%x), device 0x%x (%d)\n", p2p_supported, (1 << psm3_my_gpu_device), psm3_my_gpu_device); + return p2p_supported; +} + +static int psm3_cuda_gpudirect_supported() +{ + static int device_support_gpudirect = -1; // -1 indicates unset + + if (likely(device_support_gpudirect > -1)) return device_support_gpudirect; + + int num_devices, dev; + + /* Check if all devices support GPU Direct RDMA based on version. 
*/ + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + + device_support_gpudirect = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSM3_CUDA_CALL(cuDeviceGet, &device, dev); + + int major; + PSM3_CUDA_CALL(cuDeviceGetAttribute, &major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device); + if (major < 3) { + device_support_gpudirect = 0; + _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); + } + } + + return device_support_gpudirect; +} + +static void psm3_cuda_using_rv_for_mrs(void) +{ + // nothing to do +} + +static void psm3_cuda_verify_GPU_capabilities(void) +{ + static int device_support_unified_addr = -1; // -1 indicates "unchecked" + // we confirm the GPU supports unified addressing since this + // allows a GPU address alone to be sufficient to identify the GPU device + if (likely(device_support_unified_addr > -1)) return; + + int num_devices, dev; + + /* Check if all devices support Unified Virtual Addressing. */ + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + + device_support_unified_addr = 1; + + for (dev = 0; dev < num_devices; dev++) { + CUdevice device; + PSM3_CUDA_CALL(cuDeviceGet, &device, dev); + int unifiedAddressing; + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &unifiedAddressing, + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, + device); + + if (unifiedAddressing !=1) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE, + "CUDA device %d does not support Unified Virtual Addressing.\n", + dev); + } + } + + return; +} + +static void psm3_cuda_get_pci_addr(uint32_t *domain_p, uint32_t *bus_p, + uint32_t *dev_p, uint32_t *func_p) +{ + int domain, bus, dev; + int num_devices; + CUdevice device; + + PSM3_CUDA_CALL(cuDeviceGetCount, &num_devices); + _HFI_DBG("%d Cuda GPUs found\n", num_devices); + if (! num_devices) + return; + + if (num_devices == 1) { + PSM3_CUDA_CALL(cuDeviceGet, &device, 0); + } else { + // all GPUs will be visible to process, see if app chose one first + CUcontext ctxt = {0}; + if (! PSM3_CUDA_SYM_FP(cuCtxGetCurrent) || PSM3_CUDA_SYM_FP(cuCtxGetCurrent)(&ctxt) || ! ctxt) { + _HFI_DBG("Unable to get Cuda ctxt\n"); + //PSM3_CUDA_CALL(cuDeviceGet, &device, 0); + return; + } else { + PSM3_CUDA_CALL(cuCtxGetDevice, &device); + } + } + _HFI_DBG("Using Cuda GPU %d\n", device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device); + PSM3_CUDA_CALL(cuDeviceGetAttribute, + &dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device); + *domain_p = domain; + *bus_p = bus; + *dev_p = dev; + *func_p = 0; +} + +#ifdef PSM_HAVE_RNDV_MOD +// The second BAR address is where the GPU will map GPUDirect memory. +// The beginning of this BAR is reserved for non-GPUDirect uses. +// However, it has been observed that in some multi-process +// pinning failures, HED-2035, the nvidia_p2p_get_pages can foul up +// it's IOMMU after which the next successful pin will incorrectly +// return the 1st physical address of the BAR for the pinned pages. +// In this case it will report this same physical address for other GPU virtual +// addresses and cause RDMA to use the wrong memory. +// As a workaround, we gather the Region 1 BAR address start for each +// GPU and if we see this address returned as the phys_addr of a mmapped +// GPUDirect Copy or the iova of a GPU MR we fail the job before it can +// corrupt any more application data. 
+static uint64_t psm3_cuda_get_nvidia_bar_addr(int domain, int bus, int slot) +{ + char sysfs[100]; + int ret; + FILE *f; + unsigned long long start_addr, end_addr, bar_size; + + ret = snprintf(sysfs, sizeof(sysfs), + "/sys/class/pci_bus/%04x:%02x/device/%04x:%02x:%02x.0/resource", + domain, bus, domain, bus, slot); + psmi_assert_always(ret < sizeof(sysfs)); + f = fopen(sysfs, "r"); + if (! f) { + if (psm3_cuda_gpu_pin_check) { + _HFI_ERROR("Unable to open %s for GPU BAR Address: %s\n", + sysfs, strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unable to get GPU BAR address\n"); + } + return 0; + } + // for each BAR region, start, end and flags are listed in hex + // nVidia uses the 2nd BAR region (aka Region #1) to map peer to peer + // accesses into it's potentially larger GPU local memory space + ret = fscanf(f, "%*x %*x %*x %llx %llx", &start_addr, &end_addr); + if (ret != 2) { + if (psm3_cuda_gpu_pin_check) { + _HFI_ERROR("Unable to get GPU BAR Address from %s: %s\n", + sysfs, strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Unable to get GPU BAR address\n"); + } + fclose(f); + return 0; + } + fclose(f); + + bar_size = (end_addr - start_addr) + 1; + _HFI_DBG("GPU BAR Addr from %s is 0x%llx - 0x%llx (size 0x%llx)\n", sysfs, start_addr, end_addr, bar_size); + if (! psm3_cuda_min_gpu_bar_size || bar_size < psm3_cuda_min_gpu_bar_size) + psm3_cuda_min_gpu_bar_size = bar_size; + return start_addr; +} + +static uint64_t psm3_cuda_min_bar_size(void) +{ + // for ONEAPI can return 0 for now, implement later + return psm3_cuda_min_gpu_bar_size; +} + +static psm2_error_t psm3_cuda_check_phys_addr(uint64_t phys_addr) +{ + int i; + for (i=0; i < psm3_cuda_num_gpu_bars; i++) { + if (phys_addr == psm3_cuda_gpu_bars[i]) { + _HFI_ERROR("Incorrect Physical Address (0x%"PRIx64") returned by nVidia driver. PSM3 exiting to avoid data corruption. 
Job may be rerun with PSM3_GPUDIRECT=0 to avoid this issue.\n", + phys_addr); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Incorrect Physical Address returned by nVidia driver\n"); + psmi_assert_always(0); + return PSM2_INTERNAL_ERR; + } + } + return PSM2_OK; +} + +static void psm3_cuda_roundup_gdrcopy(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p) +{ + *pageaddr_p = buf & GPU_PAGE_MASK; + *pagelen_p = (uint64_t) (PSMI_GPU_PAGESIZE + + ((buf + size - 1) & GPU_PAGE_MASK) - *pageaddr_p); +} + +#ifdef PSM_HAVE_REG_MR +static void psm3_cuda_roundup_rv_reg_mr(struct psm2_ep *ep, + void **addr_p, uint64_t *length_p, int access) +{ + uint64_t addr_in = (uint64_t)*addr_p; + + *addr_p = (void *)ROUNDDOWN64P2(addr_in, PSMI_GPU_PAGESIZE); + *length_p = ROUNDUP64P2(addr_in + *length_p, PSMI_GPU_PAGESIZE) - (uint64_t)*addr_p; +} + +// add Cuda specific information to the mparams in prep for the +// RV_IOCTL_REG_MEM ioctl to rv +// For Cuda, no additional information is needed +static int psm3_cuda_init_rv_reg_mr_params( + void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // nothing to do + return 0; +} +#endif // PSM_HAVE_REG_MR + +// add Cuda specific information to the params in prep for the +// RV_IOCTL_PIN_MMAP ioctl to rv +// For Cuda, no additional information is needed +static int psm3_cuda_init_rv_pin_mmap_params( + void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // nothing to do + return 0; +} + +// cleanup Cuda specific scratchpad from +// psm3_cuda_init_rv_reg_mr_params or +// psm3_cuda_init_rv_pin_mmap_params +// called on success or error path, makes sure not to polute errno +// as it can reflect the earlier error for the error path in caller. 
+static void psm3_cuda_rv_reg_mmap_cleanup( + void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // nothing to do +} +#endif /* PSM_HAVE_RNDV_MOD */ + +#ifdef PSM_HAVE_REG_MR +// compare GPU specific fields in verbs MR cache entry +static int psm3_cuda_cmp_mr(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b) +{ + // nothing to do + return 0; +} + +// initialize GPU specific fields in verbs MR cache entry +static void psm3_cuda_init_mr(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific) +{ + // nothing to do +} +#endif /* PSM_HAVE_REG_MR */ + +static void psm3_cuda_fetch_ctxt(void) +{ + PSM3_CUDA_CALL(cuCtxGetCurrent, &psm3_cu_ctxt); +} + +// ensure psm3_cu_ctxt reflects our most recent psm3_cu_ctxt +static void psm3_cuda_refresh_ctxt(void) +{ + if (psm3_cu_ctxt) + PSM3_CUDA_CALL(cuCtxSetCurrent, psm3_cu_ctxt); +} + +static void psm3_cuda_register_hostmem(void *buf, uint32_t size) +{ +#ifndef PSM3_NO_CUDA_REGISTER + // By registering memory with Cuda, we make + // cuMemcpy run faster for copies + if (psm3_cuda_check_have_cuda_ctxt()) { + PSM3_CUDA_CALL(cuMemHostRegister, + buf, size, CU_MEMHOSTALLOC_PORTABLE); + } +#endif +} + +static void psm3_cuda_unregister_hostmem(void *buf) +{ +#ifndef PSM3_NO_CUDA_REGISTER + if (psm3_cu_ctxt) { + /* ignore NOT_REGISTERED in case cuda initialized late */ + /* ignore other errors as context could be destroyed before this */ + CUresult cudaerr = PSM3_CUDA_EXEC_ASSUME_CONTEXT(cuMemHostUnregister, buf); + if (cudaerr) + PSM3_CUDA_ERROR(cuMemHostUnregister, cudaerr, DBG); + } +#endif +} + +static int psm3_cuda_is_gpu_mem(const void *ptr) +{ + CUresult cudaerr; + CUpointer_attribute attrs[] = { + CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + CU_POINTER_ATTRIBUTE_IS_MANAGED, + }; + CUmemorytype mt = 0; + uint64_t managed = 0; + void *resp[] = { &mt, &managed }; + + static_assert(PSMI_HOWMANY(attrs) == PSMI_HOWMANY(resp), + "attribute count must equal response count"); + + cudaerr = PSM3_CUDA_SYM_FP(cuPointerGetAttributes)( + PSMI_HOWMANY(attrs), attrs, resp, (CUdeviceptr)ptr); + PSM3_CUDA_SYM_COUNT(cuPointerGetAttributes) += 1; + return cudaerr == CUDA_SUCCESS && mt == CU_MEMORYTYPE_DEVICE && !managed; +} + +static void psm3_cuda_prepare_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + protoexp->gpu_specific.cudastream_recv = NULL; +} + +static void psm3_cuda_prepare_DtoH_memcpys(struct ips_proto *proto) +{ + proto->gpu_specific.cudastream_send = NULL; +} + +static void psm3_cuda_shutdown_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + if (protoexp->gpu_specific.cudastream_recv != NULL) { + PSM3_CUDA_CALL(cuStreamDestroy, protoexp->gpu_specific.cudastream_recv); + } +} + +static void psm3_cuda_shutdown_DtoH_memcpys(struct ips_proto *proto) +{ + if (proto->gpu_specific.cudastream_send) { + PSM3_CUDA_CALL(cuStreamDestroy, proto->gpu_specific.cudastream_send); + } +} + +static void psm3_cuda_memcpy_HtoD_start(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + if (protoexp->gpu_specific.cudastream_recv == NULL) { + PSM3_CUDA_CALL(cuStreamCreate, &protoexp->gpu_specific.cudastream_recv, + CU_STREAM_NON_BLOCKING); + } + PSM3_CUDA_CALL(cuMemcpyHtoDAsync, (CUdeviceptr)ghb->gpu_buf, ghb->host_buf, + len, protoexp->gpu_specific.cudastream_recv); + if (ghb->gpu_specific.cuda_copy_status == NULL) { + PSM3_CUDA_CALL(cuEventCreate, &ghb->gpu_specific.cuda_copy_status, CU_EVENT_DEFAULT); + } + PSM3_CUDA_CALL(cuEventRecord, 
ghb->gpu_specific.cuda_copy_status, protoexp->gpu_specific.cudastream_recv); +} + +static void psm3_cuda_memcpy_DtoH_start(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + if (proto->gpu_specific.cudastream_send == NULL) { + PSM3_CUDA_CALL(cuStreamCreate, &proto->gpu_specific.cudastream_send, + CU_STREAM_NON_BLOCKING); + } + if (ghb->gpu_specific.cuda_copy_status == NULL) { + PSM3_CUDA_CALL(cuEventCreate, &ghb->gpu_specific.cuda_copy_status, CU_EVENT_DEFAULT); + } + PSM3_CUDA_CALL(cuMemcpyDtoHAsync, ghb->host_buf, (CUdeviceptr)ghb->gpu_buf, + len, proto->gpu_specific.cudastream_send); + PSM3_CUDA_CALL(cuEventRecord, ghb->gpu_specific.cuda_copy_status, proto->gpu_specific.cudastream_send); +} + +static int psm3_cuda_memcpy_done(struct ips_gpu_hostbuf *ghb) +{ + CUresult status; + PSM3_CUDA_CHECK_EVENT(ghb->gpu_specific.cuda_copy_status, status); + return (status == CUDA_SUCCESS); +} + +static void psm3_cuda_hostbuf_lazy_init(struct ips_gpu_hostbuf *ghb) +{ + ghb->gpu_specific.cuda_copy_status = NULL; +} + +static void psm3_cuda_hostbuf_reset(struct ips_gpu_hostbuf *ghb) +{ + // nothing to do +} + +static void psm3_cuda_hostbuf_destroy(struct ips_gpu_hostbuf *ghb) +{ + if (ghb->gpu_specific.cuda_copy_status != NULL) { + PSM3_CUDA_CALL(cuEventDestroy, ghb->gpu_specific.cuda_copy_status); + } + if (ghb->host_buf != NULL) { + PSM3_CUDA_CALL(cuMemFreeHost, ghb->host_buf); + } +} + +static void psm3_cuda_memcpy_DtoD(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)dstptr, (CUdeviceptr)srcptr, len); +} + +static void psm3_cuda_memcpy_HtoD(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)dstptr, srcptr, len); +} + +static void psm3_cuda_memcpy_DtoH(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpyDtoH, dstptr, (CUdeviceptr)srcptr, len); +} + +static void psm3_cuda_memcpy(void *dstptr, const void *srcptr, uint32_t len) +{ + PSM3_CUDA_CALL(cuMemcpy, (CUdeviceptr)dstptr, (CUdeviceptr)srcptr, len); +} + +static void psm3_cuda_synchronize_memcpy(void) +{ + PSM3_CUDA_CALL(cuStreamSynchronize, 0); +} + +/* + * CUDA documentation dictates the use of SYNC_MEMOPS attribute when the buffer + * pointer received into PSM has been allocated by the application and is the + * target of GPUDirect DMA operations. + * + * Normally, CUDA is permitted to implicitly execute synchronous memory + * operations as asynchronous operations, relying on commands arriving via CUDA + * for proper sequencing. GDR, however, bypasses CUDA, enabling races, e.g. + * cuMemcpy sequenced before a GDR operation. + * + * SYNC_MEMOPS avoids this optimization. + * + * Note that allocations via the "VMM" API, i.e. cuMemCreate, do not support the + * SYNC_MEMOPS pointer attribute, and will return 801 (not supported). If we're + * using the newer context-level sync flag available in CUDA 12.1+ to avoid this + * issue, we will not set the pointer-level sync flag here. 
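 *
 * Rough illustration (not a verbatim excerpt from this file) of the
 * pointer-level flag that psm3_cuda_mark_buf_synchronous() sets:
 *
 *   int sync = 1;
 *   cuPointerSetAttribute(&sync, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS,
 *                         (CUdeviceptr)buf);
 *
 * For a cuMemCreate/VMM allocation this call fails with
 * CUDA_ERROR_NOT_SUPPORTED (801), which is exactly the error the
 * PSM3_CUDA_SYNC_PTR_RELAXED mode below tolerates.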
+ */ +static void psm3_cuda_mark_buf_synchronous(const void *buf) +{ + bool check_for_not_supported = false; + + switch (psm3_cuda_sync_mode) { + case PSM3_CUDA_SYNC_CTX: +#if PSM3_CUDA_HAVE_CTX_SYNC_MEMOPS + // sync set at the context-level; nothing to do here + return; +#else + // otherwise, intentional fall through to PTR behavior +#endif + case PSM3_CUDA_SYNC_PTR: + // pointer level sync, handling all errors + break; + case PSM3_CUDA_SYNC_PTR_RELAXED: + // pointer level sync, ignoring not supported + check_for_not_supported = true; + break; + case PSM3_CUDA_SYNC_NONE: + return; + } + + CUresult cudaerr; + int true_flag = 1; + + cudaerr = PSM3_CUDA_EXEC(cuPointerSetAttribute, + &true_flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr)buf); + + if_pf (check_for_not_supported && cudaerr == CUDA_ERROR_NOT_SUPPORTED) { +#ifdef PSM_DEBUG + // query the handle just to be sure it is in fact a VMM alloc + CUmemGenericAllocationHandle h; + PSM3_CUDA_CALL(cuMemRetainAllocationHandle, &h, (CUdeviceptr)buf); + PSM3_CUDA_CALL(cuMemRelease, h); +#endif + return; + } + + PSM3_CUDA_CHECK(cuPointerSetAttribute, cudaerr); + return; +} + +static void psm3_cuda_host_alloc(void **ret_ptr, uint32_t size) +{ + PSM3_CUDA_CALL(cuMemHostAlloc, (void **)ret_ptr, size, + CU_MEMHOSTALLOC_PORTABLE); +} + +static void psm3_cuda_host_free(void *ptr) +{ + PSM3_CUDA_CALL(cuMemFreeHost, (void *)ptr); +} + +static int psm3_cuda_gpu_addr_send_mr(struct psm2_mq_req *mqreq) +{ + return mqreq->is_buf_gpu_mem && ! mqreq->gpu_hostbuf_used; +} + +static int psm3_cuda_gpu_addr_recv_mr(struct ips_tid_recv_desc *tidrecvc, + int gpu_hostbuf_used) +{ + return tidrecvc->is_ptr_gpu_backed; +} + +//*************************************************************************** +//cuda support for PSM3_DEVICES "shm", via an IPC handle cache and Cuda IPC +//In platforms with NVLINK between GPUs, Cuda IPC will use NVLINK. + +#define CUDA_MEMHANDLE_CACHE_SIZE 64 + +/* + * rbtree cruft + */ +struct _cl_map_item; + +typedef struct +{ + unsigned long start; /* start virtual address */ + CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ + CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ +}__attribute__ ((aligned (128))) psm3_rbtree_cuda_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} psm3_rbtree_cuda_memhandle_cache_map_pl_t; + +/* + * Custom comparator + */ +typedef psm3_rbtree_cuda_memhandle_cache_mapitem_pl_t psm3_cuda_cache_item; + +static int psm3_cuda_cache_key_cmp(const psm3_cuda_cache_item *a, const psm3_cuda_cache_item *b) +{ + // we use epid as part of cache key so multi-ep and multi-process jobs + // can have a better cache hit rate. In some cases we may end up with + // cache entries for the same buffer with different epid's all within the + // same multi-ep rank, but this does no harm other than to waste some + // cache space. By including epid in key_cmp we have a chance to have + // separate cache entries for the same sbuf address in different + // sender's GPU virtual address space. + switch (psm3_epid_cmp_internal(a->epid, b->epid)) { + case -1: return -1; + case 1: return 1; + default: + break; + } + + // The sender has used cuMemGetAddressRange to normalize the address + // so we can simply compare the start address of the allocation. 
+ // Note cuIpcOpenMemHandle only needs the start address as well, so we + // ignore length + if (a->start < b->start) + return -1; + if (b->start < a->start) + return 1; + + return 0; +} + + +/* + * Necessary rbtree cruft + */ +#define RBTREE_MI_PL psm3_rbtree_cuda_memhandle_cache_mapitem_pl_t +#define RBTREE_MAP_PL psm3_rbtree_cuda_memhandle_cache_map_pl_t +#define RBTREE_CMP(a,b) psm3_cuda_cache_key_cmp((a), (b)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR + +#include "psm3_rbtree.h" +#include "psm3_rbtree.c" + +/* + * Convenience rbtree cruft + */ +#define NELEMS(cache) ((cache)->map.payload.nelems) + +#define IHEAD(cache) ((cache)->map.root) +#define LAST(cache) (IHEAD(cache)->payload.i_prev) +#define FIRST(cache) (IHEAD(cache)->payload.i_next) +#define INEXT(x) ((x)->payload.i_next) +#define IPREV(x) ((x)->payload.i_prev) + +/* + * Actual module data + */ +struct psm3_cuda_memhandle_cache { + cl_qmap_t map; + mpool_t mpool; + uint32_t size; + psm2_mq_stats_t *stats; +}; +typedef struct psm3_cuda_memhandle_cache *psm3_cuda_memhandle_cache_t; + +static void psm3_print_cuda_memhandle_cache_stats(psm2_mq_stats_t *stats) +{ + _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", + stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, + stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, + stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, + stats->gpu_ipc_cache_clear); +} + +/* + * This is the callback function when mempool are resized or destroyed. + * Upon calling cache fini mpool is detroyed which in turn calls this callback + * which helps in closing all memhandles. + */ +static void +psm3_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +{ + cl_map_item_t* memcache_item = (cl_map_item_t*)obj; + if (!is_alloc) { + if(memcache_item->payload.start) + PSM3_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + } +} + +/* + * Creating mempool for cuda memhandle cache nodes. + */ +static psm2_error_t +psm3_cuda_memhandle_mpool_alloc(psm3_cuda_memhandle_cache_t cache, + uint32_t memcache_size) +{ + psm2_error_t err; + if (memcache_size < 1) + return PSM2_PARAM_ERR; + + cache->size = memcache_size; + /* Creating a memory pool of size PSM3_CUDA_MEMCACHE_SIZE + * which includes the Root and NIL items + */ + cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), + cache->size, + cache->size, 0, + UNDEFINED, NULL, NULL, + psm3_cuda_memhandle_cache_alloc_func, + NULL); + if (cache->mpool == NULL) { + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate CUDA host receive buffer pool"); + return err; + } + return PSM2_OK; +} + +/* + * allocate and initialize memhandle cache + * including rbtree. + */ +static psm2_error_t psm3_cuda_memhandle_cache_alloc( + psm3_cuda_memhandle_cache_t *cachep, uint32_t memcache_size, + psm2_mq_stats_t *stats) +{ + cl_map_item_t *root = NULL, *nil_item = NULL; + + *cachep = (psm3_cuda_memhandle_cache_t)psmi_calloc( + NULL, UNDEFINED, 1, sizeof(**cachep)); + if (! 
*cachep) + return PSM2_NO_MEMORY; + + psm2_error_t err = psm3_cuda_memhandle_mpool_alloc(*cachep, memcache_size); + if (err != PSM2_OK) + goto fail; + + root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (root == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (nil_item == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + nil_item->payload.start = 0; + nil_item->payload.epid = psm3_epid_zeroed_internal(); + ips_cl_qmap_init(&(*cachep)->map,root,nil_item); + NELEMS(*cachep) = 0; + + (*cachep)->stats = stats; + + stats->gpu_ipc_cache_limit = memcache_size; + stats->gpu_ipc_cache_nelems = 0; + stats->gpu_ipc_cache_max_nelems = 0; + stats->gpu_ipc_cache_hit = 0; + stats->gpu_ipc_cache_miss = 0; + stats->gpu_ipc_cache_evict = 0; + stats->gpu_ipc_cache_remove = 0; + stats->gpu_ipc_cache_clear = 0; + + return PSM2_OK; + +fail: + if (nil_item) + psmi_free(nil_item); + if (root) + psmi_free(root); + if ((*cachep)->mpool) + psm3_mpool_destroy((*cachep)->mpool); + psmi_free(*cachep); + return err; +} + +static void psm3_cuda_memhandle_cache_free(psm3_cuda_memhandle_cache_t cache) +{ + psm3_print_cuda_memhandle_cache_stats(cache->stats); + + if (cache->map.nil_item) + psmi_free(cache->map.nil_item); + if (cache->map.root) + psmi_free(cache->map.root); + if (cache->mpool) + psm3_mpool_destroy(cache->mpool); + psmi_free(cache); +} + +/* + * Insert at the head of Idleq. + */ +static void +psm3_cuda_idleq_insert(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == NULL) { + FIRST(cache) = memcache_item; + LAST(cache) = memcache_item; + return; + } + INEXT(FIRST(cache)) = memcache_item; + IPREV(memcache_item) = FIRST(cache); + FIRST(cache) = memcache_item; + INEXT(FIRST(cache)) = NULL; + return; +} + +/* + * Remove least recent used element. + */ +static void +psm3_cuda_idleq_remove_last(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (!INEXT(memcache_item)) { + LAST(cache) = NULL; + FIRST(cache) = NULL; + } else { + LAST(cache) = INEXT(memcache_item); + IPREV(LAST(cache)) = NULL; + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_cuda_idleq_remove(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (LAST(cache) == memcache_item) { + psm3_cuda_idleq_remove_last(cache, memcache_item); + } else if (FIRST(cache) == memcache_item) { + FIRST(cache) = IPREV(memcache_item); + INEXT(FIRST(cache)) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_cuda_idleq_reorder(psm3_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { + return; + } + psm3_cuda_idleq_remove(cache, memcache_item); + psm3_cuda_idleq_insert(cache, memcache_item); + return; +} + +/* + * After a successful cache hit, item is validated by doing a + * memcmp on the handle stored and the handle we receive from the + * sender. If the validation fails the item is removed from the idleq, + * the rbtree, is put back into the mpool and cuIpcCloseMemHandle function + * is called. 
+ * Cuda ipcMemHandles for distinct allocations are unique, even if the + * allocation was at the same address. So this check catches stale cache + * entries. + */ +static psm2_error_t +psm3_cuda_memhandle_cache_validate(psm3_cuda_memhandle_cache_t cache, + cl_map_item_t* memcache_item, + uintptr_t sbuf, CUipcMemHandle* handle, + psm2_epid_t epid) +{ + psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); + psmi_assert(sbuf == memcache_item->payload.start); + if (0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, + sizeof(CUipcMemHandle))) { + return PSM2_OK; + } + _HFI_DBG("cache collision: new entry start=%lu\n", sbuf); + + cache->stats->gpu_ipc_cache_remove++; + ips_cl_qmap_remove_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems--; + PSM3_CUDA_CALL(cuIpcCloseMemHandle, + memcache_item->payload.cuda_ipc_dev_ptr); + psm3_cuda_idleq_remove(cache, memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); + psm3_mpool_put(memcache_item); + return PSM2_OK_NO_PROGRESS; +} + +/* + * Current eviction policy: Least Recently Used. + */ +static void +psm3_cuda_memhandle_cache_evict(psm3_cuda_memhandle_cache_t cache) +{ + cache->stats->gpu_ipc_cache_evict++; + cl_map_item_t *p_item = LAST(cache); + _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", + psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, + p_item->payload.cuda_ipc_dev_ptr, p_item); + ips_cl_qmap_remove_item(&cache->map, p_item); + cache->stats->gpu_ipc_cache_nelems--; + PSM3_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); + psm3_cuda_idleq_remove_last(cache, p_item); + memset(p_item, 0, sizeof(*p_item)); + psm3_mpool_put(p_item); +} + +static psm2_error_t +psm3_cuda_memhandle_cache_register(psm3_cuda_memhandle_cache_t cache, + uintptr_t sbuf, CUipcMemHandle* handle, + psm2_epid_t epid, + CUdeviceptr cuda_ipc_dev_ptr) +{ + if (NELEMS(cache) == cache->size) + psm3_cuda_memhandle_cache_evict(cache); + + cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); + /* memcache_item cannot be NULL as we evict + * before the call to mpool_get. Check has + * been fixed to help with klockwork analysis. + */ + if (memcache_item == NULL) + return PSM2_NO_MEMORY; + memcache_item->payload.start = sbuf; + memcache_item->payload.cuda_ipc_handle = *handle; + memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; + memcache_item->payload.epid = epid; + ips_cl_qmap_insert_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems++; + if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) + cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; + psm3_cuda_idleq_insert(cache, memcache_item); + return PSM2_OK; +} + +static void psm3_cuda_memhandle_cache_clear(psm3_cuda_memhandle_cache_t cache) +{ + _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); + while (NELEMS(cache)) { + psm3_cuda_memhandle_cache_evict(cache); + } + cache->stats->gpu_ipc_cache_clear++; + _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); +} + +/* + * The key used to search the cache is the senders buf address pointer and + * epid. The sender will have used cuMemGetAddressRange + * to find the start of the memory containing the buffer (supplied as sbuf). + * Upon match, we must validate the entry we find and may need to replace it. 
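 *
 * In outline, the function below behaves roughly like (simplified sketch,
 * not the exact code):
 *
 *   item = rbtree_search(cache, {sbuf, epid});
 *   if (item && validate(item, handle) == PSM2_OK)   // hit: handle unchanged
 *       return item->cuda_ipc_dev_ptr;               // reuse the open mapping
 *   // miss, or stale entry just closed by validate()
 *   err = cuIpcOpenMemHandle(&dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
 *   if (err == CUDA_ERROR_ALREADY_MAPPED)            // stale mappings linger
 *       clear the whole cache, then open again
 *   register(cache, sbuf, handle, epid, dev_ptr);    // LRU-evict first if full
 *   return dev_ptr;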
+ */ +static CUdeviceptr +psm3_cuda_memhandle_acquire(psm3_cuda_memhandle_cache_t cache, + uintptr_t sbuf, CUipcMemHandle* handle, + psm2_epid_t epid) +{ + _HFI_VDBG("sbuf=%lu,handle=%p,epid=%s\n", + sbuf, handle, psm3_epid_fmt_internal(epid, 0)); + + CUdeviceptr cuda_ipc_dev_ptr; + if(! cache) { + PSM3_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, + *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + return cuda_ipc_dev_ptr; + } + + psm3_cuda_cache_item key = { + .start = (unsigned long) sbuf, + .epid = epid + }; + + /* + * preconditions: + * 1) buffer [start,epid) may or may not be in cachemap already + * 2) there are no duplicate entries in cachemap + * postconditions: + * 1) buffer is in cachemap with same handle, epid + * 2) there are no duplicate entries in cachemap + * + * The key used to search the cache is the senders buf address pointer + * and epid. + * Upon a succesful hit in the cache, additional validation is required + * as the handle could be stale. + */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); + if (p_item->payload.start) { + // confirm the entry for sbuf matches the handle and is not stale + if (psm3_cuda_memhandle_cache_validate(cache, p_item, sbuf, handle, epid) == PSM2_OK) { + cache->stats->gpu_ipc_cache_hit++; + psm3_cuda_idleq_reorder(cache, p_item); + return p_item->payload.cuda_ipc_dev_ptr; + } + + // buffer found was stale psm3_cuda_memhandle_cache_validate() + // closed and removed existing entry. + // Should find no more duplicates +#ifdef PSM_DEBUG + p_item = ips_cl_qmap_searchv(&cache->map, &key); + psmi_assert(! p_item->payload.start); +#endif + } + cache->stats->gpu_ipc_cache_miss++; + + CUresult cudaerr = PSM3_CUDA_CALL_EXCEPT( + CUDA_ERROR_ALREADY_MAPPED, + cuIpcOpenMemHandle, + &cuda_ipc_dev_ptr, + *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + + if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { + // remote memory already mapped. Close all handles, clear cache, + // and try again + psm3_cuda_memhandle_cache_clear(cache); + PSM3_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); + } + + psm3_cuda_memhandle_cache_register(cache, sbuf, handle, + epid, cuda_ipc_dev_ptr); + return cuda_ipc_dev_ptr; +} + +static void +psm3_cuda_memhandle_release(psm3_cuda_memhandle_cache_t cache, + CUdeviceptr cuda_ipc_dev_ptr) +{ + if(! cache) + PSM3_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr); + return; +} +// end of CUDA IPC MemHandle Cache +//*************************************************************************** + + +// RTS and CTS processing functions for PSM3_DEVICES "shm" to pass +// Cuda IPC handles and permit use of NVLINK for intra-node transfers +static psm2_error_t psm3_cuda_shm_init(struct ptl_am *ptl, + psm2_mq_stats_t *stats) +{ + // TBD - should we have generic names for these env variables + // PSM3_GPU_MEMCACHE_ENABLED, PSM3_GPU_MEMCACHE_SIZE? 
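	// Usage sketch (the values are only examples): PSM3_CUDA_MEMCACHE_ENABLED=1
	// with PSM3_CUDA_MEMCACHE_SIZE=128 keeps up to 128 peer IPC mappings open
	// per shm ptl, evicting least-recently-used entries beyond that;
	// PSM3_CUDA_MEMCACHE_ENABLED=0 skips the cache, so every GPU RTS opens and
	// then closes its own mapping.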
+ union psmi_envvar_val env_memcache_enabled; + + psm3_getenv("PSM3_CUDA_MEMCACHE_ENABLED", + "PSM cuda ipc memhandle cache enabled (default is enabled)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &env_memcache_enabled); + if (env_memcache_enabled.e_uint) { + union psmi_envvar_val env_memcache_size; + + psm3_getenv("PSM3_CUDA_MEMCACHE_SIZE", + "Size of the cuda ipc memhandle cache ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)CUDA_MEMHANDLE_CACHE_SIZE, + &env_memcache_size); + return psm3_cuda_memhandle_cache_alloc( + (psm3_cuda_memhandle_cache_t*)&ptl->memhandle_cache, + env_memcache_size.e_uint, stats); + } + return PSM2_OK; +} + +static void psm3_cuda_shm_finalize(struct ptl_am *ptl) +{ + if (ptl->memhandle_cache) + psm3_cuda_memhandle_cache_free((psm3_cuda_memhandle_cache_t)ptl->memhandle_cache); + ptl->memhandle_cache = NULL; + return; +} + +static psm2_error_t psm3_cuda_shm_epaddr_add(struct ptl_am *ptl, + struct am_epaddr *am_epaddr) +{ + // nothing to do + return PSM2_OK; +} + +static void psm3_cuda_shm_epaddr_free(struct am_epaddr *am_epaddr) +{ + // nothing to do +} + +static int psm3_cuda_shm_dev_fds_needed() +{ + // don't need to exchange dev_fds + return 0; +} + +static void psm3_cuda_shm_dev_fds_send(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ + // nothing to do +} + +static psm2_error_t psm3_cuda_shm_dev_fds_connreq_poll(struct ptl_am *ptl, struct am_ptl_connection_req *req) +{ + // nothing to do + return PSM2_OK; +} + +static psm2_error_t psm3_cuda_shm_dev_fds_check_exchanged(struct ptl_am *ptl, struct am_ptl_connection_req *req, int index) +{ + // nothing to do + return PSM2_OK; +} + +static psm2_error_t psm3_cuda_shm_dev_fds_poll(struct ptl_am *ptl, psm2_error_t res) +{ + // nothing to do + return res; +} + +// On Sender, place the IPC handle in the RTS +// We put offset in the basic "args" parameters and the actual +// IPC handle as payload due to it's size +// Callers expect payload_size >0 when using GPU IPC and key off non-zero +// payload size in RTS to identify a GPU IPC RTS +// Save in the req the needed information about IPC resources allocated here +// so psm3_cuda_process_cts and release them. +static psm2_error_t psm3_cuda_shm_build_rts(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info) +{ + CUdeviceptr buf_base_ptr; + void *buf = req->req_data.buf; + PSM3_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when cuIpcGetMemHandle is called */ + req->gpu_specific.cuda_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); + args[2].u32w0 = (uint32_t)req->gpu_specific.cuda_ipc_offset; + + PSM3_CUDA_CALL(cuIpcGetMemHandle, &req->gpu_specific.cuda_ipc_handle, (CUdeviceptr) buf); + *narg_p = 5; + *payload_p = (void*)&req->gpu_specific.cuda_ipc_handle; + *payload_size_p = sizeof(CUipcMemHandle); + req->gpu_specific.cuda_ipc_handle_attached = 1; + return PSM2_OK; +} + +// On receiver, pull IPC information out of the RTS which our peer build using +// psm3_cuda_shm_build_rts. 
Information is saved to the req for subsequent +// processing after tag matching via psm3_cuda_shm_rtsmatch +static void psm3_cuda_shm_process_rts(psm2_mq_req_t req, void *buf, size_t len, + int narg, psm2_amarg_t *args) +{ + req->gpu_specific.cuda_ipc_handle = *((CUipcMemHandle*)buf); + psmi_assert(len == sizeof(CUipcMemHandle)); + req->gpu_specific.cuda_ipc_handle_attached = 1; + req->gpu_specific.cuda_ipc_offset = args[2].u32w0; +} + +// On receiver, use GPU IPC to copy data from the sender to this process +// This is called on the receiver after psm3_cuda_process_rts has parsed the +// incoming RTS and tag matching has matched the RTS with a receive buffer and +// populated the req with information about the matched receiver buffer +static int psm3_cuda_shm_rtsmatch(struct ptl_am *ptl, psm2_mq_req_t req) +{ + if (req->gpu_specific.cuda_ipc_handle_attached) { + CUdeviceptr cuda_ipc_dev_ptr = psm3_cuda_memhandle_acquire( + ptl->memhandle_cache, + req->rts_sbuf - req->gpu_specific.cuda_ipc_offset, + (CUipcMemHandle*)&req->gpu_specific.cuda_ipc_handle, + req->rts_peer->epid); + cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->gpu_specific.cuda_ipc_offset; + /* cuMemcpy into the receive side buffer + * based on its location */ + if (req->is_buf_gpu_mem) { + /*PSM3_GPU_MEMCPY_DTOD*/ + psm3_cuda_memcpy_DtoD(req->req_data.buf, (void*)cuda_ipc_dev_ptr, + req->req_data.recv_msglen); + //PSM3_GPU_SYNCHRONIZE_MEMCPY(); + psm3_cuda_synchronize_memcpy(); + } else { + /*PSM3_GPU_MEMCPY_DTOH*/ + psm3_cuda_memcpy_DtoH(req->req_data.buf, (void*)cuda_ipc_dev_ptr, + req->req_data.recv_msglen); + } + psm3_cuda_memhandle_release(ptl->memhandle_cache, + cuda_ipc_dev_ptr - req->gpu_specific.cuda_ipc_offset); + req->gpu_specific.cuda_ipc_handle_attached = 0; + return 1; + } + return 0; +} + +// On sender, we have now received the CTS corresponding to an RTS +// we may have built in psm3_cuda_build_rts. All we need to do here is release +// the resources we allocated in psm3_cuda_build_rts. We saved the necessary +// information tracking those resources in the send req. 
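// For reference, a simplified sketch of the shm GPU IPC flow these hooks form:
//   sender:   shm_build_rts()   - cuIpcGetMemHandle + offset placed in the RTS
//   receiver: shm_process_rts() - stash handle/offset in the matched req
//             shm_rtsmatch()    - acquire the mapping, cuMemcpyDtoD/DtoH into
//                                 the posted buffer, release the mapping
//   sender:   shm_process_cts() - drop per-req IPC state (nothing held for CUDA)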
+// Returns: +// 0 - the req was not for a GPU IO +// 1 - the req was for a GPU IO and we have released the resources +static int psm3_cuda_shm_process_cts(psm2_mq_req_t req) +{ + if (req->gpu_specific.cuda_ipc_handle_attached) { + // no need to release any Cuda resources + req->gpu_specific.cuda_ipc_handle_attached = 0; + return 1; + } + return 0; +} +// end of RTS and CTS processing functions for PSM3_DEVICES "shm" +//*************************************************************************** + +static psm2_error_t psm3_cuda_get_cuda_permitted(struct psm2_ep *ep, bool *enable) +{ + switch (psm3_cuda_permitted_enforcement) { + case PSM3_CPE_REJECT: + _HFI_DBG("GET(CUDA_PERMITTED) rejected\n"); + return PSM2_PARAM_ERR; + case PSM3_CPE_IGNORE: + case PSM3_CPE_OBEY: + *enable = ep->gpu_specific.cuda_permitted; + return PSM2_OK; + } + + _HFI_ERROR("PSM3_CUDA_PERMITTED_ENFORCEMENT invalid: %u\n", + psm3_cuda_permitted_enforcement); + return PSM2_PARAM_ERR; +} + +static psm2_error_t psm3_cuda_set_cuda_permitted(struct psm2_ep *ep, bool enable) +{ + switch (psm3_cuda_permitted_enforcement) { + case PSM3_CPE_REJECT: + _HFI_DBG("SET(CUDA_PERMITTED) rejected\n"); + return PSM2_PARAM_ERR; + case PSM3_CPE_IGNORE: + case PSM3_CPE_OBEY: + ep->gpu_specific.cuda_permitted = enable; + return PSM2_OK; + } + + _HFI_ERROR("PSM3_CUDA_PERMITTED_ENFORCEMENT invalid: %u\n", + psm3_cuda_permitted_enforcement); + return PSM2_PARAM_ERR; +} + +static bool psm3_cuda_is_memcpy_permitted(struct psm2_ep *ep) +{ + switch (psm3_cuda_permitted_enforcement) { + case PSM3_CPE_REJECT: + // REJECT behaves as though the CUDA_PERMITTED option doesn't exist, + // so behave as per legacy and allow memcpy + return true; + case PSM3_CPE_IGNORE: + // IGNORE behaves as though CUDA_PERMITTED is always true + return true; + case PSM3_CPE_OBEY: + // OBEY requires we honor the config set by the user + return ep->gpu_specific.cuda_permitted; + } + + _HFI_ERROR("PSM3_CUDA_PERMITTED_ENFORCEMENT invalid: %u\n", + psm3_cuda_permitted_enforcement); + return true; +} + +struct psm3_gpu_hal psm3_cuda_hal = { + .type = "cuda", +#ifdef PSM_HAVE_RNDV_MOD + .rv_major_rev_fail = 0, + .rv_minor_rev_fail = 0, + .rv_capability_expected = RV_CAP_NVIDIA_GPU, + .hal_cap_expected = PSM_HAL_CAP_NVIDIA_GPU, +#endif + .ghfp_initialize = psm3_cuda_initialize, + .ghfp_finalize = psm3_cuda_finalize, + .ghfp_ep_open = psm3_cuda_ep_open, + .ghfp_ep_close = psm3_cuda_ep_close, + .ghfp_identify = psm3_cuda_identify, + .ghfp_verify_GPU_capabilities = psm3_cuda_verify_GPU_capabilities, + .ghfp_p2p_supported = psm3_cuda_p2p_supported, + .ghfp_gpudirect_supported = psm3_cuda_gpudirect_supported, + .ghfp_using_rv_for_mrs = psm3_cuda_using_rv_for_mrs, + .ghfp_get_pci_addr = psm3_cuda_get_pci_addr, +#ifdef PSM_HAVE_RNDV_MOD + .ghfp_min_bar_size = psm3_cuda_min_bar_size, + .ghfp_check_phys_addr = psm3_cuda_check_phys_addr, + .ghfp_roundup_gdrcopy = psm3_cuda_roundup_gdrcopy, +#ifdef PSM_HAVE_REG_MR + .ghfp_roundup_rv_reg_mr = psm3_cuda_roundup_rv_reg_mr, + .ghfp_init_rv_reg_mr_params = psm3_cuda_init_rv_reg_mr_params, +#endif + .ghfp_init_rv_pin_mmap_params = psm3_cuda_init_rv_pin_mmap_params, + .ghfp_rv_reg_mmap_cleanup = psm3_cuda_rv_reg_mmap_cleanup, +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + .ghfp_cmp_mr = psm3_cuda_cmp_mr, + .ghfp_init_mr = psm3_cuda_init_mr, +#endif + .ghfp_fetch_ctxt = psm3_cuda_fetch_ctxt, + .ghfp_refresh_ctxt = psm3_cuda_refresh_ctxt, + .ghfp_register_hostmem = psm3_cuda_register_hostmem, + .ghfp_unregister_hostmem = 
psm3_cuda_unregister_hostmem, + .ghfp_is_gpu_mem = psm3_cuda_is_gpu_mem, + .ghfp_prepare_HtoD_memcpys = psm3_cuda_prepare_HtoD_memcpys, + .ghfp_prepare_DtoH_memcpys = psm3_cuda_prepare_DtoH_memcpys, + .ghfp_shutdown_HtoD_memcpys = psm3_cuda_shutdown_HtoD_memcpys, + .ghfp_shutdown_DtoH_memcpys = psm3_cuda_shutdown_DtoH_memcpys, + .ghfp_memcpy_HtoD_start = psm3_cuda_memcpy_HtoD_start, + .ghfp_memcpy_DtoH_start = psm3_cuda_memcpy_DtoH_start, + .ghfp_memcpy_done = psm3_cuda_memcpy_done, + .ghfp_hostbuf_lazy_init = psm3_cuda_hostbuf_lazy_init, + .ghfp_hostbuf_reset = psm3_cuda_hostbuf_reset, + .ghfp_hostbuf_destroy = psm3_cuda_hostbuf_destroy, + .ghfp_memcpy_DtoD = psm3_cuda_memcpy_DtoD, + .ghfp_memcpy_HtoD = psm3_cuda_memcpy_HtoD, + .ghfp_memcpy_DtoH = psm3_cuda_memcpy_DtoH, + .ghfp_memcpy = psm3_cuda_memcpy, + .ghfp_synchronize_memcpy = psm3_cuda_synchronize_memcpy, + .ghfp_mark_buf_synchronous = psm3_cuda_mark_buf_synchronous, + .ghfp_host_alloc = psm3_cuda_host_alloc, + .ghfp_host_free = psm3_cuda_host_free, + .ghfp_gpu_addr_send_mr = psm3_cuda_gpu_addr_send_mr, + .ghfp_gpu_addr_recv_mr = psm3_cuda_gpu_addr_recv_mr, + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + .ghfp_shm_init = psm3_cuda_shm_init, + .ghfp_shm_finalize = psm3_cuda_shm_finalize, + .ghfp_shm_epaddr_add = psm3_cuda_shm_epaddr_add, + .ghfp_shm_epaddr_free = psm3_cuda_shm_epaddr_free, + .ghfp_shm_dev_fds_needed = psm3_cuda_shm_dev_fds_needed, + .ghfp_shm_dev_fds_send = psm3_cuda_shm_dev_fds_send, + .ghfp_shm_dev_fds_connreq_poll = psm3_cuda_shm_dev_fds_connreq_poll, + .ghfp_shm_dev_fds_check_exchanged = psm3_cuda_shm_dev_fds_check_exchanged, + .ghfp_shm_dev_fds_poll = psm3_cuda_shm_dev_fds_poll, + .ghfp_shm_build_rts = psm3_cuda_shm_build_rts, + .ghfp_shm_process_rts = psm3_cuda_shm_process_rts, + .ghfp_shm_rtsmatch = psm3_cuda_shm_rtsmatch, + .ghfp_shm_process_cts = psm3_cuda_shm_process_cts, + .ghfp_get_cuda_permitted = psm3_cuda_get_cuda_permitted, + .ghfp_set_cuda_permitted = psm3_cuda_set_cuda_permitted, + .ghfp_is_memcpy_permitted = psm3_cuda_is_memcpy_permitted, +}; + +#endif /* PSM_CUDA */ diff --git a/prov/psm3/psm3/gpu/psm_gpu_hal.c b/prov/psm3/psm3/gpu/psm_gpu_hal.c new file mode 100644 index 00000000000..e2c24b90cd7 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_hal.c @@ -0,0 +1,422 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. + + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include + +#include "psm_user.h" + +#ifdef PSM_HAVE_GPU + +#ifdef PSM_HAVE_RNDV_MOD +#ifndef RV_CAP_GPU_DIRECT +#error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. Must use GPU enabled rv headers" +#endif +#include "psm2_hal.h" +#endif /* PSM_HAVE_RNDV_MOD */ + +int psm3_my_gpu_device; // up to 10 bits identifying GPU within server + +int psm3_gpu_is_gdr_copy_enabled; +uint32_t psm3_gpu_gdr_copy_limit_send; +uint32_t psm3_gpu_gdr_copy_limit_recv; +int psm3_gpu_is_gpudirect_enabled = 0; +int psm3_gpu_is_driver_gpudirect_enabled = 0; +uint32_t psm3_gpu_gpudirect_send_limit; + +/* All GPU transfers beyond this threshold use + * RNDV protocol. It is mostly a send side knob. + */ +uint32_t psm3_gpu_thresh_rndv; + +uint32_t psm3_gpu_gpudirect_rdma_send_limit; +uint32_t psm3_gpu_gpudirect_rdma_send_limit_default; + +uint32_t psm3_gpu_gpudirect_rdma_recv_limit; +uint32_t psm3_gpu_gpudirect_rdma_recv_limit_default; + +int psm3_gpu_is_driver_gpudirect_enabled; + +// default value for PSM3_GPU_RNDV_NIC_WINDOW +const char *psm3_gpu_rndv_nic_window_default = NULL; + +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem +unsigned psm3_gpu_mq_rndv_shm_gpu_thresh_default; + +uint64_t psm3_gpu_cache_evict; // in bytes + +#ifdef PSM_HAVE_RNDV_MOD +void psm3_gpu_rv_cap_string(char *buf, size_t size, uint64_t capability) +{ + int offset = 0; + buf[0] = '\0'; + offset += snprintf(buf+offset, size-offset, (capability & RV_CAP_NVIDIA_GPU)?" cuda":""); + if (size > offset) { + offset += snprintf(buf+offset, size-offset, (capability & RV_CAP_INTEL_GPU)?" oneapi-ze":""); + } +} + +// Based on the RV capability supported, add to the ptl_ips HAL capability. +// Should only be called within an ptl_ips HAL once it has decided it will +// open rv. 
+void psm3_gpu_rv_set_hal_cap(uint64_t capability) +{ + if (capability & RV_CAP_NVIDIA_GPU & PSM3_GPU_RV_CAPABILITY_EXPECTED) + psmi_hal_add_cap(PSM_HAL_CAP_NVIDIA_GPU); + if (capability & RV_CAP_INTEL_GPU & PSM3_GPU_RV_CAPABILITY_EXPECTED) + psmi_hal_add_cap(PSM_HAL_CAP_INTEL_GPU); + +} + +static void psm3_gpu_roundup_gdrcopy(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p) +{ + *pageaddr_p = (uintptr_t)buf; + *pagelen_p = (uint64_t)size; +} +#endif /* PSM_HAVE_RNDV_MOD */ + +uint32_t psm3_gpu_query_feature_mask(void) +{ + uint32_t res =0; +#ifdef PSM_CUDA + res |= PSM2_INFO_QUERY_FEATURE_CUDA; +#endif +#ifdef PSM_ONEAPI + res |= PSM2_INFO_QUERY_FEATURE_ONEAPI; +#endif + return res; +} + +// noop function for everything in HAL when no GPU selected +static psm2_error_t psm3_gpu_noop(void) +{ + return PSM2_OK; +} + +static int psm3_gpu_true(void) +{ + return 1; +} + +static int psm3_gpu_zero(void) +{ + return 0; +} + +#ifdef PSM_HAVE_RNDV_MOD +static uint64_t psm3_gpu_zero64(void) +{ + return 0; +} +#endif + +struct psm3_gpu_hal psm3_gpu_noop_hal = { + .type = "none", +#ifdef PSM_HAVE_RNDV_MOD + .rv_major_rev_fail = 0, + .rv_minor_rev_fail = 0, + .rv_capability_expected = 0, + .hal_cap_expected = 0, +#endif + .ghfp_initialize = (psm2_error_t (*)(void))psm3_gpu_noop, + .ghfp_finalize = (void (*)(void))psm3_gpu_noop, + .ghfp_ep_open = (void (*)(void))psm3_gpu_noop, + .ghfp_ep_close = (void (*)(void))psm3_gpu_noop, + .ghfp_identify = (void (*)(char *accel_vers, size_t size))psm3_gpu_noop, + .ghfp_verify_GPU_capabilities = (void (*)(void))psm3_gpu_noop, + .ghfp_p2p_supported = (int (*)(void))psm3_gpu_zero, + .ghfp_gpudirect_supported = (int (*)(void))psm3_gpu_zero, + .ghfp_using_rv_for_mrs = (void (*)(void))psm3_gpu_noop, + .ghfp_get_pci_addr = (void (*)(uint32_t *domain_p, uint32_t *bus_p, + uint32_t *dev_p, uint32_t *func_p))psm3_gpu_noop, +#ifdef PSM_HAVE_RNDV_MOD + .ghfp_min_bar_size = (uint64_t (*)(void))psm3_gpu_zero64, + .ghfp_check_phys_addr = (psm2_error_t (*)(uint64_t phys_addr))psm3_gpu_noop, + .ghfp_roundup_gdrcopy = (void (*)(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p))psm3_gpu_roundup_gdrcopy, +#ifdef PSM_HAVE_REG_MR + .ghfp_roundup_rv_reg_mr = (void (*)(struct psm2_ep *ep, + void **addr_, uint64_t *length_p, int access))psm3_gpu_noop, + .ghfp_init_rv_reg_mr_params = (int (*)(void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad))psm3_gpu_zero, +#endif + .ghfp_init_rv_pin_mmap_params = (int (*)(void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad))psm3_gpu_zero, + .ghfp_rv_reg_mmap_cleanup = (void (*)(void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad))psm3_gpu_noop, +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + .ghfp_cmp_mr = (int (*)(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b))psm3_gpu_zero, + + .ghfp_init_mr = (void (*)(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific))psm3_gpu_noop, +#endif + .ghfp_fetch_ctxt = (void (*)(void))psm3_gpu_noop, + .ghfp_refresh_ctxt = (void (*)(void))psm3_gpu_noop, + .ghfp_register_hostmem = (void (*)(void *buf, uint32_t size))psm3_gpu_noop, + .ghfp_unregister_hostmem = (void (*)(void *buf))psm3_gpu_noop, + .ghfp_is_gpu_mem = (int (*)(const 
void *ptr))psm3_gpu_zero, + .ghfp_prepare_HtoD_memcpys = (void (*)(struct ips_protoexp *protoexp))psm3_gpu_noop, + .ghfp_prepare_DtoH_memcpys = (void (*)(struct ips_proto *proto))psm3_gpu_noop, + .ghfp_shutdown_HtoD_memcpys = (void (*)(struct ips_protoexp *protoexp))psm3_gpu_noop, + .ghfp_shutdown_DtoH_memcpys = (void (*)(struct ips_proto *proto))psm3_gpu_noop, + .ghfp_memcpy_HtoD_start = (void (*)(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_DtoH_start = (void (*)(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_done = (int (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_true, + .ghfp_hostbuf_lazy_init = (void (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_noop, + .ghfp_hostbuf_reset = (void (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_noop, + .ghfp_hostbuf_destroy = (void (*)(struct ips_gpu_hostbuf *ghb))psm3_gpu_noop, + .ghfp_memcpy_DtoD = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_HtoD = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy_DtoH = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_memcpy = (void (*)(void *dstptr, const void *srcptr, uint32_t len))psm3_gpu_noop, + .ghfp_synchronize_memcpy = (void (*)(void))psm3_gpu_noop, + .ghfp_mark_buf_synchronous = (void (*)(const void *buf))psm3_gpu_noop, + .ghfp_host_alloc = (void (*)(void **ret_ptr, uint32_t size))psm3_gpu_noop, + .ghfp_host_free = (void (*)(void *ptr))psm3_gpu_noop, + .ghfp_gpu_addr_send_mr = (int (*)(struct psm2_mq_req *mqreq))psm3_gpu_noop, + .ghfp_gpu_addr_recv_mr = (int (*)(struct ips_tid_recv_desc *tidrecvc, + int gpu_hostbuf_used))psm3_gpu_noop, + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + .ghfp_shm_init = (psm2_error_t (*)(struct ptl_am *ptl, + psm2_mq_stats_t *stats))psm3_gpu_noop, + .ghfp_shm_finalize = (void (*)(struct ptl_am *ptl))psm3_gpu_noop, + .ghfp_shm_epaddr_add = (psm2_error_t (*)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr))psm3_gpu_noop, + .ghfp_shm_epaddr_free = (void (*)(struct am_epaddr *am_epaddr))psm3_gpu_noop, + .ghfp_shm_dev_fds_needed = (int (*)(void))psm3_gpu_zero, + .ghfp_shm_dev_fds_send = (void (*)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr))psm3_gpu_noop, + .ghfp_shm_dev_fds_connreq_poll = (psm2_error_t (*)(struct ptl_am *ptl, + struct am_ptl_connection_req *req))psm3_gpu_noop, + .ghfp_shm_dev_fds_check_exchanged = (psm2_error_t (*)(struct ptl_am *ptl, + struct am_ptl_connection_req *req, int index))psm3_gpu_noop, + .ghfp_shm_dev_fds_poll = (psm2_error_t (*)(struct ptl_am *ptl, psm2_error_t res))psm3_gpu_noop, + .ghfp_shm_build_rts = (psm2_error_t (*)(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info_p))psm3_gpu_noop, + .ghfp_shm_process_rts = (void (*)(psm2_mq_req_t req, void *buf, size_t len, + int narg, psm2_amarg_t *args))psm3_gpu_noop, + .ghfp_shm_rtsmatch = (int (*)(struct ptl_am *ptl, psm2_mq_req_t req))psm3_gpu_zero, + .ghfp_shm_process_cts = (int (*)(psm2_mq_req_t sreq))psm3_gpu_zero, + .ghfp_get_cuda_permitted = (psm2_error_t (*)(struct psm2_ep *ep, bool *enable))psm3_gpu_zero, + .ghfp_set_cuda_permitted = (psm2_error_t (*)(struct psm2_ep *ep, bool enable))psm3_gpu_zero, + .ghfp_is_memcpy_permitted = (bool (*)(struct psm2_ep *ep))psm3_gpu_zero, +}; + +struct 
psm3_gpu_hal *psm3_gpu_hal = &psm3_gpu_noop_hal; + +// parse additional options and threshholds for GPU data movement +static void psm3_gpu_env_init(void) +{ + int ret; + + union psmi_envvar_val env_enable_gdr_copy; + psm3_getenv("PSM3_GDRCOPY", + "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env_enable_gdr_copy); + psm3_gpu_is_gdr_copy_enabled = env_enable_gdr_copy.e_int; + + union psmi_envvar_val env_gpu_thresh_rndv; + ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", + "RNDV protocol is used for GPU send message sizes greater than the threshold", + NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)psm3_gpu_thresh_rndv, + (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, + NULL, NULL, &env_gpu_thresh_rndv); + if (ret > 0) { // used default + /* + * For backward compatibility, check if the old variable name is set. + * Priority order: New name > old name > default value. + */ + psm3_getenv("PSM3_CUDA_THRESH_RNDV", + "[Deprecated, use PSM3_GPU_THRESH_RNDV]" + " RNDV protocol is used for GPU send message sizes greater than the threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)psm3_gpu_thresh_rndv, + &env_gpu_thresh_rndv); + } + + psm3_gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; + + + union psmi_envvar_val env_gdr_copy_limit_send; + psm3_getenv("PSM3_GDRCOPY_LIMIT_SEND", + "GDR Copy is turned off on the send side" + " for message sizes greater than the limit" + " or larger than 1 MTU\n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send); + psm3_gpu_gdr_copy_limit_send = env_gdr_copy_limit_send.e_int; + + if (psm3_gpu_gdr_copy_limit_send < 8 || psm3_gpu_gdr_copy_limit_send > psm3_gpu_thresh_rndv) + psm3_gpu_gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, psm3_gpu_thresh_rndv); + + union psmi_envvar_val env_gdr_copy_limit_recv; + psm3_getenv("PSM3_GDRCOPY_LIMIT_RECV", + "GDR Copy is turned off on the recv side" + " for message sizes greater than the limit\n", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)GDR_COPY_LIMIT_RECV, &env_gdr_copy_limit_recv); + psm3_gpu_gdr_copy_limit_recv = env_gdr_copy_limit_recv.e_int; + + if (psm3_gpu_gdr_copy_limit_recv < 8) + psm3_gpu_gdr_copy_limit_recv = GDR_COPY_LIMIT_RECV; + + if (!psm3_gpu_is_gdr_copy_enabled) + psm3_gpu_gdr_copy_limit_send = psm3_gpu_gdr_copy_limit_recv = 0; +} + +psm2_error_t psm3_gpu_initialize(void) +{ +// TBD - what if customer exports CUDA and ONEAPI in a build with both? +// TBD - how to interpret GPU_DIRECT when build has both enabled? 
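// Selection sketch (summarizing the checks below, not additional logic):
//   PSM3_CUDA=1 or PSM3_GPUDIRECT nonzero      -> psm3_cuda_hal (CUDA builds)
//   PSM3_ONEAPI_ZE=1 or PSM3_GPUDIRECT nonzero -> psm3_oneapi_ze_hal (OneAPI builds)
//   neither                                    -> psm3_gpu_noop_hal stays selected
// In a build with both GPU types enabled, the OneAPI check runs second and
// would take precedence, which is part of what the TBDs here are about.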
+// maybe we need to have a HAL function to check if any devices available +#ifdef PSM_CUDA + union psmi_envvar_val env_enable_cuda; + + psm3_getenv("PSM3_CUDA", + "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_enable_cuda); + // order important, always parse gpudirect + if (psmi_parse_gpudirect() || env_enable_cuda.e_int) { + psm2_error_t err; + // establish HAL for Cuda + psm3_gpu_hal = &psm3_cuda_hal; + err = psm3_cuda_hal.ghfp_initialize(); + if (err != PSM2_OK) + return err; + psm3_gpu_env_init(); + } +#else /* PSM_CUDA */ + /* PSM3_CUDA is not allowed for this build, so we check it's + * presence but don't want to use psm3_getenv since we don't + * want it to appear in PSM3_VERBOSE_ENV help text + */ + int enable_cuda = 0; + if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, + INT_MIN, INT_MAX) == -2 + || enable_cuda) { + _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); + } +#endif /* PSM_CUDA */ +#ifdef PSM_ONEAPI + union psmi_envvar_val env_enable_oneapi; + psm3_getenv("PSM3_ONEAPI_ZE", + "Enable (set envvar to 1) for OneAPI Level Zero (ZE) support in PSM (Disabled by default)", + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env_enable_oneapi); + if (psmi_parse_gpudirect() || env_enable_oneapi.e_int) { + psm2_error_t err; + // establish HAL for Cuda + psm3_gpu_hal = &psm3_oneapi_ze_hal; + err = psm3_oneapi_ze_hal.ghfp_initialize(); + if (err != PSM2_OK) + return err; + psm3_gpu_env_init(); + } +#else /* PSM_ONEAPI */ + /* PSM3_ONEAPI_ZE is not allowed for this build, so we check it's + * presence but don't want to use psm3_getenv since we don't + * want it to appear in PSM3_VERBOSE_ENV help text + */ + int enable_oneapi = 0; + if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, + INT_MIN, INT_MAX) == -2 + || enable_oneapi) { + _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); + } +#endif /* PSM_ONEAPI */ + return PSM2_OK; +} + +#else /* PSM_HAVE_GPU */ + +psm2_error_t psm3_gpu_initialize(void) +{ + /* PSM3_GPUDIRECT is not allowed for this build, so we check it's + * presence but don't want to use psm3_getenv since we don't + * want it to appear in PSM3_VERBOSE_ENV help text + * Note we check here, rather than in ips_proto_init, because + * PSM3_GPUDIERECT can enable GPU for ptl_am (shm) as well as ips, + * so if a user attempted a non-GPU build single node run with + * PSM3_GPUDIRECT=1 and expected GPU handling in shm, they would not + * get the behavior they expected + */ + unsigned int gpudirect = 0; + if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, + 0, UINT_MAX) == -2 + || gpudirect) { + _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); + } + return PSM2_OK; // just a warning, non-fatal +} + +#endif /* PSM_HAVE_GPU */ diff --git a/prov/psm3/psm3/gpu/psm_gpu_hal.h b/prov/psm3/psm3/gpu/psm_gpu_hal.h new file mode 100644 index 00000000000..dccf99032d7 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_hal.h @@ -0,0 +1,817 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. */ + +#ifndef _PSMI_IN_USER_H +#error psm_gpu_hal.h not meant to be included directly, include psm_user.h instead +#endif + +#ifndef _PSMI_GPU_HAL_H +#define _PSMI_GPU_HAL_H + +#ifdef PSM_HAVE_GPU + +#ifdef PSM_HAVE_RNDV_MOD +#include + +#if defined(PSM_ONEAPI) +#ifndef RV_IOCTL_CAPABILITY +// TBD we could have configure test this and disable PSM3_HAVE_RNDV_MOD +// or perhaps even disable/fail oneapi in configure +#error "PSM_ONEAPI requires rv_user_ioctls.h 1.3 (w/GPU 1.2) or later" +#endif +#endif + +/* we test *_GPU_DIRECT since those defines + * control the rv module ioctl header file interface + * This establishes the build time RV GPUs which could be supported. + */ +#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) + +#ifndef RV_CAP_GPU_DIRECT +#error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. 
Must use GPU enabled rv headers" +#endif + +#ifdef INTEL_GPU_DIRECT +#define PSM3_RV_GPU_TYPES_INTEL " oneapi-ze" +#else +#define PSM3_RV_GPU_TYPES_INTEL +#endif +#ifdef NVIDIA_GPU_DIRECT +#define PSM3_RV_GPU_TYPES_NVIDIA " cuda" +#else +#define PSM3_RV_GPU_TYPES_NVIDIA +#endif + +#define PSM3_RV_GPU_TYPES PSM3_RV_GPU_TYPES_INTEL PSM3_RV_GPU_TYPES_NVIDIA + +#define PSM3_GPU_FMT_RV_GPU_VER " gpu v%u.%u" PSM3_RV_GPU_TYPES +#define PSM3_GPU_OUT_RV_GPU_VER \ + , psm3_rv_get_gpu_user_major_bldtime_version() \ + , psm3_rv_get_gpu_user_minor_bldtime_version() +#else +#define PSM3_GPU_FMT_RV_GPU_VER +#define PSM3_GPU_OUT_RV_GPU_VER +#endif + +#endif /* PSM_HAVE_RNDV_MOD */ + + +#ifdef PSM_ONEAPI +#include +#include + +#define MAX_ZE_DEVICES 8 +#define PSM3_GPU_TYPES_ONEAPI " oneapi-ze" +#else +#define PSM3_GPU_TYPES_ONEAPI +#endif + +#ifdef PSM_CUDA +#include +#include +//#include +#define PSM3_GPU_TYPES_CUDA " cuda" +#else +#define PSM3_GPU_TYPES_CUDA +#endif + +// build time PSM3 GPU types included +#define PSM3_GPU_TYPES PSM3_GPU_TYPES_ONEAPI PSM3_GPU_TYPES_CUDA + +// GPU specific fields within psm2_ep_t +union psm2_ep_gpu_specific { +#ifdef PSM_CUDA + struct { + bool cuda_permitted; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within psm2_mq_req for use during PSM3 shm IPC +union psm2_mq_req_gpu_specific { +#ifdef PSM_ONEAPI + struct { + union { + ze_ipc_mem_handle_t ze_ipc_handle; // for sender req + uint32_t ze_handle; // receiver req pidfd or gem_handle + }; + uint8_t ze_handle_attached; + uint8_t ze_alloc_type; + uint32_t ze_ipc_offset; +#ifndef PSM_HAVE_PIDFD + uint32_t ze_device_index; +#endif + uint64_t ze_alloc_id; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUipcMemHandle cuda_ipc_handle; + uint8_t cuda_ipc_handle_attached; + uint32_t cuda_ipc_offset; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ips_gpu_hostbuf.gpu_specific +// for use during PSM3 GPU Direct copy pipeline +union gpu_hostbuf_gpu_specific { +#ifdef PSM_ONEAPI + struct { + ze_event_pool_handle_t ze_event_pool; + ze_command_list_handle_t ze_command_lists[MAX_ZE_DEVICES]; + ze_event_handle_t ze_copy_status; + int ze_cur_dev_inx; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUevent cuda_copy_status; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ptl_am.gpu_specific +// for use during PSM3 shm IPC +union ptl_am_gpu_specific { +#ifdef PSM_ONEAPI + struct { +#ifndef PSM_HAVE_PIDFD + char *ze_listen_sockname; // /dev/shm filename for ze_ipc_socket + int ze_ipc_socket; // AF_UNIX listener sock to recv GPU Dev FDs + int ze_need_dev_fds_poll; // are there outstanding dev_fds to be polled +#endif + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within am_epaddr.gpu_specific +// for use during PSM3 shm IPC +union am_epaddr_gpu_specific { +#ifdef PSM_ONEAPI + struct { +#ifdef PSM_HAVE_PIDFD + int ze_pidfd; + int ze_pad; // align to 64 bits +#else + int ze_num_peer_fds; + int ze_peer_fds[MAX_ZE_DEVICES]; + int ze_sock_connected_state; + /* ze_sock_connected_state state definitions */ +#define ZE_SOCK_NOT_CONNECTED 0 +#define ZE_SOCK_DEV_FDS_SENT 1 +#define ZE_SOCK_DEV_FDS_SENT_AND_RECD 2 + int ze_sock; + int ze_pad; // align to 64 bits +#endif + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields for use as RTS payload +// during PSM3 shm IPC +union am_gpu_rts_payload { +#ifdef PSM_ONEAPI 
+ struct am_oneapi_ze_rts_payload { + uint32_t ze_handle; /* GEM handle or file descriptor */ + uint8_t ze_alloc_type; /* allocation type */ + } ze; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ips_protoexp.gpu_specific +// for use during PSM3 rendezvous RDMA +union ips_protoexp_gpu_specific { +#ifdef PSM_ONEAPI + struct { + /* Will not be usd if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t ze_cq_recvs[MAX_ZE_DEVICES]; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUstream cudastream_recv; + }; +#endif /* PSM_CUDA */ +}; + +// GPU specific fields within ips_proto.gpu_specific +// for use during PSM3 rendezvous RDMA +union ips_proto_gpu_specific { +#ifdef PSM_ONEAPI + struct { + /* Will not be usd if psm3_oneapi_immed_async_copy */ + ze_command_queue_handle_t ze_cq_sends[MAX_ZE_DEVICES]; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + CUstream cudastream_send; + }; +#endif /* PSM_CUDA */ +}; + +#ifdef PSM_HAVE_REG_MR +// GPU specific fields within psm3_verbs_mr +union psm3_verbs_mr_gpu_specific { +#ifdef PSM_ONEAPI + struct { + uint64_t ze_alloc_id; + uint64_t ze_base_addr; + }; +#define PSM3_GPU_MRC_FMT " id %"PRIu64" base 0x%"PRIx64 +#define PSM3_GPU_OUT_MRC(gpu_specific) ,(gpu_specific)->ze_alloc_id, (gpu_specific)->ze_base_addr +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#define PSM3_GPU_MRC_FMT "" +#define PSM3_GPU_OUT_MRC(gpu_specific) +#endif /* PSM_CUDA */ +}; +#endif /* PSM_HAVE_REG_MR */ + +#ifdef PSM_HAVE_RNDV_MOD +// scratch pad to save information needed in PSM3_GPU_RV_REG_MMAP_CLEANUP +// This holds transient information which is allocated during +// PSM3_GPU_INIT_RV_REG_MR_PARAMS and PSM3_GPU_INIT_RV_PIN_AND MMAP_PARAMS +// and then released via PSM3_GPU_RV_REG_MMAP_CLEANUP immediately +// after successful or failed RV registration or mmap +union psm3_gpu_rv_reg_mmap_mem_scratchpad { +#ifdef PSM_ONEAPI + struct { + ze_ipc_mem_handle_t ze_ipc_handle; + uint64_t ze_handle_fd; + }; +#endif /* PSM_ONEAPI */ +#ifdef PSM_CUDA + struct { + // nothing needed + }; +#endif /* PSM_CUDA */ +}; +#endif /* PSM_HAVE_RNDV_MOD */ + +struct psm2_ep; +struct ips_proto; +struct ips_protoexp; +struct ips_gpu_hostbuf; +struct ips_tid_recv_desc; +struct psm2_mq_req; +struct ptl_am;; +struct am_epaddr;; +struct am_ptl_connection_req; + + +extern int psm3_my_gpu_device; // up to 10 bits identifying GPU within server + +extern int psm3_gpu_is_gdr_copy_enabled; +/* This limit dictates when the sender turns off + * GDR Copy and uses SDMA. The limit needs to be less than equal + * GPU RNDV threshold (psm3_gpu_thresh_rndv) + * set to 0 if GDR Copy disabled + */ +extern uint32_t psm3_gpu_gdr_copy_limit_send; +/* This limit dictates when the reciever turns off + * GDR Copy. The limit needs to be less than equal + * GPU RNDV threshold (psm3_gpu_thresh_rndv) + * set to 0 if GDR Copy disabled + */ +extern uint32_t psm3_gpu_gdr_copy_limit_recv; +extern int psm3_gpu_is_gpudirect_enabled; // only for use during parsing of other params +extern int psm3_gpu_is_driver_gpudirect_enabled; // only for use during parsing of other params + +/* All GPU transfers beyond this threshold use + * RNDV protocol. It is mostly a send side knob. 
+ */ +extern uint32_t psm3_gpu_thresh_rndv; + +extern uint32_t psm3_gpu_gpudirect_rdma_send_limit; +extern uint32_t psm3_gpu_gpudirect_rdma_send_limit_default; + +extern uint32_t psm3_gpu_gpudirect_rdma_recv_limit; +extern uint32_t psm3_gpu_gpudirect_rdma_recv_limit_default; + +// default value for PSM3_GPU_RNDV_NIC_WINDOW +extern const char *psm3_gpu_rndv_nic_window_default; + +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem) +extern unsigned psm3_gpu_mq_rndv_shm_gpu_thresh_default; + +extern uint64_t psm3_gpu_cache_evict; + +extern struct psm3_gpu_hal { + const char *type; +#ifdef PSM_HAVE_RNDV_MOD + uint16_t rv_major_rev_fail; + uint16_t rv_minor_rev_fail; + uint64_t rv_capability_expected; + uint32_t hal_cap_expected; +#endif + psm2_error_t (*ghfp_initialize)(void); + void (*ghfp_finalize)(void); + void (*ghfp_ep_open)(void); + void (*ghfp_ep_close)(void); + void (*ghfp_identify)(char *accel_vers, size_t size); + void (*ghfp_verify_GPU_capabilities)(void); + int (*ghfp_p2p_supported)(void); + int (*ghfp_gpudirect_supported)(void); + void (*ghfp_using_rv_for_mrs)(void); + void (*ghfp_get_pci_addr)(uint32_t *domain, uint32_t *bus, + uint32_t *dev, uint32_t *func); +#ifdef PSM_HAVE_RNDV_MOD + uint64_t (*ghfp_min_bar_size)(void); + psm2_error_t (*ghfp_check_phys_addr)(uint64_t phys_addr); + void (*ghfp_roundup_gdrcopy)(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p); +#ifdef PSM_HAVE_REG_MR + void (*ghfp_roundup_rv_reg_mr)(struct psm2_ep *ep, + void **addr_p, uint64_t *length_p, int access); + int (*ghfp_init_rv_reg_mr_params)(void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad); +#endif + int (*ghfp_init_rv_pin_mmap_params)(void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad); + void (*ghfp_rv_reg_mmap_cleanup)(void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad); +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + int (*ghfp_cmp_mr)(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b); + void (*ghfp_init_mr)(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific); +#endif + void (*ghfp_fetch_ctxt)(void); + void (*ghfp_refresh_ctxt)(void); + void (*ghfp_register_hostmem)(void *buf, uint32_t size); + void (*ghfp_unregister_hostmem)(void *buf); + int (*ghfp_is_gpu_mem)(const void *ptr); + void (*ghfp_prepare_HtoD_memcpys)(struct ips_protoexp *protoexp); + void (*ghfp_prepare_DtoH_memcpys)(struct ips_proto *proto); + void (*ghfp_shutdown_HtoD_memcpys)(struct ips_protoexp *protoexp); + void (*ghfp_shutdown_DtoH_memcpys)(struct ips_proto *proto); + void (*ghfp_memcpy_HtoD_start)(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len); + void (*ghfp_memcpy_DtoH_start)(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len); + int (*ghfp_memcpy_done)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_hostbuf_lazy_init)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_hostbuf_reset)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_hostbuf_destroy)(struct ips_gpu_hostbuf *ghb); + void (*ghfp_memcpy_DtoD)(void *dstptr, const void *srcptr, uint32_t len); + void (*ghfp_memcpy_HtoD)(void *dstptr, const void *srcptr, uint32_t len); + void 
(*ghfp_memcpy_DtoH)(void *dstptr, const void *srcptr, uint32_t len); + void (*ghfp_memcpy)(void *dstptr, const void *srcptr, uint32_t len); + void (*ghfp_synchronize_memcpy)(void); + void (*ghfp_mark_buf_synchronous)(const void *buf); +// TBD should it be unsigned size instead? + void (*ghfp_host_alloc)(void **ret_ptr, uint32_t size); + void (*ghfp_host_free)(void *ptr); + // should the send buffer be treated as GPU memory + int (*ghfp_gpu_addr_send_mr)(struct psm2_mq_req *mqreq); + // should the recv buffer be treated as GPU memory + int (*ghfp_gpu_addr_recv_mr)(struct ips_tid_recv_desc *tidrecvc, + int gpu_hostbuf_used); + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + psm2_error_t (*ghfp_shm_init)(struct ptl_am *ptl, + psm2_mq_stats_t *stats); + void (*ghfp_shm_finalize)(struct ptl_am *ptl); + psm2_error_t (*ghfp_shm_epaddr_add)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr); + void (*ghfp_shm_epaddr_free)(struct am_epaddr *am_epaddr); + int (*ghfp_shm_dev_fds_needed)(void); + void (*ghfp_shm_dev_fds_send)(struct ptl_am *ptl, + struct am_epaddr *am_epaddr); + psm2_error_t (*ghfp_shm_dev_fds_connreq_poll)(struct ptl_am *ptl, + struct am_ptl_connection_req *req); + psm2_error_t (*ghfp_shm_dev_fds_check_exchanged)(struct ptl_am *ptl, + struct am_ptl_connection_req *req, int index); + psm2_error_t (*ghfp_shm_dev_fds_poll)(struct ptl_am *ptl, psm2_error_t res); + psm2_error_t (*ghfp_shm_build_rts)(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info_p); + void (*ghfp_shm_process_rts)(psm2_mq_req_t req, void *buf, size_t len, + int narg, psm2_amarg_t *args); + int (*ghfp_shm_rtsmatch)(struct ptl_am *ptl, psm2_mq_req_t req); + int (*ghfp_shm_process_cts)(psm2_mq_req_t sreq); + psm2_error_t (*ghfp_get_cuda_permitted)(struct psm2_ep *ep, bool *enable); + psm2_error_t (*ghfp_set_cuda_permitted)(struct psm2_ep *ep, bool enable); + bool (*ghfp_is_memcpy_permitted)(struct psm2_ep *ep); +} *psm3_gpu_hal; + +extern struct psm3_gpu_hal psm3_gpu_noop_hal; + +#ifdef PSM_CUDA +extern struct psm3_gpu_hal psm3_cuda_hal; +#endif + +#ifdef PSM_ONEAPI +extern struct psm3_gpu_hal psm3_oneapi_ze_hal; +#endif + +#ifdef PSM_HAVE_RNDV_MOD +extern void psm3_gpu_rv_cap_string(char *buf, size_t size, uint64_t capability); +extern void psm3_gpu_rv_set_hal_cap(uint64_t capability); +#endif + +extern uint32_t psm3_gpu_query_feature_mask(void); +extern psm2_error_t psm3_gpu_initialize(void); + +#define PSM3_GPU_TYPE (psm3_gpu_hal->type) + +#define PSM3_GPU_IS_ENABLED (psm3_gpu_hal != &psm3_gpu_noop_hal) + +#define PSM3_GPU_IS_GDR_COPY_ENABLED (psm3_gpu_is_gdr_copy_enabled) +#define PSM3_GPU_IS_DRIVER_GPUDIRECT_ENABLED (psm3_gpu_is_driver_gpudirect_enabled) + +// Only valid if called for a GPU buffer +#define PSMI_USE_GDR_COPY_RECV(len) \ + ((len) >=1 && (len) <= psm3_gpu_gdr_copy_limit_recv) + +#ifdef PSM_HAVE_RNDV_MOD +// RV GPU API version <= this unacceptable +#define PSM3_GPU_RV_MAJOR_REV_FAIL \ + (psm3_gpu_hal->rv_major_rev_fail) +#define PSM3_GPU_RV_MINOR_REV_FAIL \ + (psm3_gpu_hal->rv_minor_rev_fail) + +// capability bit corresponding to the GPU type which was selected by +// PSM3_GPU_INITIALIZE +#define PSM3_GPU_RV_CAPABILITY_EXPECTED \ + (psm3_gpu_hal->rv_capability_expected) +// ptl_ips HAL capability bit corresponding to the GPU type which was selected +// by PSM3_GPU_INITIALIZE +#define PSM3_GPU_HAL_CAP_EXPECTED \ + 
(psm3_gpu_hal->hal_cap_expected) + +// not a HAL function table call, +// return a string representing the GPU(s) +// supported by the given RV reported runtime capability mask +#define PSM3_GPU_RV_CAP_STRING(buf, size, capability) \ + psm3_gpu_rv_cap_string(buf, size, capability) + +// not a HAL function table call, +// Based on the RV capability supported, add to the ptl_ips HAL capability. +// Should only be called within an ptl_ips HAL once it has decided it will +// open rv. +#define PSM3_GPU_RV_SET_HAL_CAP(capability) \ + psm3_gpu_rv_set_hal_cap(capability) +#endif /* PSM_HAVE_RNDV_MOD */ + +// not a HAL function table call, +// indicates features available in the build of PSM3 +#define PSM3_GPU_QUERY_FEATURE_MASK() \ + psm3_gpu_query_feature_mask() + +// Initialization is unique, we will check which HALs are available +// and selected and setup psm3_gpu_hal and then initialize the +// selected HAL +#define PSM3_GPU_INITIALIZE() psm3_gpu_initialize() + +// These are all front ends to the GPU HAL function table +#define PSM3_GPU_FINALIZE() \ + (psm3_gpu_hal->ghfp_finalize)() +#define PSM3_GPU_EP_OPEN() \ + (psm3_gpu_hal->ghfp_ep_open)() +#define PSM3_GPU_EP_CLOSE() \ + (psm3_gpu_hal->ghfp_ep_close)() + +#define PSM3_GPU_IDENTIFY(accel_vers, size) \ + (psm3_gpu_hal->ghfp_identify)(accel_vers, size) +#define PSM3_GPU_VERIFY_CAPABILITIES() \ + (psm3_gpu_hal->ghfp_verify_GPU_capabilities)() +#define PSM3_GPU_P2P_SUPPORTED() \ + (psm3_gpu_hal->ghfp_p2p_supported)() +#define PSM3_GPU_GPUDIRECT_SUPPORTED() \ + (psm3_gpu_hal->ghfp_gpudirect_supported)() +#define PSM3_GPU_USING_RV_FOR_MRS() \ + (psm3_gpu_hal->ghfp_using_rv_for_mrs)() +#define PSM3_GPU_GET_PCI_ADDR(domain_p, bus_p, dev_p, func_p) \ + (psm3_gpu_hal->ghfp_get_pci_addr)(domain_p, bus_p, dev_p, func_p) +#ifdef PSM_HAVE_RNDV_MOD +#define PSM3_GPU_MIN_BAR_SIZE(void) \ + (psm3_gpu_hal->ghfp_min_bar_size)() +#define PSM3_GPU_CHECK_PHYS_ADDR(phys_addr) \ + (psm3_gpu_hal->ghfp_check_phys_addr)(phys_addr) +#define PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, pageaddr_p, pagelen_p) \ + (psm3_gpu_hal->ghfp_roundup_gdrcopy)(buf, size, pageaddr_p, pagelen_p) +#ifdef PSM_HAVE_REG_MR +#define PSM3_GPU_ROUNDUP_RV_REG_MR(ep, addr_p, length_p, access) \ + (psm3_gpu_hal->ghfp_roundup_rv_reg_mr)(ep, addr_p, length_p, access) +#endif +#define PSM3_GPU_INIT_RV_REG_MR_PARAMS(addr, length, access, mparams, \ + gpu_specific, scratchpad) \ + (psm3_gpu_hal->ghfp_init_rv_reg_mr_params)(addr, length, access, \ + mparams, gpu_specific, scratchpad) +#define PSM3_GPU_INIT_RV_PIN_MMAP_PARAMS(addr, length, access, params, \ + scratchpad) \ + (psm3_gpu_hal->ghfp_init_rv_pin_mmap_params)(addr, length, access, \ + params, scratchpad) +#define PSM3_GPU_RV_REG_MMAP_CLEANUP(addr, length, access, scratchpad)\ + (psm3_gpu_hal->ghfp_rv_reg_mmap_cleanup)(addr, length, access, \ + scratchpad) +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR +#define PSM3_GPU_CMP_MR(a, b) \ + (psm3_gpu_hal->ghfp_cmp_mr)(a, b) +#define PSM3_GPU_INIT_MR(addr, length, access, gpu_specific) \ + (psm3_gpu_hal->ghfp_init_mr)(addr, length, access, gpu_specific) +#endif /* PSM_HAVE_RNDV_MOD */ +// if GPU HAL needs it, fetch current context of process and save internal to +// GPU HAL for use in later calls. Used by rcvthread at thread start +// to ensure GPU APIs have a context if needed +#define PSM3_GPU_FETCH_CTXT(void) \ + (psm3_gpu_hal->ghfp_fetch_ctxt)(void) +// if GPU HAL needs it, refresh current context of process based on copy +// internal to HAL. 
Used by rcvthread at thread interrupt callback +// to ensure GPU APIs have a context if needed +#define PSM3_GPU_REFRESH_CTXT(void) \ + (psm3_gpu_hal->ghfp_refresh_ctxt)(void) +// These calls permit the GPU specific code to preregister host memory +// which was malloc()'ed. This can speed up GPU memcpy for some GPUs +#define PSM3_GPU_REGISTER_HOSTMEM(buf, size) \ + (psm3_gpu_hal->ghfp_register_hostmem)(buf, size) +#define PSM3_GPU_UNREGISTER_HOSTMEM(buf) \ + (psm3_gpu_hal->ghfp_unregister_hostmem)(buf) +// TBD - this is called alot, but seems we need to use function ptr +// instead of macro, TBD if will affect latency, cost is probably +// in function called, not in actual call/ret overhead +#define PSM3_IS_GPU_MEM(ptr) \ + (psm3_gpu_hal->ghfp_is_gpu_mem)(ptr) +#define PSM3_IS_BUFFER_GPU_MEM(buf, len) ((len) && PSM3_IS_GPU_MEM(buf)) +#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ + (psm3_gpu_hal->ghfp_prepare_HtoD_memcpys)(protoexp) +#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ + (psm3_gpu_hal->ghfp_prepare_DtoH_memcpys)(proto) +#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ + (psm3_gpu_hal->ghfp_shutdown_HtoD_memcpys)(protoexp) +#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ + (psm3_gpu_hal->ghfp_shutdown_DtoH_memcpys)(proto) +#define PSM3_GPU_MEMCPY_HTOD_START(proto, ghb, len) \ + (psm3_gpu_hal->ghfp_memcpy_HtoD_start)(proto, ghb, len) +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ + (psm3_gpu_hal->ghfp_memcpy_DtoH_start)(proto, ghb, len) +#define PSM3_GPU_MEMCPY_DONE(ghb) \ + (psm3_gpu_hal->ghfp_memcpy_done)(ghb) +#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ + (psm3_gpu_hal->ghfp_hostbuf_lazy_init)(ghb) +#define PSM3_GPU_HOSTBUF_RESET(ghb) \ + (psm3_gpu_hal->ghfp_hostbuf_reset)(ghb) +#define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ + (psm3_gpu_hal->ghfp_hostbuf_destroy)(ghb) +#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy_DtoD)(dstptr, srcptr, len) +#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy_HtoD)(dstptr, srcptr, len) +#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy_DtoH)(dstptr, srcptr, len) +#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) \ + (psm3_gpu_hal->ghfp_memcpy)(dstptr, srcptr, len) +#define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ + (psm3_gpu_hal->ghfp_synchronize_memcpy)() +#define PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf) \ + (psm3_gpu_hal->ghfp_mark_buf_synchronous)(buf) +#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ + (psm3_gpu_hal->ghfp_host_alloc)(ret_ptr, size) +#define PSM3_GPU_HOST_FREE(ptr) \ + (psm3_gpu_hal->ghfp_host_free)(ptr) +#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ + (psm3_gpu_hal->ghfp_gpu_addr_send_mr)(mqreq) +#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, gpu_hostbuf_used) \ + (psm3_gpu_hal->ghfp_gpu_addr_recv_mr)(tidrecvc, gpu_hostbuf_used) +// functions for PSM3_DEVICES "shm" RTS/CTS processing to enable +// use of GPU specific scale-up transfers within the given server +#define PSM3_GPU_SHM_INIT(ptl, stats) \ + (psm3_gpu_hal->ghfp_shm_init)(ptl, stats) +#define PSM3_GPU_SHM_FINALIZE(ptl) \ + (psm3_gpu_hal->ghfp_shm_finalize)(ptl) +#define PSM3_GPU_SHM_EPADDR_ADD(ptl, am_epaddr) \ + (psm3_gpu_hal->ghfp_shm_epaddr_add)(ptl, am_epaddr) +#define PSM3_GPU_SHM_EPADDR_FREE(am_epaddr) \ + (psm3_gpu_hal->ghfp_shm_epaddr_free)(am_epaddr) +#define PSM3_GPU_SHM_DEV_FDS_NEEDED() \ + (psm3_gpu_hal->ghfp_shm_dev_fds_needed)() +#define PSM3_GPU_SHM_DEV_FDS_SEND(ptl, am_epaddr) \ + (psm3_gpu_hal->ghfp_shm_dev_fds_send)(ptl, am_epaddr) +#define PSM3_GPU_SHM_DEV_FDS_CONNREQ_POLL(ptl, req) \ + 
(psm3_gpu_hal->ghfp_shm_dev_fds_connreq_poll)(ptl, req)
+#define PSM3_GPU_SHM_DEV_FDS_CHECK_EXCHANGED(ptl, req, index) \
+	(psm3_gpu_hal->ghfp_shm_dev_fds_check_exchanged)(ptl, req, index)
+#define PSM3_GPU_SHM_DEV_FDS_POLL(ptl, res) \
+	(psm3_gpu_hal->ghfp_shm_dev_fds_poll)(ptl, res)
+#define PSM3_GPU_SHM_BUILD_RTS(ptl, req, narg_p, args, payload_p, payload_size_p, info_p) \
+	(psm3_gpu_hal->ghfp_shm_build_rts)(ptl, req, narg_p, args, payload_p, \
+			payload_size_p, info_p)
+#define PSM3_GPU_SHM_PROCESS_RTS(req, buf, len, narg, args) \
+	(psm3_gpu_hal->ghfp_shm_process_rts)(req, buf, len, narg, args)
+#define PSM3_GPU_SHM_RTSMATCH(ptl, req) \
+	(psm3_gpu_hal->ghfp_shm_rtsmatch)(ptl, req)
+#define PSM3_GPU_SHM_PROCESS_CTS(sreq) \
+	(psm3_gpu_hal->ghfp_shm_process_cts)(sreq)
+#define PSM3_GPU_GET_CUDA_PERMITTED(ep, enable) \
+	(psm3_gpu_hal->ghfp_get_cuda_permitted)(ep, enable)
+#define PSM3_GPU_SET_CUDA_PERMITTED(ep, enable) \
+	(psm3_gpu_hal->ghfp_set_cuda_permitted)(ep, enable)
+#define PSM3_GPU_IS_MEMCPY_PERMITTED(ep) \
+	(psm3_gpu_hal->ghfp_is_memcpy_permitted)(ep)
+
+#else /* PSM_HAVE_GPU */
+// GPU omitted from build
+
+#define PSM3_GPU_FMT_RV_GPU_VER
+#define PSM3_GPU_OUT_RV_GPU_VER
+
+#define PSM3_GPU_TYPES
+
+#define PSM3_GPU_IS_ENABLED (0)
+
+#define PSM3_GPU_IS_GDR_COPY_ENABLED (0)
+#define PSM3_GPU_IS_DRIVER_GPUDIRECT_ENABLED (0)
+
+#ifdef PSM_HAVE_RNDV_MOD
+#define PSM3_GPU_RV_MAJOR_REV_FAIL (0)
+#define PSM3_GPU_RV_MINOR_REV_FAIL (0)
+
+#define PSM3_GPU_RV_CAPABILITY_EXPECTED (0)
+#define PSM3_GPU_HAL_CAP_EXPECTED (0)
+
+// we output " gpu unknown" since this being called means RV supports a GPU
+// but PSM3 build does not
+#define PSM3_GPU_RV_CAP_STRING(buf, size, capability) \
+	(void)snprintf(buf, size, " gpu unknown");
+
+#define PSM3_GPU_RV_SET_HAL_CAP(capability) do { } while (0)
+#endif /* PSM_HAVE_RNDV_MOD */
+
+// this is unique, indicates features available in the build of PSM3
+#define PSM3_GPU_QUERY_FEATURE_MASK() (0)
+
+#define PSM3_GPU_TYPE "none"
+
+// Initialization is unique, we will check for GPU related parameters
+// and warn the user
+#define PSM3_GPU_INITIALIZE() (PSM2_OK)
+
+// These are all front ends to the GPU HAL function table
+// GPU omitted from build, so all HAL functions are no-ops
+// this avoids need for callers to check if GPU enabled and reduces clutter
+
+#define PSM3_GPU_FINALIZE() do { } while (0)
+#define PSM3_GPU_EP_OPEN() do { } while (0)
+#define PSM3_GPU_EP_CLOSE() do { } while (0)
+
+#define PSM3_GPU_IDENTIFY(accel_vers, size) \
+	do { accel_vers[0] = '\0'; } while (0)
+#define PSM3_GPU_VERIFY_CAPABILITIES() do { } while (0)
+#define PSM3_GPU_P2P_SUPPORTED() (0)
+#define PSM3_GPU_GPUDIRECT_SUPPORTED() (0)
+#define PSM3_GPU_USING_RV_FOR_MRS() do { } while (0)
+#define PSM3_GPU_IS_DRIVER_GPUDIRECT_DISABLED() (1)
+#define PSM3_GPU_GET_PCI_ADDR(domain_p, bus_p, dev_p, func_p) \
+	do { *domain_p = 0; *bus_p = 0; *dev_p = 0; *func_p = 0; } while (0)
+#ifdef PSM_HAVE_RNDV_MOD
+#define PSM3_GPU_MIN_BAR_SIZE(void) (0)
+#define PSM3_GPU_CHECK_PHYS_ADDR(phys_addr) (PSM2_OK)
+#define PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, pageaddr_p, pagelen_p) \
+	do { *pageaddr_p = (uintptr_t)buf; *pagelen_p = (uint64_t)size; } while (0)
+#ifdef PSM_HAVE_REG_MR
+#define PSM3_GPU_ROUNDUP_RV_REG_MR(ep, addr_p, length_p, access) do { } while (0)
+#endif
+#define PSM3_GPU_INIT_RV_REG_MR_PARAMS(addr, length, access, mparams, \
+		gpu_specific, scratchpad) (0)
+#define PSM3_GPU_INIT_RV_PIN_MMAP_PARAMS(addr, length, access, params, \
+		scratchpad) (0)
+#define 
PSM3_GPU_RV_REG_MMAP_CLEANUP(addr, length, access, scratchpad)\ + do { } while (0) +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR +#define PSM3_GPU_CMP_MR(a, b) (0) +#define PSM3_GPU_INIT_MR(addr, length, access, gpu_specific) \ + do { } while (0) +#endif +#define PSM3_GPU_FETCH_CTXT() do { } while (0) +#define PSM3_GPU_REFRESH_CTXT() do { } while (0) +#define PSM3_GPU_REGISTER_HOSTMEM(buf, size) do { } while (0) +#define PSM3_GPU_UNREGISTER_HOSTMEM(buf) do { } while (0) +#define PSM3_IS_GPU_MEM(ptr) (0) +#define PSM3_IS_BUFFER_GPU_MEM(buf, len) (0) +// maybe some of these should be psmi_assert instead since should not +// be called if didn't have a GPU +#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) do { } while (0) +#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) do { } while (0) +#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) do { } while (0) +#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) do { } while (0) +#define PSM3_GPU_MEMCPY_HTOD_START(proto, ghb, len) do { } while (0) +#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) do { } while (0) +#define PSM3_GPU_MEMCPY_DONE(ghb) (1) +#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) do { } while (0) +#define PSM3_GPU_HOSTBUF_RESET(ghb) do { } while (0) +#define PSM3_GPU_HOSTBUF_DESTROY(ghb) do { } while (0) +#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) do { } while (0) +#define PSM3_GPU_SYNCHRONIZE_MEMCPY() do { } while (0) +#define PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf) do { } while (0) +#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) do { *(ret_ptr) = NULL; } while (0) +#define PSM3_GPU_HOST_FREE(ptr) do { } while (0) +#define PSM3_GPU_ADDR_SEND_MR(mqreq) (0) +#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, gpu_hostbuf_used) (0) +// functions for PSM3_DEVICES "shm" RTS/CTS processing to enable +// use of GPU specific scale-up transfers within the given server +#define PSM3_GPU_SHM_INIT(ptl, stats) (PSM2_OK) +#define PSM3_GPU_SHM_FINALIZE(ptl) (PSM2_OK) +#define PSM3_GPU_SHM_EPADDR_ADD(ptl, amadddr) (PSM2_OK) +#define PSM3_GPU_SHM_EPADDR_FREE(amadddr) do { } while (0) +#define PSM3_GPU_SHM_DEV_FDS_NEEDED() (0) +#define PSM3_GPU_SHM_DEV_FDS_SEND(ptl, am_epaddr) do { } while (0) +#define PSM3_GPU_SHM_DEV_FDS_CONNREQ_POLL(ptl, req) (PSM2_OK) +#define PSM3_GPU_SHM_DEV_FDS_CHECK_EXCHANGED(ptl, req, index) (PSM2_OK) +#define PSM3_GPU_SHM_DEV_FDS_POLL(ptl, res) (res) +#define PSM3_GPU_SHM_BUILD_RTS(ptl, req, narg_p, args, payload_p, payload_size_p, info_p) \ + (PSM2_OK) +#define PSM3_GPU_SHM_PROCESS_RTS(req, buf, len, narg, args) \ + do { } while(0) +#define PSM3_GPU_SHM_RTSMATCH(ptl, req) (0) +#define PSM3_GPU_SHM_PROCESS_CTS(sreq) (0) +#define PSM3_GPU_GET_CUDA_PERMITTED(ep, enable) ({ *(enable) = true; PSM2_OK; }) +#define PSM3_GPU_SET_CUDA_PERMITTED(ep, enable) (PSM2_OK) +#define PSM3_GPU_IS_MEMCPY_PERMITTED(ep) (false) + +#endif /* PSM_HAVE_GPU */ + +#endif /* _PSMI_GPU_HAL_H */ diff --git a/prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c b/prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c new file mode 100644 index 00000000000..98e20d86cd8 --- /dev/null +++ b/prov/psm3/psm3/gpu/psm_gpu_oneapi_ze.c @@ -0,0 +1,3548 @@ +/* + + This file is provided under a dual BSD/GPLv2 license. When using or + redistributing this file, you may do so under either license. + + GPL LICENSE SUMMARY + + Copyright(c) 2024 Intel Corporation. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + Contact Information: + Intel Corporation, www.intel.com + + BSD LICENSE + + Copyright(c) 2024 Intel Corporation. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Intel Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +/* Copyright (c) 2003-2024 Intel Corporation. All rights reserved. 
*/ + +#include +#include +#include /* cpu_set */ +#include /* isalpha */ +#include + +#include "psm_user.h" + +#ifdef PSM_ONEAPI +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "psm2_hal.h" +#include "psm_mq_internal.h" +#include "ptl_am/psm_am_internal.h" +#include "ptl_ips/ips_proto.h" +#include "ptl_ips/ips_expected_proto.h" +#include "psmi_wrappers.h" +#include +#ifdef HAVE_DRM +#include +#include +#endif +#ifdef HAVE_LIBDRM +#include +#include +#endif +#ifdef PSM_HAVE_PIDFD +#include +#endif + +// if defined, use malloc for pipeline copy bounce buffers +// otherwise, use zeMemAllocHost +//#define PSM3_USE_ONEAPI_MALLOC + +// if defined, do not use zexDriverImportExternalPointer for malloced pipeline +// copy bounce buffers +// otherwise, use zexDriverImportExternalPointer when malloc buffer +//#define PSM3_NO_ONEAPI_IMPORT + +// default value for PSM3_GPU_THRESH_RNDV +#define PSM3_ONEAPI_ZE_GPU_THRESH_RNDV 8000 +// default value for PSM3_GPU_RNDV_NIC_WINDOW when using OneApi Level Zero GPU +#define PSM3_ONEAPI_ZE_RNDV_NIC_WINDOW_DEFAULT "131072:524287,262144:1048575,524288" +// default value for PSM3_GPUDIRECT_RDMA_SEND_LIMIT +#define PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT UINT_MAX +// default value for PSM3_GPUDIRECT_RDMA_RECV_LIMIT +#define PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT 1 +// default value for PSM3_MQ_RNDV_SHM_GPU_THRESH +// Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem +#define PSM3_ONEAPI_ZE_MQ_RNDV_SHM_GPU_THRESH 127 + +struct psm3_oneapi_ze_dev_ctxt { + ze_device_handle_t dev; + int dev_index; /* Index in psm3_oneapi_ze_devices[] */ + uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ + uint32_t index; /* Cmdqueue index within the CmdQGrp */ + uint32_t num_queues; /* Number of queues in the CmdQGrp */ + // for most sync copies + ze_command_queue_handle_t cq; // NULL if psm3_oneapi_ze_immed_sync_copy + ze_command_list_handle_t cl; + // fields below are only used for large DTOD sync copy so can do 2 + // parallel async copies then wait for both + ze_event_handle_t copy_status0; + ze_event_handle_t copy_status1; + ze_command_list_handle_t async_cl0; + ze_command_list_handle_t async_cl1; + ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_ze_immed_sync_copy + ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_ze_immed_sync_copy + ze_event_pool_handle_t event_pool; +}; + +static ze_driver_handle_t psm3_oneapi_ze_driver = NULL; +static struct psm3_oneapi_ze_dev_ctxt psm3_oneapi_ze_devices[MAX_ZE_DEVICES]; +static int psm3_num_oneapi_ze_devices = 0; +static struct psm3_oneapi_ze_dev_ctxt *psm3_oneapi_ze_cur_dev = NULL; + +/* ZE Loader(zel) And Runtime(ze) Library */ +static void *psm3_oneapi_ze_lib; +static ze_api_version_t psm3_oneapi_ze_api_version = 0; +static zel_version_t psm3_oneapi_ze_lib_version = { }; + +/* This is a global oneapi_ze context + */ +static ze_context_handle_t psm3_oneapi_ze_context = NULL; + +#ifndef PSM_HAVE_PIDFD +static int psm3_ze_dev_fds[MAX_ZE_DEVICES]; +static int psm3_num_ze_dev_fds; +#endif +static int psm3_oneapi_ze_immed_sync_copy; +static int psm3_oneapi_ze_immed_async_copy; +static unsigned psm3_oneapi_parallel_dtod_copy_thresh; + +#ifdef PSM_HAVE_RNDV_MOD +// PSM3_RV_GPU_IGNORE_ALLOC_ID allows internal testing of GPU caching in RV +// =0 -> default, alloc_id used to identify new buffers which have same +// virt addr as an existing cache entry. 
In which case a cache miss +// and invalidation of the old cache entry occurs. +// =1 -> an alloc_id of 0 is always used. This has been demonstrated to +// cause false cache hits which can lead to landing data in safe but +// incorrect pages. Useful only for development experiments and tests. +// =2 -> for cache miss performance testing. This will use a different alloc_id +// per IO which will force cache invalidation on every IO. So no +// MR/mmap cache hits will occur, but all the normal MR handling will +// occur just as if there was a miss when running in normal mode +static int psm3_oneapi_ze_ignore_alloc_id; // PSM3_RV_GPU_IGNORE_ALLOC_ID +static uint64_t psm3_oneapi_ze_fake_alloc_id; // for when PSM3_RV_GPU_IGNORE_ALLOC_ID==2 +#endif + +/* function pointers from dlopen access to oneapi_Ze shared library */ +#define PSM3_ZE_SYM_FP(name) PSM3_CONCAT(psm3_oneapi_ze_, name) +static ze_result_t (*PSM3_ZE_SYM_FP(zeInit))(ze_init_flags_t flags); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDriverGet))(uint32_t *pCount, ze_driver_handle_t *phDrivers); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDeviceGet))(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDevicePciGetPropertiesExt))(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); +#ifndef PSM3_NO_ONEAPI_IMPORT +static ze_result_t (*PSM3_ZE_SYM_FP(zeDriverGetExtensionFunctionAddress))(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); +static ze_result_t (*PSM3_ZE_SYM_FP(zexDriverImportExternalPointer))(ze_driver_handle_t hDriver, void *ptr, size_t size); +static ze_result_t (*PSM3_ZE_SYM_FP(zexDriverReleaseImportedPointer))(ze_driver_handle_t hDriver, void *ptr); +#endif +static ze_result_t (*PSM3_ZE_SYM_FP(zeContextCreate))(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); +static ze_result_t (*PSM3_ZE_SYM_FP(zeContextDestroy))(ze_context_handle_t hContext); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueCreate))(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueDestroy))(ze_command_queue_handle_t hCommandQueue); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueExecuteCommandLists))(ze_command_queue_handle_t hCommandQueue, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandQueueSynchronize))(ze_command_queue_handle_t hCommandQueue, uint64_t timeout); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListCreate))(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_list_desc_t *desc, ze_command_list_handle_t *phCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListDestroy))(ze_command_list_handle_t hCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListClose))(ze_command_list_handle_t hCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListReset))(ze_command_list_handle_t hCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListCreateImmediate))(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_queue_desc_t *desc, ze_command_list_handle_t *phCommandList); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListAppendMemoryCopy))(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t 
*phWaitEvents); +static ze_result_t (*PSM3_ZE_SYM_FP(zeCommandListAppendSignalEvent))(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDeviceCanAccessPeer))(ze_device_handle_t hDevice, ze_device_handle_t hPeerDevice, ze_bool_t *value); +static ze_result_t (*PSM3_ZE_SYM_FP(zeDeviceGetCommandQueueGroupProperties))(ze_device_handle_t hDevice, uint32_t *pCount, ze_command_queue_group_properties_t *pCommandQueueGroupProperties); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemAllocHost))(ze_context_handle_t hContext, const ze_host_mem_alloc_desc_t *host_desc, size_t size, size_t alignment, void **pptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemAllocDevice))(ze_context_handle_t hContext, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t hDevice, void **pptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemFree))(ze_context_handle_t hContext, void *ptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemGetIpcHandle))(ze_context_handle_t hContext, const void *ptr, ze_ipc_mem_handle_t *pIpcHandle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemPutIpcHandle))(ze_context_handle_t hContext, ze_ipc_mem_handle_t handle); +#endif +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemOpenIpcHandle))(ze_context_handle_t hContext,ze_device_handle_t hDevice, ze_ipc_mem_handle_t handle, ze_ipc_memory_flags_t flags, void **pptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemCloseIpcHandle))(ze_context_handle_t hContext, const void *ptr); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemGetAddressRange))(ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); +static ze_result_t (*PSM3_ZE_SYM_FP(zeMemGetAllocProperties))(ze_context_handle_t hContext, const void *ptr, ze_memory_allocation_properties_t *pMemAllocProperties, ze_device_handle_t *phDevice); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventPoolCreate))(ze_context_handle_t hContext, const ze_event_pool_desc_t *desc, uint32_t numDevices, ze_device_handle_t *phDevices, ze_event_pool_handle_t *phEventPool); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventPoolDestroy))(ze_event_pool_handle_t hEventPool); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventCreate))(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventDestroy))(ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventQueryStatus))(ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventHostSynchronize))(ze_event_handle_t hEvent, uint64_t timeout); +static ze_result_t (*PSM3_ZE_SYM_FP(zeEventHostReset))(ze_event_handle_t hEvent); +static ze_result_t (*PSM3_ZE_SYM_FP(zelLoaderGetVersions))(size_t *num_elems, zel_component_version_t *versions); + +/* statistics counting each oneapi_ze call PSM3 makes */ +#define PSM3_ZE_SYM_COUNT(name) PSM3_CONCAT(psm3_oneapi_ze_count_, name) +static uint64_t PSM3_ZE_SYM_COUNT(zeInit); +static uint64_t PSM3_ZE_SYM_COUNT(zeDriverGet); +static uint64_t PSM3_ZE_SYM_COUNT(zeDeviceGet); +static uint64_t PSM3_ZE_SYM_COUNT(zeDevicePciGetPropertiesExt); +#ifndef PSM3_NO_ONEAPI_IMPORT +static uint64_t PSM3_ZE_SYM_COUNT(zeDriverGetExtensionFunctionAddress); +static uint64_t PSM3_ZE_SYM_COUNT(zexDriverImportExternalPointer); +static uint64_t PSM3_ZE_SYM_COUNT(zexDriverReleaseImportedPointer); +#endif +static uint64_t PSM3_ZE_SYM_COUNT(zeContextCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeContextDestroy); +static uint64_t 
PSM3_ZE_SYM_COUNT(zeCommandQueueCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandQueueDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandQueueExecuteCommandLists); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandQueueSynchronize); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListClose); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListReset); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListCreateImmediate); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListAppendMemoryCopy); +static uint64_t PSM3_ZE_SYM_COUNT(zeCommandListAppendSignalEvent); +static uint64_t PSM3_ZE_SYM_COUNT(zeDeviceCanAccessPeer); +static uint64_t PSM3_ZE_SYM_COUNT(zeDeviceGetCommandQueueGroupProperties); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemAllocHost); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemAllocDevice); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemFree); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemGetIpcHandle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static uint64_t PSM3_ZE_SYM_COUNT(zeMemPutIpcHandle); +#endif +static uint64_t PSM3_ZE_SYM_COUNT(zeMemOpenIpcHandle); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemCloseIpcHandle); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemGetAddressRange); +static uint64_t PSM3_ZE_SYM_COUNT(zeMemGetAllocProperties); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventPoolCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventPoolDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventCreate); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventDestroy); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventQueryStatus); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventHostSynchronize); +static uint64_t PSM3_ZE_SYM_COUNT(zeEventHostReset); +static uint64_t PSM3_ZE_SYM_COUNT(zelLoaderGetVersions); + +static const char* psm3_oneapi_ze_result_to_string(const ze_result_t result) { +#define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES) + + switch (result) { + ZE_RESULT_CASE(SUCCESS); + ZE_RESULT_CASE(NOT_READY); + ZE_RESULT_CASE(ERROR_UNINITIALIZED); + ZE_RESULT_CASE(ERROR_DEVICE_LOST); + ZE_RESULT_CASE(ERROR_INVALID_ARGUMENT); + ZE_RESULT_CASE(ERROR_OUT_OF_HOST_MEMORY); + ZE_RESULT_CASE(ERROR_OUT_OF_DEVICE_MEMORY); + ZE_RESULT_CASE(ERROR_MODULE_BUILD_FAILURE); + ZE_RESULT_CASE(ERROR_INSUFFICIENT_PERMISSIONS); + ZE_RESULT_CASE(ERROR_NOT_AVAILABLE); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_VERSION); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_FEATURE); + ZE_RESULT_CASE(ERROR_INVALID_NULL_HANDLE); + ZE_RESULT_CASE(ERROR_HANDLE_OBJECT_IN_USE); + ZE_RESULT_CASE(ERROR_INVALID_NULL_POINTER); + ZE_RESULT_CASE(ERROR_INVALID_SIZE); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_SIZE); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_ALIGNMENT); + ZE_RESULT_CASE(ERROR_INVALID_SYNCHRONIZATION_OBJECT); + ZE_RESULT_CASE(ERROR_INVALID_ENUMERATION); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_ENUMERATION); + ZE_RESULT_CASE(ERROR_UNSUPPORTED_IMAGE_FORMAT); + ZE_RESULT_CASE(ERROR_INVALID_NATIVE_BINARY); + ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_NAME); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_NAME); + ZE_RESULT_CASE(ERROR_INVALID_FUNCTION_NAME); + ZE_RESULT_CASE(ERROR_INVALID_GROUP_SIZE_DIMENSION); + ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_INDEX); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_SIZE); + ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); + ZE_RESULT_CASE(ERROR_INVALID_COMMAND_LIST_TYPE); + ZE_RESULT_CASE(ERROR_OVERLAPPING_REGIONS); + ZE_RESULT_CASE(ERROR_UNKNOWN); + default: + return "Unknown error"; + } + +#undef 
ZE_RESULT_CASE +} + +#define PSM3_ONEAPI_ZE_CALL(func, args...) do { \ + ze_result_t result; \ + PSM3_CONCAT(psm3_oneapi_ze_count_, func)++; \ + result = PSM3_CONCAT(psm3_oneapi_ze_, func)(args); \ + if(result != ZE_RESULT_SUCCESS) { \ + _HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \ + " returned 0x%x: %s\n", \ + #func, __FILE__, __LINE__, result, \ + psm3_oneapi_ze_result_to_string(result)); \ + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ + "Error returned from OneAPI Level Zero function %s.\n", #func); \ + } \ +} while (0) + +// resolve a OneAPI Level Zero shared library symbol +#define PSM3_ONEAPI_ZE_DLSYM(lib_ptr, func) do { \ + PSM3_CONCAT(psm3_oneapi_ze_, func) = dlsym(lib_ptr, STRINGIFY(func)); \ + if (!PSM3_CONCAT(psm3_oneapi_ze_, func)) { \ + psm3_handle_error(PSMI_EP_NORETURN, \ + PSM2_INTERNAL_ERR, \ + "Unable to resolve %s symbol " \ + "in OneAPI Level Zero library.\n", STRINGIFY(func)); \ + } \ +} while (0) + +static int psm3_oneapi_ze_lib_load() +{ + psm2_error_t err = PSM2_OK; + char *dlerr; + + PSM2_LOG_MSG("entering"); + _HFI_VDBG("Loading OneAPI Level Zero library.\n"); + + psm3_oneapi_ze_lib = dlopen("libze_loader.so.1", RTLD_LAZY); + if (!psm3_oneapi_ze_lib) { + dlerr = dlerror(); + _HFI_ERROR( + "Unable to open libze_loader.so.1. Error %s\n", + dlerr ? dlerr : "no dlerror()"); + goto fail; + } + + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeInit); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDriverGet); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDeviceGet); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDevicePciGetPropertiesExt); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); +#endif + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeContextCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeContextDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueExecuteCommandLists); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandQueueSynchronize); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListClose); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListReset); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListCreateImmediate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListAppendMemoryCopy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeCommandListAppendSignalEvent); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDeviceCanAccessPeer); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeDeviceGetCommandQueueGroupProperties); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemAllocHost); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemAllocDevice); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemFree); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemGetIpcHandle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemPutIpcHandle); +#endif + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemOpenIpcHandle); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemCloseIpcHandle); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemGetAddressRange); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeMemGetAllocProperties); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventPoolCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventPoolDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, 
zeEventCreate); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventDestroy); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventQueryStatus); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventHostSynchronize); + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zeEventHostReset); + + /* ze loader API */ + PSM3_ONEAPI_ZE_DLSYM(psm3_oneapi_ze_lib, zelLoaderGetVersions); + + PSM2_LOG_MSG("leaving"); + return err; +fail: + if (psm3_oneapi_ze_lib) + dlclose(psm3_oneapi_ze_lib); + err = psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to load OneAPI Level Zero library.\n"); + return err; +} + +static void psm3_oneapi_ze_stats_register() +{ +#define PSM3_ONEAPI_ZE_COUNT_DECLU64(func) \ + PSMI_STATS_DECLU64(#func, NULL, &PSM3_CONCAT(psm3_oneapi_ze_count_, func)) + + struct psmi_stats_entry ze_entries[] = { + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeInit), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zexDriverReleaseImportedPointer), +#endif + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeContextCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeContextDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueExecuteCommandLists), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueSynchronize), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListClose), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListReset), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreateImmediate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendMemoryCopy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendSignalEvent), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDeviceCanAccessPeer), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGetCommandQueueGroupProperties), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocHost), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocDevice), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemFree), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemGetIpcHandle), +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemPutIpcHandle), +#endif + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemOpenIpcHandle), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemCloseIpcHandle), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAddressRange), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAllocProperties), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventCreate), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventDestroy), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventQueryStatus), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventHostSynchronize), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zeEventHostReset), + PSM3_ONEAPI_ZE_COUNT_DECLU64(zelLoaderGetVersions) + }; +#undef PSM3_ONEAPI_ZE_COUNT_DECLU64 + + psm3_stats_register_type("PSM_OneAPI_ZE_call_statistics", + "Count of OneAPI Level Zero calls per API entry point for the whole process.\n" + "When using an Intel(r) GPU, PSM3 may call Level Zero " + "APIs to access or transfer application buffers in GPU memory.", + PSMI_STATSTYPE_GPU, + ze_entries, PSMI_HOWMANY(ze_entries), NULL, + &psm3_oneapi_ze_count_zeInit, NULL); /* context must != NULL */ +} + +static void psm3_oneapi_ze_find_copy_only_engine(ze_device_handle_t dev, + 
struct psm3_oneapi_ze_dev_ctxt *ctxt) +{ + uint32_t count = 0; + ze_command_queue_group_properties_t *props = NULL; + int i; + int done = 0; + + /* Set the default */ + ctxt->ordinal = 0; + ctxt->index = 0; + ctxt->num_queues = 1; + PSM3_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, + &count, NULL); + props = psmi_calloc(PSMI_EP_NONE, UNDEFINED, count, sizeof(*props)); + if (!props) { + _HFI_ERROR("Failed to allocate mem for CmdQ Grp\n"); + return; + } + PSM3_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, + &count, props); + + // pick the last command queue group which supports copy but not compute. + // For PVC this will be the xeLink copy engine which will also + // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). + // This ordinal is then supplied to create Command Queues and Command Lists. + for (i = count - 1; i >= 0; i--) { + _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, + (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, + (int)props[i].numQueues); + if (! done && (props[i].flags & + ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && + !(props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { + ctxt->ordinal = i; + ctxt->num_queues = props[i].numQueues; + done = 1; + if (_HFI_DBG_ON) { + _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); + } else { + break; + } + } + } + psmi_free(props); +} + +// for pipelined async GPU memcpy +// *p_cq is left as NULL when psm3_oneapi_ze_immed_async_copy enabled +static void psm3_oneapi_ze_async_cmd_create(struct psm3_oneapi_ze_dev_ctxt *ctxt, + ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl) +{ + psmi_assert(! *p_cl); + if (psm3_oneapi_ze_immed_async_copy) { + ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + cq_desc.ordinal = ctxt->ordinal; + cq_desc.index = ctxt->index++; + ctxt->index %= ctxt->num_queues; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, + psm3_oneapi_ze_context, ctxt->dev, &cq_desc, p_cl); + } else { + if (! 
*p_cq) { + ze_command_queue_desc_t cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL + }; + cq_desc.ordinal = ctxt->ordinal; + cq_desc.index = ctxt->index++; + ctxt->index %= ctxt->num_queues; + PSM3_ONEAPI_ZE_CALL(zeCommandQueueCreate, + psm3_oneapi_ze_context, ctxt->dev, &cq_desc, p_cq); + } + ze_command_list_desc_t cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .flags = 0 + }; + cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreate, + psm3_oneapi_ze_context, ctxt->dev, &cl_desc, p_cl); + } +} + +// create command queue for use in psm3_oneapi_ze_memcpy for sync memcpy +static void psm3_oneapi_ze_cmd_create(ze_device_handle_t dev, struct psm3_oneapi_ze_dev_ctxt *ctxt) +{ + ze_command_queue_desc_t ze_cq_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .flags = 0, + //.mode set below + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + }; + + psm3_oneapi_ze_find_copy_only_engine(dev, ctxt); + ze_cq_desc.ordinal = ctxt->ordinal; + ze_cq_desc.index = ctxt->index; + + if (psm3_oneapi_ze_immed_sync_copy) { + ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, psm3_oneapi_ze_context, + dev, &ze_cq_desc, &ctxt->cl); + } else { + ze_command_list_desc_t ze_cl_desc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, + .flags = 0 + }; + ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT; + + PSM3_ONEAPI_ZE_CALL(zeCommandQueueCreate, psm3_oneapi_ze_context, + dev, &ze_cq_desc, &ctxt->cq); + + ze_cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; + PSM3_ONEAPI_ZE_CALL(zeCommandListCreate, psm3_oneapi_ze_context, + dev, &ze_cl_desc, &ctxt->cl); + } + ctxt->dev = dev; + + if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { + // create resources for dual copy mechanism + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 2 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + }; + PSM3_ONEAPI_ZE_CALL(zeEventPoolCreate, + psm3_oneapi_ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); + + event_desc.index = 0; + PSM3_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status0); + + event_desc.index = 1; + PSM3_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, + &ctxt->copy_status1); + + psm3_oneapi_ze_async_cmd_create(ctxt, &ctxt->async_cq0, + &ctxt->async_cl0); + psm3_oneapi_ze_async_cmd_create(ctxt, &ctxt->async_cq1, + &ctxt->async_cl1); + } +} + +static void psm3_oneapi_ze_cmd_create_all(void) +{ + int i; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; + + if (!psm3_oneapi_ze_context) + PSM3_ONEAPI_ZE_CALL(zeContextCreate, psm3_oneapi_ze_driver, &ctxtDesc, + &psm3_oneapi_ze_context); + + for (i = 0; i < psm3_num_oneapi_ze_devices; i++) { + ctxt = &psm3_oneapi_ze_devices[i]; + + if (!ctxt->cl) { + psm3_oneapi_ze_cmd_create(ctxt->dev, ctxt); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, ctxt->dev); + } + } + if (psm3_num_oneapi_ze_devices > 0) + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[0]; +} + +static void psm3_oneapi_ze_cmd_destroy_all(void) +{ + int i; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + + for (i = 0; i < psm3_num_oneapi_ze_devices; i++) { + ctxt = 
&psm3_oneapi_ze_devices[i]; + + if (ctxt->async_cl1 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); + ctxt->async_cl1 = NULL; + } + if (ctxt->async_cq1 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); + ctxt->async_cq1 = NULL; + } + if (ctxt->async_cl0 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); + ctxt->async_cl0 = NULL; + } + if (ctxt->async_cq0 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); + ctxt->async_cq0 = NULL; + } + if (ctxt->copy_status1 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); + ctxt->copy_status1 = NULL; + } + if (ctxt->copy_status0 != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); + ctxt->copy_status0 = NULL; + } + if (ctxt->event_pool != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); + ctxt->event_pool = NULL; + } + if (ctxt->cl) { + PSM3_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); + ctxt->cl = NULL; + } + if (ctxt->cq) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->cq); + ctxt->cq = NULL; + } + } + psm3_oneapi_ze_cur_dev = NULL; + + /* Also destroy psm3_oneapi_ze_context */ + if (psm3_oneapi_ze_context) { + PSM3_ONEAPI_ZE_CALL(zeContextDestroy, psm3_oneapi_ze_context); + psm3_oneapi_ze_context = NULL; + } +} + +/* + * get OneAPI alloc_id for a GPU address + * + * The address should be part of a buffer allocated from an OneAPI + * library call (zeMemAllocDevice() or zeMemAllocHost()). + * The alloc_id changes on each OneAPI allocation call. PSM3/rv uses the + * alloc_id to determine if a cache hit is a potentially stale entry which + * should be invalidated. + */ +static uint64_t psm3_oneapi_ze_get_alloc_id(void *addr, uint8_t *type) +{ + ze_memory_allocation_properties_t mem_props = { + .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES + }; + ze_device_handle_t device; + + PSM3_ONEAPI_ZE_CALL(zeMemGetAllocProperties, psm3_oneapi_ze_context, + addr, &mem_props, &device); + if (type) + *type = (uint8_t)mem_props.type; + /* + * id is unique across all allocates on all devices within a given + * process + */ + return mem_props.id; +} + +//*************************************************************************** +//OneAPI Level Zero support for IPC handles +//IPC Handles are used both for PSM3 shm intranode copies via xeLink +//as well as for dma_buf use during MR creation for GPU Direct DMA and RDMA +#ifndef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static int psm3_oneapi_ze_ipc_handle_cached(const void *buf, + ze_ipc_mem_handle_t ipc_handle) +{ + static int first = 1; + static int cached = 0; + ze_ipc_mem_handle_t tmp_ipc_handle; + int tmp_fd; + + /* Only detect the first time */ + if (!first) + return cached; + + PSM3_ONEAPI_ZE_CALL(zeMemGetIpcHandle, psm3_oneapi_ze_context, + buf, &tmp_ipc_handle); + tmp_fd = *(uint32_t *)tmp_ipc_handle.data; + if (tmp_fd == *(uint32_t *)ipc_handle.data) + cached = 1; + else + close(tmp_fd); + + first = 0; + _HFI_VDBG("fd %u tmp_fd %d cached %d\n", *(uint32_t *)ipc_handle.data, + tmp_fd, cached); + + return cached; +} +#endif + +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +#define ONEAPI_PUTQUEUE_SIZE -1 + +// queue for delayed Put to get better GetIpcHandle performance +// while having an upper bound on number of active Ipc Handles +// sized based on PSM3_ONEAPI_PUTQUEUE_SIZE +struct { + psmi_lock_t lock; + struct oneapi_handle_array { + uint8_t valid; + ze_ipc_mem_handle_t ze_ipc_handle; + } *array; + unsigned index; // where to add next entry and remove oldest 
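+	// Illustrative example (assumption: size == 3): handles are stored at
+	// index 0,1,2,0,... When the slot at 'index' is already valid, the
+	// oldest handle is returned to the driver via zeMemPutIpcHandle()
+	// before the new one takes its place, so at most 'size' IPC handles
+	// are held open at any time.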
+ int size; // number of slots in queue, -1 disables put +} psm3_oneapi_ze_putqueue; +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ + +static psm2_error_t psm3_oneapi_ze_putqueue_alloc(void) +{ +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + union psmi_envvar_val env; + psm3_getenv("PSM3_ONEAPI_PUTQUEUE_SIZE", + "How many Ipc Handle Puts to queue for shm send and nic Direct GPU Access [-1 disables Put, 0 disables queue]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)ONEAPI_PUTQUEUE_SIZE, &env); + _HFI_DBG("OneApi PutQueue Size=%d\n", env.e_int); + psm3_oneapi_ze_putqueue.size = env.e_int; + if (env.e_int > 0) { + psm3_oneapi_ze_putqueue.array = (struct oneapi_handle_array *)psmi_calloc( + PSMI_EP_NONE, UNDEFINED, env.e_int, + sizeof(*psm3_oneapi_ze_putqueue.array)); + if (! psm3_oneapi_ze_putqueue.array) + return PSM2_NO_MEMORY; + psm3_oneapi_ze_putqueue.index = 0; + psmi_init_lock(&psm3_oneapi_ze_putqueue.lock); + } +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ + return PSM2_OK; +} + +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +static void psm3_oneapi_ze_get_dmabuf_fd(const void *buf, uint64_t *handle_fd) +{ + ze_memory_allocation_properties_t mem_props = {}; + ze_device_handle_t device_ptr; + ze_external_memory_export_fd_t export_fd = {}; + + export_fd.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_EXPORT_FD; + export_fd.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + + mem_props.stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES; + mem_props.pNext = &export_fd; + + PSM3_ONEAPI_ZE_CALL(zeMemGetAllocProperties, psm3_oneapi_ze_context, + buf, &mem_props, &device_ptr); + *handle_fd = export_fd.fd; +} +#endif + +#ifdef PSM_HAVE_RNDV_MOD +static void psm3_oneapi_ze_get_ipc_handle(const void *buf, ze_ipc_mem_handle_t *ipc_handle, uint64_t *handle_fd) +{ + PSM3_ONEAPI_ZE_CALL(zeMemGetIpcHandle, psm3_oneapi_ze_context, + (const void *)buf, ipc_handle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + psm3_oneapi_ze_get_dmabuf_fd(buf, handle_fd); +#else + *handle_fd = *(uint32_t *)ipc_handle->data; +#endif +} +#endif /* PSM_HAVE_RNDV_MOD */ + +static void psm3_oneapi_ze_put_ipc_handle(const void *buf, ze_ipc_mem_handle_t ipc_handle) +{ +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + if (! 
psm3_oneapi_ze_putqueue.array) { // queue disabled + if (psm3_oneapi_ze_putqueue.size >= 0) // negative size disables Put + PSM3_ONEAPI_ZE_CALL(zeMemPutIpcHandle, psm3_oneapi_ze_context, ipc_handle); + return; + } + PSMI_LOCK(psm3_oneapi_ze_putqueue.lock); + if (psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index].valid) { + // Put the oldest one to make room for new entry + ze_ipc_mem_handle_t tmp_ipc_handle = + psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index].ze_ipc_handle; + PSM3_ONEAPI_ZE_CALL(zeMemPutIpcHandle, psm3_oneapi_ze_context, tmp_ipc_handle); + } + // queue the new one + psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index].valid = 1; + psm3_oneapi_ze_putqueue.array[psm3_oneapi_ze_putqueue.index++].ze_ipc_handle = ipc_handle; + psm3_oneapi_ze_putqueue.index %= psm3_oneapi_ze_putqueue.size; + PSMI_UNLOCK(psm3_oneapi_ze_putqueue.lock); +#else /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ + // for older Agama with handle "cache" but no reference counting + // no way to put handle without affecting all IOs using that buffer + // on ATS w/o Agama handle cache, no benefit to holding onto fd so close + if (!psm3_oneapi_ze_ipc_handle_cached(buf, ipc_handle)) + close(*(uint32_t *)ipc_handle.data); +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ +} + +static void psm3_oneapi_ze_putqueue_free(void) +{ +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE +#if 0 // we are shutting down, so don't worry about Putting the queued handles + int i; + + // no need for lock, destroying object, no more callers + for (i=0; i < psm3_oneapi_ze_putqueue.size; i++) { + if (psm3_oneapi_ze_putqueue.array[i].valid) { + ze_ipc_mem_handle_t ipc_handle = psm3_oneapi_ze_putqueue.array[i].ze_ipc_handle; + PSM3_ONEAPI_ZE_CALL(zeMemPutIpcHandle, psm3_oneapi_ze_context, ipc_handle); + } + } +#endif /* 0 */ + if (psm3_oneapi_ze_putqueue.array) { + psmi_free(psm3_oneapi_ze_putqueue.array); + psm3_oneapi_ze_putqueue.array = NULL; + psmi_destroy_lock(&psm3_oneapi_ze_putqueue.lock); + } +#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ +} +//*************************************************************************** + +static psm2_error_t psm3_oneapi_ze_initialize(void) +{ + psm2_error_t err = PSM2_OK; + uint32_t ze_driver_count = 1; + uint32_t ze_device_count = 0; + ze_device_handle_t devices[MAX_ZE_DEVICES]; + zel_component_version_t *zel_comps = NULL; + size_t num_zel_comps; + int i; + union psmi_envvar_val env; + + PSM2_LOG_MSG("entering"); + _HFI_DBG("Init Level Zero library.\n"); + + psm3_oneapi_ze_stats_register(); + err = psm3_oneapi_ze_lib_load(); + if (err != PSM2_OK) + goto fail; + + psm3_getenv("PSM3_ONEAPI_IMMED_SYNC_COPY", + "Use Immediate CommandList for synchronous copy to/from GPU]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_oneapi_ze_immed_sync_copy = env.e_int; + + psm3_getenv("PSM3_ONEAPI_IMMED_ASYNC_COPY", + "Use Immediate CommandList for asynchronous pipeline copy to/from GPU]", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)1, &env); + psm3_oneapi_ze_immed_async_copy = env.e_int; + + psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", + "Use parallel CommandLists for GPU to GPU copy larger than threshold", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)(256*1024-1), &env); + // no benefit below 128K-1, plus the copy is spilt at a 64K boundary + psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); + + + PSM3_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); + + /* Need to 
query count before alloc array */ + PSM3_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, NULL); + if (num_zel_comps > 0) { + zel_comps = (zel_component_version_t *)psmi_calloc( + PSMI_EP_NONE, UNDEFINED, sizeof(zel_component_version_t), + num_zel_comps); + PSM3_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, zel_comps); + + /* Loop looking for "loader" name */ + for (i = 0; i < num_zel_comps; i++) { + if (!strncmp(zel_comps[i].component_name, "loader", sizeof("loader"))){ + psm3_oneapi_ze_lib_version = zel_comps[i].component_lib_version; + psm3_oneapi_ze_api_version = zel_comps[i].spec_version; + break; + } + } + psmi_free(zel_comps); + if (i == num_zel_comps) { + _HFI_DBG("WARNING: 'loader' not found among the %zd components reported" + " by zelLoaderGetVersions, unable to report Level-Zero version", + num_zel_comps); + } + } else { + _HFI_DBG("WARNING: no components reported by zelLoaderGetVersions," + " unable to report Level-Zero version"); + } + + PSM3_ONEAPI_ZE_CALL(zeDriverGet, &ze_driver_count, &psm3_oneapi_ze_driver); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, psm3_oneapi_ze_driver, "zexDriverImportExternalPointer", (void **)&psm3_oneapi_ze_zexDriverImportExternalPointer); + PSM3_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, psm3_oneapi_ze_driver, "zexDriverReleaseImportedPointer", (void **)&psm3_oneapi_ze_zexDriverReleaseImportedPointer); +#endif + + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &ze_device_count, NULL); + if (ze_device_count > MAX_ZE_DEVICES) + ze_device_count = MAX_ZE_DEVICES; + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &ze_device_count, devices); + + ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; + PSM3_ONEAPI_ZE_CALL(zeContextCreate, psm3_oneapi_ze_driver, &ctxtDesc, &psm3_oneapi_ze_context); + _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", + psm3_oneapi_ze_driver, ze_device_count, devices[0], psm3_oneapi_ze_context); + + for (i = 0; i < ze_device_count; i++) { + psm3_oneapi_ze_devices[i].dev_index = i; + psm3_oneapi_ze_cmd_create(devices[i], &psm3_oneapi_ze_devices[i]); + _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", + i, psm3_oneapi_ze_devices[i].dev); + } + + psm3_num_oneapi_ze_devices = ze_device_count; + if (psm3_num_oneapi_ze_devices > 0) + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[0]; + + err = psm3_oneapi_ze_putqueue_alloc(); + if (err != PSM2_OK) + goto fail; + +#ifndef PSM_HAVE_PIDFD + psm3_num_ze_dev_fds = 0; +#endif + +#ifdef PSM_HAVE_RNDV_MOD + // these env only needed when rv being used, since hidden, always parse + { + union psmi_envvar_val env; + + psm3_getenv("PSM3_RV_GPU_IGNORE_ALLOC_ID", + "Disable use of alloc_id to identify GPU MRs to invalidate in RV GPU cache. 1=ignore, 2=use fake id to get 100% miss", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)0, &env); + psm3_oneapi_ze_ignore_alloc_id = env.e_int; + } +#endif /* PSM_HAVE_RNDV_MOD */ + + if (! 
psm3_gpu_thresh_rndv) // sockets HAL could set new default + psm3_gpu_thresh_rndv = PSM3_ONEAPI_ZE_GPU_THRESH_RNDV; + psm3_gpu_rndv_nic_window_default = PSM3_ONEAPI_ZE_RNDV_NIC_WINDOW_DEFAULT; + psm3_gpu_gpudirect_rdma_send_limit_default = PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_SEND_LIMIT_DEFAULT; + psm3_gpu_gpudirect_rdma_recv_limit_default = PSM3_ONEAPI_ZE_GPUDIRECT_RDMA_RECV_LIMIT_DEFAULT; + psm3_gpu_mq_rndv_shm_gpu_thresh_default = PSM3_ONEAPI_ZE_MQ_RNDV_SHM_GPU_THRESH; + + PSM2_LOG_MSG("leaving"); + return err; +fail: + err = psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to initialize PSM3 OneAPI Level Zero support.\n"); + return err; +} + +static void psm3_oneapi_ze_finalize(void) +{ + psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &psm3_oneapi_ze_count_zeInit); + /* + * Trying to destroy command list, queue, and context will result in + * segfaults here. + */ + /*psm3_oneapi_ze_putqueue_free(); + psm3_oneapi_ze_cmd_destroy(); + if (psm3_oneapi_ze_context) { + PSM3_ONEAPI_ZE_CALL(zeContextDestroy, psm3_oneapi_ze_context); + psm3_oneapi_ze_context = NULL; + } */ +} + +// called on every EP open +static void psm3_oneapi_ze_ep_open(void) +{ + /* Make sure ze_context and command queue/list are available. + * They could be destroyed when final EP is closed + * If another endpoint is opened after that, the code here can + * recreate the context, command queue and list. + */ + if (!psm3_oneapi_ze_cur_dev) + psm3_oneapi_ze_cmd_create_all(); +} + +// called on final EP close +static void psm3_oneapi_ze_ep_close(void) +{ + /* + * It would be ideal to destroy the global command list, queue, and + * context in psm3_finalize() (via psm3_oneapi_ze_finalize). + * Unfortunately, it will cause segfaults in Level-zero library. + */ + psm3_oneapi_ze_putqueue_free(); + psm3_oneapi_ze_cmd_destroy_all(); +} + +static void psm3_oneapi_ze_identify(char *accel_vers, size_t size) +{ + char ze_api_ver[64] = "unknown"; + char ze_loader_ver[64] = "unknown"; + + if (psm3_oneapi_ze_api_version) + snprintf(ze_api_ver, sizeof(ze_api_ver), "%d.%d", + ZE_MAJOR_VERSION(psm3_oneapi_ze_api_version), ZE_MINOR_VERSION(psm3_oneapi_ze_api_version)); + if (psm3_oneapi_ze_lib_version.major || psm3_oneapi_ze_lib_version.minor || psm3_oneapi_ze_lib_version.patch) + snprintf(ze_loader_ver, sizeof(ze_loader_ver), "v%d.%d.%d", + psm3_oneapi_ze_lib_version.major, psm3_oneapi_ze_lib_version.minor, psm3_oneapi_ze_lib_version.patch); + snprintf(accel_vers, size, "%s %s Level-Zero Runtime %s (%s) built against interface %d.%d\n", + psm3_get_mylabel(), psm3_ident_tag, + ze_api_ver, ze_loader_ver, + ZE_MAJOR_VERSION(ZE_API_VERSION_CURRENT), ZE_MINOR_VERSION(ZE_API_VERSION_CURRENT)); +} + +static void psm3_oneapi_ze_verify_GPU_capabilities(void) +{ + // nothing to do +} + +static int psm3_oneapi_ze_p2p_supported() +{ + static int p2p_supported = -1; // -1 indicates "unset" + uint32_t num_devices = 0; + uint32_t dev; + ze_device_handle_t devices[MAX_ZE_DEVICES]; + + if (likely(p2p_supported > -1)) return p2p_supported; + + p2p_supported = 0; + + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &num_devices, NULL); + if (num_devices > MAX_ZE_DEVICES) + num_devices = MAX_ZE_DEVICES; + PSM3_ONEAPI_ZE_CALL(zeDeviceGet, psm3_oneapi_ze_driver, &num_devices, devices); + + for (dev = 0; dev < num_devices; dev++) { + ze_device_handle_t device; + device = devices[dev]; + + if (num_devices > 1 && device != psm3_oneapi_ze_cur_dev->dev) { + ze_bool_t canAccessPeer = 0; + + PSM3_ONEAPI_ZE_CALL(zeDeviceCanAccessPeer, 
psm3_oneapi_ze_cur_dev->dev, + device, &canAccessPeer); + if (canAccessPeer != 1) + _HFI_DBG("ONEAPI device %d does not support P2P from current device (Non-fatal error)\n", dev); + else + p2p_supported |= (1 << dev); + } else { + /* Always support p2p on the same GPU */ + psm3_my_gpu_device = dev; + p2p_supported |= (1 << dev); + } + } + + return p2p_supported; +} + +static int psm3_oneapi_ze_gpudirect_supported(void) +{ + /* Is there any OneAPI Level Zero device property that can indicate this? */ + return 1; +} + + +static void psm3_oneapi_ze_get_pci_addr(uint32_t *domain_p, uint32_t *bus_p, + uint32_t *dev_p, uint32_t *func_p) +{ + ze_pci_ext_properties_t PciProperties; + + _HFI_DBG("%d Level Zero GPUs found\n", psm3_num_oneapi_ze_devices); + if (! psm3_num_oneapi_ze_devices) + return; + + // caling middleware will have limited GPUs visible to process + PSM3_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt, + psm3_oneapi_ze_devices[0].dev, &PciProperties); + *domain_p = PciProperties.address.domain; + *bus_p = PciProperties.address.bus; + *dev_p = PciProperties.address.device; + *func_p = PciProperties.address.function; +} + +#ifdef PSM_HAVE_RNDV_MOD +static uint64_t psm3_oneapi_ze_min_bar_size(void) +{ + // implement later + return 0; +} + +static psm2_error_t psm3_oneapi_ze_check_phys_addr(uint64_t phys_addr) +{ + return PSM2_OK; +} + +static void psm3_oneapi_ze_roundup_gdrcopy(unsigned long buf, size_t size, + uintptr_t *pageaddr_p, uint64_t *pagelen_p) +{ + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, psm3_oneapi_ze_context, + (const void *)buf, (void **)pageaddr_p, pagelen_p); +} + +#ifdef PSM_HAVE_REG_MR +static void psm3_oneapi_ze_roundup_rv_reg_mr(struct psm2_ep *ep, + void **addr_p, uint64_t *length_p, int access) +{ +#define MAX_USER_MR_SIZE (32 * 1024) + void *base; + size_t len; + uint64_t page_offset; + + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, psm3_oneapi_ze_context, + (const void *)*addr_p, &base, &len); + /* + * Need to register MR with base address and total length. + * However, for Mellanox cards, the max buffer size for a + * user MR registered through the rv module is 32k bytes. + * Otherwise, it will fail with IB_WC_MW_BIND_ERR. For fast + * registration MR through RV (kernel MR and GPU MR), there + * is also a upper limit (max_fast_reg_page_list_len) imposed + * by the underlying RDMA device (eg 256MB for mlx5). + */ + if (strncasecmp(ep->dev_name, "mlx5_0", 3) == 0 && + !(access & IBV_ACCESS_KERNEL)) { + if (len > MAX_USER_MR_SIZE) { + /* + * Register the first 32k if the buffer stays in the + * range. Otherwise, align the buffer to page boundary. + */ + if (((char *)*addr_p + *length_p) <= + ((char *)base + MAX_USER_MR_SIZE)) { + *addr_p = base; + *length_p = MAX_USER_MR_SIZE; + } else { + page_offset = ((uint64_t)*addr_p) & + GPU_PAGE_OFFSET_MASK; + *addr_p = (void *) + ROUNDDOWN64P2((uint64_t)*addr_p, + PSMI_GPU_PAGESIZE); + *length_p = *length_p + page_offset; + } + } else { + /* Register the entire buffer */ + *addr_p = base; + *length_p = len; + } + } else { + uint64_t start, end; + uint64_t mr_len; + uint64_t offset; + uint64_t limit = ep->verbs_ep.max_fmr_size; + + /* Buffer end + 1 */ + end = (uint64_t)base + len; + /* Offset of the requested buffer chunk */ + offset = (uint64_t)*addr_p - (uint64_t)base; + /* + * Start address of next MR. + * The idea is to avoid fragment the entire buffer as few times + * as possible to avoid overlapped MRs and increae cache hit + * rate. 
Therefore, we can't just start from page boundary of + * the requested buffer address: + * start = ROUNDDOWN64P2((uint64_t)*addr_p, PSMI_GPU_PAGESIZE); + */ + start = (uint64_t)base + (offset / limit) * limit; + mr_len = end - start; + if (mr_len > limit) + mr_len = limit; + /* + * If the chunk does not cross the (start + mr_len) boundary, + * register the max chunk size or the remainder of the entire + * buffer. Otherwise, align the buffer to page size and just + * register the requested chunk size plus the offset. + */ + if (((uint64_t)*addr_p + *length_p) <= (start + mr_len)) { + *addr_p = (void *)start; + *length_p = mr_len; + } else { + page_offset = ((uint64_t)*addr_p) & + GPU_PAGE_OFFSET_MASK; + *addr_p = (void *)ROUNDDOWN64P2((uint64_t)*addr_p, + PSMI_GPU_PAGESIZE); + *length_p = *length_p + page_offset; + } + } +} + +// add OneAPI Level Zero specific information to the mparams in prep for the +// RV_IOCTL_REG_MEM ioctl to rv +// for reg_mr the gpu_specific->ze_alloc_id is obtained in caller and +// retained in the psm2_verbs_mr_t for future cache hit checks +static int psm3_oneapi_ze_init_rv_reg_mr_params( + void *addr, uint64_t length, int access, + struct rv_mem_params *mparams, + union psm3_verbs_mr_gpu_specific *gpu_specific, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + // need to save off ipc_handle here for cleanup below + psm3_oneapi_ze_get_ipc_handle(addr, &scratchpad->ze_ipc_handle, + &scratchpad->ze_handle_fd); + mparams->in.ipc_handle = (uint32_t)scratchpad->ze_handle_fd; + if (!mparams->in.ipc_handle) { + _HFI_ERROR("zeMemGetIpcHandle for %p returned empty handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", + addr, scratchpad->ze_ipc_handle.data[0], + scratchpad->ze_ipc_handle.data[1], + scratchpad->ze_ipc_handle.data[2], + scratchpad->ze_ipc_handle.data[3], + scratchpad->ze_ipc_handle.data[4], + scratchpad->ze_ipc_handle.data[5], + scratchpad->ze_ipc_handle.data[6], + scratchpad->ze_ipc_handle.data[7]); + // tends to mean out of fd's + return ENOSPC; + } + mparams->in.alloc_id = psm3_oneapi_ze_ignore_alloc_id? + (psm3_oneapi_ze_ignore_alloc_id==1? + 0:psm3_oneapi_ze_fake_alloc_id++) + :gpu_specific->ze_alloc_id; + mparams->in.base_addr = gpu_specific->ze_base_addr; + return 0; +} +#endif /* PSM_HAVE_REG_MR */ + +// add OneAPI Level Zero specific information to the params in prep for the +// RV_IOCTL_PIN_MMAP ioctl to rv +// for pin_mmap the alloc_id is obtained here and there is no caching in the +// caller. +static int psm3_oneapi_ze_init_rv_pin_mmap_params( + void *addr, uint64_t length, int access, + struct rv_gpu_mem_params *params, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + uint64_t alloc_id; + + psm3_oneapi_ze_get_ipc_handle(addr, &scratchpad->ze_ipc_handle, &scratchpad->ze_handle_fd); + params->in.ipc_handle = (uint32_t)scratchpad->ze_handle_fd; + if (!params->in.ipc_handle) { + _HFI_ERROR("No ipc_handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", + scratchpad->ze_ipc_handle.data[0], + scratchpad->ze_ipc_handle.data[1], + scratchpad->ze_ipc_handle.data[2], + scratchpad->ze_ipc_handle.data[3], + scratchpad->ze_ipc_handle.data[4], + scratchpad->ze_ipc_handle.data[5], + scratchpad->ze_ipc_handle.data[6], + scratchpad->ze_ipc_handle.data[7]); + return EFAULT; + } + alloc_id = psm3_oneapi_ze_get_alloc_id(addr, NULL); + // id is unique across all allocs on all devices in a process + params->in.alloc_id = psm3_oneapi_ze_ignore_alloc_id? + (psm3_oneapi_ze_ignore_alloc_id==1? 
+ 0:psm3_oneapi_ze_fake_alloc_id++) + :alloc_id; + _HFI_VDBG("addr 0x%"PRIx64" length %"PRIu64" id %"PRIu64" access 0x%x\n", + (uint64_t)addr, length, alloc_id, access); + return 0; +} + +// cleanup OneAPI Level Zero specific scratchpad from +// psm3_oneapi_ze_init_rv_reg_mr_params or +// psm3_oneapi_ze_init_rv_pin_mmap_params +// called on success or error path, makes sure not to polute errno +// as it can reflect the earlier error for the error path in caller. +static void psm3_oneapi_ze_rv_reg_mmap_cleanup( + void *addr, uint64_t length, int access, + union psm3_gpu_rv_reg_mmap_mem_scratchpad *scratchpad) +{ + if (scratchpad->ze_handle_fd) { + int save_errno = errno; + psm3_oneapi_ze_put_ipc_handle((const void *)addr, scratchpad->ze_ipc_handle); + // no need to clear scratchpad + errno = save_errno; + } +} +#endif /* PSM_HAVE_RNDV_MOD */ + +#ifdef PSM_HAVE_REG_MR +// compare GPU specific fields in verbs MR cache entry +static int psm3_oneapi_ze_cmp_mr(const union psm3_verbs_mr_gpu_specific *a, + const union psm3_verbs_mr_gpu_specific *b) +{ + if (a->ze_alloc_id < b->ze_alloc_id) + return -1; + else if (a->ze_alloc_id > b->ze_alloc_id) + return 1; + else + return 0; +} + +// initialize GPU specific fields in verbs MR cache entry +static void psm3_oneapi_ze_init_mr(void *addr, uint64_t length, int access, + union psm3_verbs_mr_gpu_specific *gpu_specific) +{ + void *base = NULL; + size_t len; + + if (access & IBV_ACCESS_IS_GPU_ADDR) + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, + psm3_oneapi_ze_context, (const void *)addr, + &base, &len); + gpu_specific->ze_base_addr = (uint64_t)base; + gpu_specific->ze_alloc_id = (access & IBV_ACCESS_IS_GPU_ADDR)? + psm3_oneapi_ze_get_alloc_id(addr, NULL) : 0; +} +#endif /* PSM_HAVE_REG_MR */ + +static void psm3_oneapi_ze_fetch_ctxt(void) +{ + // nothing to do +} + +// ensure psm3_cu_ctxt reflects our most recent psm3_cu_ctxt +static void psm3_oneapi_ze_refresh_ctxt(void) +{ + // nothing to do +} + +static void psm3_oneapi_ze_register_hostmem(void *buf, uint32_t size) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, psm3_oneapi_ze_driver, + buf, size); +#endif +} + +static void psm3_oneapi_ze_unregister_hostmem(void *buf) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + ze_result_t result; + //PSM3_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, psm3_oneapi_ze_driver, + // buf); + psm3_oneapi_ze_count_zexDriverReleaseImportedPointer++; + result = psm3_oneapi_ze_zexDriverReleaseImportedPointer(psm3_oneapi_ze_driver, + buf); + if (result != ZE_RESULT_SUCCESS) { + _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psm3_oneapi_ze_result_to_string(result)); + } +#endif +} + +/* + * Two usages: + * (1) ctxt == NULL: check if the buffer is allocated from Level-zero. + * In this case, change psm3_oneapi_ze_cur_dev if device has changed. + * (2) ctxt != NULL: try to get the device context. + * In this case, don't change psm3_oneapi_ze_cur_dev. 
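+ * In both usages, returns 1 if ptr was allocated through Level Zero
+ * (host or device allocation), 0 otherwise.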
+ */ +PSMI_ALWAYS_INLINE( +int +psm3_is_oneapi_ze_mem(const void *ptr, struct psm3_oneapi_ze_dev_ctxt **ctxt)) +{ + ze_memory_allocation_properties_t mem_props = { + ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES + }; + ze_device_handle_t dev; + ze_result_t result; + int ret = 0; + + psm3_oneapi_ze_count_zeMemGetAllocProperties++; + result = psm3_oneapi_ze_zeMemGetAllocProperties(psm3_oneapi_ze_context, ptr, &mem_props, + &dev); + if (result == ZE_RESULT_SUCCESS && + (mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) { + ret = 1; + _HFI_VDBG("ptr %p type %d dev %p oneapi_ze_cur_dev %p\n", + ptr, mem_props.type, dev, psm3_oneapi_ze_cur_dev->dev); + /* + * Check if the gpu device has changed. + * If we are trying to get the device context (!ctxt), + * don't change psm3_oneapi_ze_cur_dev. + * If the buffer is allocated through zeMemAllocHost, + * there will be no device associated with it (dev == NULL). + * In this case, use the current device context. + */ + if (!dev) { + if (ctxt) + *ctxt = psm3_oneapi_ze_cur_dev; + return ret; + } + if (ctxt || (!ctxt && dev != psm3_oneapi_ze_cur_dev->dev)) { + int i; + + for (i = 0; i < psm3_num_oneapi_ze_devices; i++) { + if (psm3_oneapi_ze_devices[i].dev == dev) { + if (ctxt) + *ctxt = &psm3_oneapi_ze_devices[i]; + else + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[i]; + break; + } + } + _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, psm3_num_oneapi_ze_devices-1, dev); + } + } + + return ret; +} + +PSMI_ALWAYS_INLINE( +struct psm3_oneapi_ze_dev_ctxt * +psm3_oneapi_ze_dev_ctxt_get(const void *ptr)) +{ + struct psm3_oneapi_ze_dev_ctxt *ctxt = NULL; + + psm3_is_oneapi_ze_mem(ptr, &ctxt); + + return ctxt; +} + +static int psm3_oneapi_ze_is_gpu_mem(const void *ptr) +{ + return psm3_is_oneapi_ze_mem(ptr, NULL); +} + +static void psm3_oneapi_ze_prepare_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) + protoexp->gpu_specific.ze_cq_recvs[i] = NULL; +} + +static void psm3_oneapi_ze_prepare_DtoH_memcpys(struct ips_proto *proto) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) + proto->gpu_specific.ze_cq_sends[i] = NULL; +} + +static void psm3_oneapi_ze_shutdown_HtoD_memcpys(struct ips_protoexp *protoexp) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) { + if (protoexp->gpu_specific.ze_cq_recvs[i]) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, protoexp->gpu_specific.ze_cq_recvs[i]); + protoexp->gpu_specific.ze_cq_recvs[i] = NULL; + } + } +} + +static void psm3_oneapi_ze_shutdown_DtoH_memcpys(struct ips_proto *proto) +{ + int i; + + for (i = 0; i < MAX_ZE_DEVICES; i++) { + if (proto->gpu_specific.ze_cq_sends[i]) { + PSM3_ONEAPI_ZE_CALL(zeCommandQueueDestroy, proto->gpu_specific.ze_cq_sends[i]); + proto->gpu_specific.ze_cq_sends[i] = NULL; + } + } +} + +static void psm3_oneapi_ze_memcpy_HtoD_start(struct ips_protoexp *protoexp, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 1 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + .index = 0 + }; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + int inx; + + ctxt = psm3_oneapi_ze_dev_ctxt_get(ghb->gpu_buf); + if (!ctxt) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "%s HTOD: unknown GPU device for addr %p\n", + __FUNCTION__, ghb->gpu_buf); + return; /* NOT REACHED */ + } + if 
(ghb->gpu_specific.ze_event_pool == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolCreate, + psm3_oneapi_ze_context, &pool_desc, 0, NULL, &ghb->gpu_specific.ze_event_pool); + } + if (ghb->gpu_specific.ze_copy_status == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventCreate, + ghb->gpu_specific.ze_event_pool, &event_desc, &ghb->gpu_specific.ze_copy_status); + } + inx = ctxt->dev_index; + if (! ghb->gpu_specific.ze_command_lists[inx]) { + psm3_oneapi_ze_async_cmd_create(ctxt, + &protoexp->gpu_specific.ze_cq_recvs[inx], &ghb->gpu_specific.ze_command_lists[inx]); + } + ghb->gpu_specific.ze_cur_dev_inx = inx; + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ghb->gpu_specific.ze_command_lists[inx], + ghb->gpu_buf, ghb->host_buf, len, ghb->gpu_specific.ze_copy_status, 0, NULL); + if (! psm3_oneapi_ze_immed_async_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ghb->gpu_specific.ze_command_lists[inx]); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, + protoexp->gpu_specific.ze_cq_recvs[inx], 1, &ghb->gpu_specific.ze_command_lists[inx], NULL); + } +} + +static void psm3_oneapi_ze_memcpy_DtoH_start(struct ips_proto *proto, + struct ips_gpu_hostbuf *ghb, uint32_t len) +{ + ze_event_pool_desc_t pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 1 + }; + ze_event_desc_t event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, + .signal = ZE_EVENT_SCOPE_FLAG_HOST, + .wait = ZE_EVENT_SCOPE_FLAG_HOST, + .index = 0 + }; + struct psm3_oneapi_ze_dev_ctxt *ctxt; + int inx; + + ctxt = psm3_oneapi_ze_dev_ctxt_get(ghb->gpu_buf); + if (!ctxt) { + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "%s DTOH: unknown GPU device for addr %p\n", + __FUNCTION__, ghb->gpu_buf); + return; /* NOT REACHED */ + } + if (ghb->gpu_specific.ze_event_pool == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolCreate, + psm3_oneapi_ze_context, &pool_desc, 0, NULL, &ghb->gpu_specific.ze_event_pool); + } + if (ghb->gpu_specific.ze_copy_status == NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventCreate, ghb->gpu_specific.ze_event_pool, &event_desc, + &ghb->gpu_specific.ze_copy_status); + } + inx = ctxt->dev_index; + if (! ghb->gpu_specific.ze_command_lists[inx]) { + psm3_oneapi_ze_async_cmd_create(ctxt, &proto->gpu_specific.ze_cq_sends[inx], + &ghb->gpu_specific.ze_command_lists[inx]); + } + ghb->gpu_specific.ze_cur_dev_inx = inx; + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ghb->gpu_specific.ze_command_lists[inx], + ghb->host_buf, ghb->gpu_buf, len, ghb->gpu_specific.ze_copy_status, 0, NULL); + if (! 
psm3_oneapi_ze_immed_async_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ghb->gpu_specific.ze_command_lists[inx]); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, + proto->gpu_specific.ze_cq_sends[inx], 1, &ghb->gpu_specific.ze_command_lists[inx], NULL); + } +} + +static int psm3_oneapi_ze_memcpy_done(struct ips_gpu_hostbuf *ghb) +{ + ze_result_t result; + psm3_oneapi_ze_count_zeEventQueryStatus++; + + result = psm3_oneapi_ze_zeEventQueryStatus(ghb->gpu_specific.ze_copy_status); + if (result == ZE_RESULT_SUCCESS) { + return 1; + } else if (result == ZE_RESULT_NOT_READY) { + return 0; + } else { + _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", + "zeEventQueryStatus", __FILE__, __LINE__, result, + psm3_oneapi_ze_result_to_string(result)); + psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "Error returned from OneAPI Level Zero function %s.\n", + "zeEventQueryStatus"); + } + return 0; +} + +// when allocating bounce buffers either malloc w/Import or +// zeMemAllocHost can be used. zeMemAllocHost tends to perform +// better in the subsequent GPU copy's AppendMemoryCopy. However +// zeMemAllocHost results in a GPU-like address which requires dmabuf +// so we can't use zeMemAllocHost for DMA to/from the bounce buffer +// unless rv is available to handle GPU addresses (eg. PSM3_GPUDIRECT=1) + +static void *psm3_oneapi_ze_host_alloc_malloc(unsigned size) +{ + void *ret_ptr = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, psm3_oneapi_ze_driver, ret_ptr, size); +#endif + return ret_ptr; +} + +static void psm3_oneapi_ze_host_free_malloc(void *ptr) +{ +#ifndef PSM3_NO_ONEAPI_IMPORT + PSM3_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, psm3_oneapi_ze_driver, ptr); +#endif + psmi_free(ptr); +} + +#ifndef PSM3_USE_ONEAPI_MALLOC +static void *psm3_oneapi_ze_host_alloc_zemem(unsigned size) +{ + void *ret_ptr; + ze_host_mem_alloc_desc_t host_desc = { + .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, + .flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW + }; + PSM3_ONEAPI_ZE_CALL(zeMemAllocHost, psm3_oneapi_ze_context, + &host_desc, size, 8, &ret_ptr); + return ret_ptr; +} + +static void psm3_oneapi_ze_host_free_zemem(void *ptr) +{ + PSM3_ONEAPI_ZE_CALL(zeMemFree, psm3_oneapi_ze_context, ptr); +} + +static void *(*psm3_oneapi_ze_host_alloc_ptr)(unsigned size) = psm3_oneapi_ze_host_alloc_malloc; +static void (*psm3_oneapi_ze_host_free_ptr)(void *ptr) = psm3_oneapi_ze_host_free_malloc; +static int psm3_oneapi_ze_using_zemem_alloc = 0; +#endif /* PSM3_USE_ONEAPI_MALLOC */ + +// this is only called if GPU Direct is enabled in rv such that +// GDR Copy and/or RDMA MRs can provide GPU-like addresses to rv +static void psm3_oneapi_ze_using_rv_for_mrs(void) +{ +#ifndef PSM3_USE_ONEAPI_MALLOC + psm3_oneapi_ze_host_alloc_ptr = psm3_oneapi_ze_host_alloc_zemem; + psm3_oneapi_ze_host_free_ptr = psm3_oneapi_ze_host_free_zemem; + psm3_oneapi_ze_using_zemem_alloc = 1; +#endif +} + +static void psm3_oneapi_ze_host_alloc(void **ret_ptr, uint32_t size) +{ +#ifdef PSM3_USE_ONEAPI_MALLOC + *ret_ptr = psm3_oneapi_ze_host_alloc_malloc(size); +#else + *ret_ptr = (*psm3_oneapi_ze_host_alloc_ptr)(size); +#endif +} + +static void psm3_oneapi_ze_host_free(void *ptr) +{ +#ifdef PSM3_USE_ONEAPI_MALLOC + psm3_oneapi_ze_host_free_malloc(ptr); +#else + (*psm3_oneapi_ze_host_free_ptr)(ptr); +#endif +} + +static void psm3_oneapi_ze_hostbuf_lazy_init(struct ips_gpu_hostbuf *ghb) +{ + int i; + + 
ghb->gpu_specific.ze_event_pool = NULL; + ghb->gpu_specific.ze_copy_status = NULL; + for (i = 0; i < MAX_ZE_DEVICES; i++) + ghb->gpu_specific.ze_command_lists[i] = NULL; +} + +static void psm3_oneapi_ze_hostbuf_reset(struct ips_gpu_hostbuf *ghb) +{ + if (! psm3_oneapi_ze_immed_async_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListReset, + ghb->gpu_specific.ze_command_lists[ghb->gpu_specific.ze_cur_dev_inx]); + } + PSM3_ONEAPI_ZE_CALL(zeEventHostReset, ghb->gpu_specific.ze_copy_status); +} + +static void psm3_oneapi_ze_hostbuf_destroy(struct ips_gpu_hostbuf *ghb) +{ + int i; + + if (ghb->gpu_specific.ze_copy_status != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventDestroy, ghb->gpu_specific.ze_copy_status); + } + if (ghb->host_buf != NULL) { + psm3_oneapi_ze_host_free(ghb->host_buf); + } + if (ghb->gpu_specific.ze_event_pool != NULL) { + PSM3_ONEAPI_ZE_CALL(zeEventPoolDestroy, ghb->gpu_specific.ze_event_pool); + } + for (i = 0; i < MAX_ZE_DEVICES; i++) { + if (ghb->gpu_specific.ze_command_lists[i]) { + PSM3_ONEAPI_ZE_CALL( zeCommandListDestroy, ghb->gpu_specific.ze_command_lists[i]); + ghb->gpu_specific.ze_command_lists[i] = NULL; + } + } +} + +// synchronous GPU memcpy +static void psm3_oneapi_ze_memcpy_internal(void *dstptr, const void *srcptr, size_t size) +{ + struct psm3_oneapi_ze_dev_ctxt *ctxt; + + psmi_assert(size > 0); + ctxt = psm3_oneapi_ze_dev_ctxt_get(dstptr); + if (!ctxt) { + ctxt = psm3_oneapi_ze_dev_ctxt_get(srcptr); + if (!ctxt) { + _HFI_ERROR("dst %p src %p not GPU buf for copying\n", + dstptr, srcptr); + return; + } + } + if (psm3_oneapi_ze_immed_sync_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + } else { + PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, + 1, &ctxt->cl, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); + } +} + +// synchronous GPU memcpy DTOD (xeLink) +static void psm3_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size) +{ + struct psm3_oneapi_ze_dev_ctxt *ctxt; + + psmi_assert(size > 0); + ctxt = psm3_oneapi_ze_dev_ctxt_get(dstptr); + if (!ctxt) { + _HFI_ERROR("dst %p src %p not GPU buf for copying\n", + dstptr, srcptr); + return; + } + if (size <= psm3_oneapi_parallel_dtod_copy_thresh) { + if (psm3_oneapi_ze_immed_sync_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + } else { + PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, + dstptr, srcptr, size, NULL, 0, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, + 1, &ctxt->cl, NULL); + PSM3_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); + } + } else { + // for large DTOD copies, start 2 parallel commands + // then wait for both + size_t size0 = ROUNDUP64P2(size/2, 64*1024); + size_t size1 = size - size0; + + if (psm3_oneapi_ze_immed_sync_copy) { + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, + dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); + + PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, + (void*)((uintptr_t)dstptr+size0), + (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, + 0, NULL); + } else { + 
PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0,
+				dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0);
+			PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0,
+				1, &ctxt->async_cl0, NULL);
+
+			PSM3_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1,
+				(void*)((uintptr_t)dstptr+size0),
+				(void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1,
+				0, NULL);
+			PSM3_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1);
+			PSM3_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1,
+				1, &ctxt->async_cl1, NULL);
+		}
+		// 2nd copy may be slightly smaller so wait for it first so we
+		// can potentially hide its Reset latency while 1st copy completes
+		PSM3_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX);
+		PSM3_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1);
+
+		PSM3_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX);
+		PSM3_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0);
+	}
+}
+
+static void psm3_oneapi_ze_memcpy_DtoD(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_memcpy_HtoD(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_internal(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_memcpy_DtoH(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_internal(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_memcpy(void *dstptr, const void *srcptr, uint32_t len)
+{
+	psm3_oneapi_ze_memcpy_internal(dstptr, srcptr, len);
+}
+
+static void psm3_oneapi_ze_synchronize_memcpy(void)
+{
+	/* Not needed for OneAPI Level Zero */
+}
+
+static void psm3_oneapi_ze_mark_buf_synchronous(const void *buf)
+{
+	/* not needed for OneAPI ZE */
+}
+
+static int psm3_oneapi_ze_gpu_addr_send_mr(struct psm2_mq_req *mqreq)
+{
+#ifdef PSM3_USE_ONEAPI_MALLOC
+	// HOST_ALLOC memory treated as CPU memory for Verbs MRs
+	return (mqreq->is_buf_gpu_mem && ! mqreq->gpu_hostbuf_used);
+#else
+	// HOST_ALLOC memory treated as GPU memory for Verbs MRs
+	// Note: gpu_hostbuf_used only set if is_buf_gpu_mem
+	return mqreq->is_buf_gpu_mem &&
+		(! mqreq->gpu_hostbuf_used || psm3_oneapi_ze_using_zemem_alloc );
+#endif
+}
+
+static int psm3_oneapi_ze_gpu_addr_recv_mr(struct ips_tid_recv_desc *tidrecvc,
+					int gpu_hostbuf_used)
+{
+#ifdef PSM3_USE_ONEAPI_MALLOC
+	// HOST_ALLOC memory treated as CPU memory for Verbs MRs
+	return tidrecvc->is_ptr_gpu_backed;
+#else
+	// HOST_ALLOC memory treated as GPU memory for Verbs MRs
+	// Note: gpu_hostbuf_used only set if is_buf_gpu_mem
+	return tidrecvc->is_ptr_gpu_backed
+		|| (gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc);
+#endif
+}
+
+//***************************************************************************
+//OneAPI Level Zero support for PSM3_DEVICES "shm", via an IPC handle cache and
+//OneAPI Level Zero IPC
+//In platforms with xeLink between GPUs, OneAPI Level Zero IPC will use xeLink.
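As an orientation aid for the cache that follows: entries are keyed on the sender's normalized allocation start address plus its epid, a hit is re-validated against the Level Zero alloc_id (and handle) so stale mappings are dropped, and the least recently used entry is evicted when the cache is full. The sketch below models only that policy under stated assumptions; it is a self-contained simplification using a hypothetical flat array (toy_cache, toy_cache_get, toy_cache_put) instead of the rbtree and idle queue the code below builds, and only the alloc_id check of the validation step is modeled.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical, simplified model of the (epid, start, alloc_id) cache policy. */
struct toy_entry {
	uint64_t epid;      /* sender endpoint id (the real code uses psm2_epid_t) */
	uintptr_t start;    /* sender's normalized allocation start address */
	uint64_t alloc_id;  /* Level Zero allocation id, used to detect stale entries */
	void *mapped;       /* locally imported pointer */
	uint64_t last_use;  /* monotonic counter for LRU eviction */
	int valid;
};

#define TOY_CACHE_SIZE 64
static struct toy_entry toy_cache[TOY_CACHE_SIZE];
static uint64_t toy_clock;

/* Return the cached mapping, or NULL if the caller must import the handle and
 * then call toy_cache_put(). A hit whose alloc_id no longer matches is treated
 * as stale and dropped, mirroring the validate step in the real cache. */
void *toy_cache_get(uint64_t epid, uintptr_t start, uint64_t alloc_id)
{
	for (int i = 0; i < TOY_CACHE_SIZE; i++) {
		struct toy_entry *e = &toy_cache[i];
		if (!e->valid || e->epid != epid || e->start != start)
			continue;
		if (e->alloc_id != alloc_id) {	/* same address, new allocation */
			e->valid = 0;
			return NULL;
		}
		e->last_use = ++toy_clock;	/* refresh LRU position */
		return e->mapped;
	}
	return NULL;
}

/* Record a newly imported mapping, evicting the least recently used entry
 * when every slot is occupied. */
void toy_cache_put(uint64_t epid, uintptr_t start, uint64_t alloc_id, void *mapped)
{
	struct toy_entry *victim = &toy_cache[0];
	for (int i = 0; i < TOY_CACHE_SIZE; i++) {
		if (!toy_cache[i].valid) { victim = &toy_cache[i]; break; }
		if (toy_cache[i].last_use < victim->last_use)
			victim = &toy_cache[i];	/* least recently used so far */
	}
	victim->epid = epid;
	victim->start = start;
	victim->alloc_id = alloc_id;
	victim->mapped = mapped;
	victim->last_use = ++toy_clock;
	victim->valid = 1;
}

The real implementation additionally orders entries by epid ahead of the start address in its comparator and releases the imported mapping (zeMemFree via psm3_oneapi_ze_memhandle_delete) whenever an entry is evicted or invalidated.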
+ +#define ONEAPI_MEMHANDLE_CACHE_SIZE 64 + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +/* + * rbtree cruft + */ +struct _cl_map_item; + +struct psm3_oneapi_ze_memhandle_cache; +typedef struct psm3_oneapi_ze_memhandle_cache *psm3_oneapi_ze_memhandle_cache_t; + +typedef struct +{ + unsigned long start; /* start(base) virtual address + in peer process */ + uint32_t ze_handle; /* Sender's GEM handle or fd */ + uint64_t alloc_id; /* ze alloc_id */ + void *buf_ptr; /* buffer pointer in this + process */ + psm2_epid_t epid; + struct _cl_map_item* i_prev; /* idle queue previous */ + struct _cl_map_item* i_next; /* idle queue next */ + psm3_oneapi_ze_memhandle_cache_t cache; /* only for gem_handle close */ +}__attribute__ ((aligned (128))) psm3_rbtree_oneapi_ze_memhandle_cache_mapitem_pl_t; + +typedef struct { + uint32_t nelems; /* number of elements in the cache */ +} psm3_rbtree_oneapi_ze_memhandle_cache_map_pl_t; + +static psm2_error_t psm3_oneapi_ze_memhandle_mpool_alloc( + psm3_oneapi_ze_memhandle_cache_t cache, uint32_t memcache_size); +static void psm3_oneapi_ze_memhandle_delete(void *buf_ptr); + +/* + * Custom comparator + */ +typedef psm3_rbtree_oneapi_ze_memhandle_cache_mapitem_pl_t psm3_oneapi_ze_cache_item; + +static int psm3_oneapi_ze_cache_key_cmp(const psm3_oneapi_ze_cache_item *a, + const psm3_oneapi_ze_cache_item *b) +{ + // we use epid as part of cache key so multi-ep and multi-process jobs + // can have a better cache hit rate. In some cases we may end up with + // cache entries for the same buffer with different epid's all within the + // same multi-ep rank, but this does no harm other than to waste some + // cache space. By including epid in key_cmp we have a chance to have + // separate cache entries for the same sbuf address in different + // sender's GPU virtual address space. + switch (psm3_epid_cmp_internal(a->epid, b->epid)) { + case -1: return -1; + case 1: return 1; + default: + break; + } + + // The sender has used zeMemGetAddressRange to normalize the address + // so we can simply compare the start address of the allocation. 
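+	// The -1/0/1 result is what RBTREE_CMP below expects, so entries end up
+	// ordered first by epid and then by start address.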
+ // Note zeMemOpenIpcHandle only needs the start address as well, so we + // ignore length + if (a->start < b->start) + return -1; + if (b->start < a->start) + return 1; + + return 0; +} + + +/* + * Necessary rbtree cruft + */ +#define RBTREE_MI_PL psm3_rbtree_oneapi_ze_memhandle_cache_mapitem_pl_t +#define RBTREE_MAP_PL psm3_rbtree_oneapi_ze_memhandle_cache_map_pl_t +#define RBTREE_CMP(a,b) psm3_oneapi_ze_cache_key_cmp((a), (b)) +#define RBTREE_ASSERT psmi_assert +#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) +#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR + +#include "psm3_rbtree.h" +#include "psm3_rbtree.c" + +/* + * Convenience rbtree cruft + */ +#define NELEMS(cache) ((cache)->map.payload.nelems) + +#define IHEAD(cache) ((cache)->map.root) +#define LAST(cache) (IHEAD(cache)->payload.i_prev) +#define FIRST(cache) (IHEAD(cache)->payload.i_next) +#define INEXT(x) ((x)->payload.i_next) +#define IPREV(x) ((x)->payload.i_prev) + +/* + * Actual module data + */ +struct psm3_oneapi_ze_memhandle_cache { + cl_qmap_t map; + mpool_t mpool; + uint32_t size; + psm2_mq_stats_t *stats; +}; + +static void psm3_print_oneapi_ze_memhandle_cache_stats(psm2_mq_stats_t *stats) +{ + _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", + stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, + stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, + stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, + stats->gpu_ipc_cache_clear); +} + +/* + * This is the callback function when mempool are resized or destroyed. + * Upon calling cache free mpool is destroyed which in turn calls this callback + * which helps in closing all memhandles. + * TBD - only called for !is_alloc when destroying so could avoid keeping + * cache pointer in memcache_item. But when GEM_CLOSE is not needed + * memhandle_delete won't need destroyng flag and can remove cache pointer then + */ +static void +psm3_oneapi_ze_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) +{ + cl_map_item_t* memcache_item = (cl_map_item_t*)obj; + if (!is_alloc) { + if(memcache_item->payload.start) + psm3_oneapi_ze_memhandle_delete(memcache_item->payload.buf_ptr); + } +} + +/* + * Creating mempool for ze memhandle cache nodes. + */ +static psm2_error_t +psm3_oneapi_ze_memhandle_mpool_alloc(psm3_oneapi_ze_memhandle_cache_t cache, + uint32_t memcache_size) +{ + psm2_error_t err; + if (memcache_size < 1) + return PSM2_PARAM_ERR; + + cache->size = memcache_size; + /* Creating a memory pool of size PSM3_ONEAPI_MEMCACHE_SIZE + * which includes the Root and NIL items + */ + cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), + cache->size, + cache->size, 0, + UNDEFINED, NULL, NULL, + psm3_oneapi_ze_memhandle_cache_alloc_func, + NULL); + if (cache->mpool == NULL) { + err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, + "Couldn't allocate ONEAPI host receive buffer pool"); + return err; + } + return PSM2_OK; +} + +/* + * allocate and initialize memhandle cache + * including rbtree. + */ +static psm2_error_t psm3_oneapi_ze_memhandle_cache_alloc(psm3_oneapi_ze_memhandle_cache_t *cachep, + uint32_t memcache_size, + psm2_mq_stats_t *stats) +{ + cl_map_item_t *root = NULL, *nil_item = NULL; + + *cachep = (psm3_oneapi_ze_memhandle_cache_t)psmi_calloc( + NULL, UNDEFINED, 1, sizeof(**cachep)); + if (! 
*cachep) + return PSM2_NO_MEMORY; + + psm2_error_t err = psm3_oneapi_ze_memhandle_mpool_alloc( + *cachep, memcache_size); + if (err != PSM2_OK) + return err; + + root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (root == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); + if (nil_item == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + nil_item->payload.start = 0; + nil_item->payload.epid = psm3_epid_zeroed_internal(); + ips_cl_qmap_init(&(*cachep)->map,root,nil_item); + NELEMS(*cachep) = 0; + + (*cachep)->stats = stats; + + stats->gpu_ipc_cache_limit = memcache_size; + stats->gpu_ipc_cache_nelems = 0; + stats->gpu_ipc_cache_max_nelems = 0; + stats->gpu_ipc_cache_hit = 0; + stats->gpu_ipc_cache_miss = 0; + stats->gpu_ipc_cache_evict = 0; + stats->gpu_ipc_cache_remove = 0; + stats->gpu_ipc_cache_clear = 0; + + return PSM2_OK; + +fail: + if (nil_item) + psmi_free(nil_item); + if (root) + psmi_free(root); + if ((*cachep)->mpool) + psm3_mpool_destroy((*cachep)->mpool); + psmi_free(*cachep); + return err; +} + +static void psm3_oneapi_ze_memhandle_cache_free(psm3_oneapi_ze_memhandle_cache_t cache) +{ + psm3_print_oneapi_ze_memhandle_cache_stats(cache->stats); + + if (cache->map.nil_item) + psmi_free(cache->map.nil_item); + if (cache->map.root) + psmi_free(cache->map.root); + if (cache->mpool) + psm3_mpool_destroy(cache->mpool); + psmi_free(cache); +} + +/* + * Insert at the head of Idleq. + */ +static void +psm3_oneapi_ze_idleq_insert(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == NULL) { + FIRST(cache) = memcache_item; + LAST(cache) = memcache_item; + return; + } + INEXT(FIRST(cache)) = memcache_item; + IPREV(memcache_item) = FIRST(cache); + FIRST(cache) = memcache_item; + INEXT(FIRST(cache)) = NULL; + return; +} + +/* + * Remove least recent used element. + */ +static void +psm3_oneapi_ze_idleq_remove_last(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (!INEXT(memcache_item)) { + LAST(cache) = NULL; + FIRST(cache) = NULL; + } else { + LAST(cache) = INEXT(memcache_item); + IPREV(LAST(cache)) = NULL; + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_oneapi_ze_idleq_remove(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (LAST(cache) == memcache_item) { + psm3_oneapi_ze_idleq_remove_last(cache, memcache_item); + } else if (FIRST(cache) == memcache_item) { + FIRST(cache) = IPREV(memcache_item); + INEXT(FIRST(cache)) = NULL; + } else { + INEXT(IPREV(memcache_item)) = INEXT(memcache_item); + IPREV(INEXT(memcache_item)) = IPREV(memcache_item); + } + // Null-out now-removed memcache_item's next and prev pointers out of + // an abundance of caution + INEXT(memcache_item) = IPREV(memcache_item) = NULL; +} + +static void +psm3_oneapi_ze_idleq_reorder(psm3_oneapi_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) +{ + if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { + return; + } + psm3_oneapi_ze_idleq_remove(cache, memcache_item); + psm3_oneapi_ze_idleq_insert(cache, memcache_item); + return; +} + +/* + * After a successful cache hit, item is validated by doing a + * memcmp on the handle stored and the handle we receive from the + * sender. 
If the validation fails the item is removed from the idleq, + * the rbtree, is put back into the mpool and ZeMemCloseIpcHandle function + * is called. + * Level Zero's alloc_id will be unique per allocation, even if the allocation + * was at the same address. In some cases, but not always, the ipc_handle + * will also be different. So we validate both, although just checking alloc_id + * would be sufficient. + */ + +static psm2_error_t +psm3_oneapi_ze_memhandle_cache_validate(psm3_oneapi_ze_memhandle_cache_t cache, + cl_map_item_t* memcache_item, + uintptr_t sbuf, uint32_t handle, + psm2_epid_t epid, uint64_t alloc_id) +{ + psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); + psmi_assert(sbuf == memcache_item->payload.start); + if (handle == memcache_item->payload.ze_handle && + alloc_id == memcache_item->payload.alloc_id) { + return PSM2_OK; + } + _HFI_DBG("cache remove stale entry: new start=%lu,handle=%u,alloc_id=%lu\n", + sbuf, handle, alloc_id); + + cache->stats->gpu_ipc_cache_remove++; + ips_cl_qmap_remove_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems--; + psm3_oneapi_ze_memhandle_delete(memcache_item->payload.buf_ptr); + psm3_oneapi_ze_idleq_remove(cache, memcache_item); + memset(memcache_item, 0, sizeof(*memcache_item)); + psm3_mpool_put(memcache_item); + return PSM2_OK_NO_PROGRESS; +} + +/* + * Current eviction policy: Least Recently Used. + */ +static void +psm3_oneapi_ze_memhandle_cache_evict(psm3_oneapi_ze_memhandle_cache_t cache) +{ + cache->stats->gpu_ipc_cache_evict++; + cl_map_item_t *p_item = LAST(cache); + _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=%p,it=%p) from ze_memhandle_cachemap.\n", + psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, + p_item->payload.buf_ptr, p_item); + ips_cl_qmap_remove_item(&cache->map, p_item); + cache->stats->gpu_ipc_cache_nelems--; + psm3_oneapi_ze_memhandle_delete(p_item->payload.buf_ptr); + psm3_oneapi_ze_idleq_remove_last(cache, p_item); + memset(p_item, 0, sizeof(*p_item)); + psm3_mpool_put(p_item); +} + +static psm2_error_t +psm3_oneapi_ze_memhandle_cache_register(psm3_oneapi_ze_memhandle_cache_t cache, + uintptr_t sbuf, uint32_t handle, + psm2_epid_t epid, + void *buf_ptr, uint64_t alloc_id) +{ + if (NELEMS(cache) == cache->size) + psm3_oneapi_ze_memhandle_cache_evict(cache); + + cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); + /* memcache_item cannot be NULL as we evict + * before the call to mpool_get. Check has + * been fixed to help with klockwork analysis. 
+ */ + if (memcache_item == NULL) + return PSM2_NO_MEMORY; + memcache_item->payload.start = sbuf; + memcache_item->payload.ze_handle = handle; + memcache_item->payload.buf_ptr = buf_ptr; + memcache_item->payload.alloc_id = alloc_id; + memcache_item->payload.epid = epid; + memcache_item->payload.cache = cache; + ips_cl_qmap_insert_item(&cache->map, memcache_item); + cache->stats->gpu_ipc_cache_nelems++; + if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) + cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; + psm3_oneapi_ze_idleq_insert(cache, memcache_item); + _HFI_VDBG("registered: handle %u sbuf 0x%lx ptr %p alloc_id %lu\n", + handle, sbuf, buf_ptr, alloc_id); + return PSM2_OK; +} + +#ifndef PSM_HAVE_PIDFD +static inline psm2_error_t psm3_oneapi_ze_prepare_fds_for_ipc_import( + uint32_t gem_handle, int device_index, int *ipc_fd, + psm2_epaddr_t epaddr) +{ + am_epaddr_t *am_epaddr = (am_epaddr_t*)epaddr; + int fd; + struct drm_prime_handle open_fd = {0, 0, -1}; + + if (device_index >= psm3_num_oneapi_ze_devices || device_index < 0) { + _HFI_ERROR("psm3_oneapi_ze_memhandle_acquire received invalid device_index from peer: %d\n", + device_index); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "device_index " + "invalid - received from peer: %d", + device_index); + return PSM2_INTERNAL_ERR; + } + fd = am_epaddr->gpu_specific.ze_peer_fds[device_index]; + psm3_oneapi_ze_cur_dev = &psm3_oneapi_ze_devices[device_index]; + open_fd.flags = DRM_CLOEXEC | DRM_RDWR; + open_fd.handle = gem_handle; + if (ioctl(fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &open_fd) < 0) { + _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_HANDLE_TO_FD: %s\n", strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "ioctl " + "failed for DRM_IOCTL_PRIME_HANDLE_TO_FD errno=%d", + errno); + return PSM2_INTERNAL_ERR; + } + *ipc_fd = open_fd.fd; + + return PSM2_OK; +} +#else +static inline psm2_error_t psm3_oneapi_ze_prepare_fds_for_ipc_import( + uint32_t handle, int device_index, int *ipc_fd, + psm2_epaddr_t epaddr) +{ + int fd; + am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; + + fd = syscall(__NR_pidfd_getfd, am_epaddr->gpu_specific.ze_pidfd, handle, 0); + if (fd < 0) { + _HFI_ERROR("pidfd_getfd failed %d: %s\n", fd, strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "pidfd_getfd failed errno=%d (%s)", + errno, strerror(errno)); + return PSM2_INTERNAL_ERR; + } + *ipc_fd = fd; + + return PSM2_OK; +} +#endif /* PSM_HAVE_PIDFD */ +#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +static void *psm3_oneapi_ze_import_ipc_buf(uint32_t fd, uint8_t alloc_type) +{ + ze_external_memory_import_fd_t import_desc = {}; + void *ze_ipc_buf = NULL; + + import_desc.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; + import_desc.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; + import_desc.fd = fd; + + switch(alloc_type) { + case ZE_MEMORY_TYPE_HOST: + { + ze_host_mem_alloc_desc_t host_desc = {}; + + host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; + host_desc.pNext = &import_desc; + /* size & alignment are not used since this is an import.*/ + PSM3_ONEAPI_ZE_CALL(zeMemAllocHost, psm3_oneapi_ze_context, &host_desc, + 0, 0, &ze_ipc_buf); + } + break; + case ZE_MEMORY_TYPE_DEVICE: + { + ze_device_mem_alloc_desc_t dev_desc = {}; + + dev_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; + dev_desc.pNext = &import_desc; + /* size & alignment are not used since this is an import. 
*/ + PSM3_ONEAPI_ZE_CALL(zeMemAllocDevice, psm3_oneapi_ze_context, &dev_desc, + 0, 0, psm3_oneapi_ze_cur_dev->dev, &ze_ipc_buf); + } + break; + default: + _HFI_ERROR("Invalid alloc_type %u for fd %u\n", + alloc_type, fd); + return NULL; + } + + return ze_ipc_buf; +} +#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ + +/* + * The key used to search the cache is the senders buf address pointer and + * epid. The sender will have used zeMemGetAddressRange + * to find the start of the memory containing the buffer (supplied as sbuf) + * Upon match, we must validate the entry we find and may need to replace it. + */ +static void * +psm3_oneapi_ze_memhandle_acquire(psm3_oneapi_ze_memhandle_cache_t cache, + uintptr_t sbuf, uint32_t handle, + psm2_epaddr_t epaddr, int device_index, + uint64_t alloc_id, uint8_t alloc_type) +{ + void *buf_ptr = NULL; + psm2_epid_t epid = epaddr->epid; +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + int ipc_fd = -1; +#endif + _HFI_VDBG("sbuf=%lu,handle=%u,epid=%s\n", + sbuf, handle, psm3_epid_fmt_internal(epid, 0)); +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + + if (!cache) { + if (psm3_oneapi_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, + epaddr) == PSM2_OK) { + buf_ptr = psm3_oneapi_ze_import_ipc_buf(ipc_fd, alloc_type); + if (ipc_fd >= 0) { + if (close(ipc_fd) < 0) { + _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "close " + "failed for ipc_fd %d errno=%d", + ipc_fd, errno); + return NULL; + } + } + } + return buf_ptr; + } + + psm3_oneapi_ze_cache_item key = { + .start = (unsigned long) sbuf, + .epid = epid + }; + + /* + * preconditions: + * 1) buffer [start,epid) may or may not be in cache->map already + * 2) there are no duplicate entries in cache->map + * postconditions: + * 1) buffer is in cache->map with same handle, epid, alloc_id + * 2) there are no duplicate entries in cache->map + * + * The key used to search the cache is the senders buf address pointer + * and epid. + * Upon a succesful hit in the cache, additional validation is required + * as the handle or alloc_id could be stale. + */ + cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); + if (p_item->payload.start) { + // confirm the entry for sbuf matches the handle and is not stale + if (psm3_oneapi_ze_memhandle_cache_validate(cache, p_item, sbuf, handle, + epid, alloc_id) == PSM2_OK) { + cache->stats->gpu_ipc_cache_hit++; + psm3_oneapi_ze_idleq_reorder(cache, p_item); + return p_item->payload.buf_ptr; + } + + // buffer found was stale am_oneapi_memhandle_cache_validate() + // closed and removed existing entry. + // Should find no more duplicates +#ifdef PSM_DEBUG + p_item = ips_cl_qmap_searchv(&cache->map, &key); + psmi_assert(! 
p_item->payload.start); +#endif + } + cache->stats->gpu_ipc_cache_miss++; + + if (psm3_oneapi_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, + epaddr) == PSM2_OK) { + buf_ptr = psm3_oneapi_ze_import_ipc_buf(ipc_fd, alloc_type); + if (ipc_fd >= 0) { + if (close(ipc_fd) < 0) { + _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); + psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "close " + "failed for ipc_fd %d errno=%d", + ipc_fd, errno); + return NULL; + } + } + if (!buf_ptr) + return NULL; + } else { + return NULL; + } + + psm3_oneapi_ze_memhandle_cache_register(cache, sbuf, handle, epid, buf_ptr, + alloc_id); + return buf_ptr; +#else // if no drm, set up to return NULL as oneapi ipc handles don't work without drm +// TBD - caller will assert when we return NULL, so should it be a build error +// if DRM not available? What works without DRM? + buf_ptr = NULL; + return buf_ptr; +#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) + +} + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +static void psm3_oneapi_ze_memhandle_delete(void *buf_ptr) +{ + /* Release the reference to the buffer */ + PSM3_ONEAPI_ZE_CALL(zeMemFree, psm3_oneapi_ze_context, buf_ptr); + +#ifndef PSM_HAVE_PIDFD + /* + * If pidfd is not used, we need to call GEM_CLOSE ioctl to remove the + * GEM handle from the handle cache of the peer device file's + * private file data in the kernel to avoid handle leak. However, we + * will have a potential risk condition that will fail a later request: + * (1) 3 requests with buf1, buf2, and buf1 are sent from sender side. + * Requests 1 and 3 uses the same buffer and therefore have the + * same gem_handle1. + * (2) buf1 is received and put into cache; + * (3) buf2 is received and buf1 is evicted from cache due to some + * condition (small cache size). As a result, gem_handle1 is closed + * through GEM_CLOSE ioctl. buf2 is put into cache. + * (4) Request 3 (with buf1) is received and HANDLE_TO_FD ioctl will + * fail because the gem_handle has been removed from peer device + * file's handle cache. + * For this reason, we prefer to leak the GEM handle over calling + * GEM_CLOSE. + */ +#endif +} +#endif /* HAVE_DRM or HAVE_LIBDRM */ + +static void +psm3_oneapi_ze_memhandle_release(psm3_oneapi_ze_memhandle_cache_t cache, + void *buf_ptr) +{ +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + if (!cache) + psm3_oneapi_ze_memhandle_delete(buf_ptr); +#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) + return; +} + +// end of oneAPI Level Zero IPC MemHandle Cache +//*************************************************************************** + +// IPC Handle management for OneAPI Level Zero + +#ifndef PSM_HAVE_PIDFD +/* + * psm3_onapi_ze_init_fds - initialize the file descriptors (ze_dev_fds) + * + * Open the file descriptors for our GPUs (psm3_ze_dev_fds[]) + * + * The file descriptors are used in intra-node communication to pass to peers + * via socket with sendmsg/recvmsg SCM_RIGHTS message type. + * + */ + +static psm2_error_t psm3_onapi_ze_init_fds(void) +{ + const char *dev_dir = "/dev/dri/by-path/"; + const char *suffix = "-render"; + DIR *dir; + struct dirent *ent = NULL; + char dev_name[NAME_MAX]; + int i = 0, ret; + + if (psm3_num_ze_dev_fds) + return PSM2_OK; + + dir = opendir(dev_dir); + if (dir == NULL) + return PSM2_INTERNAL_ERR; + + while ((ent = readdir(dir)) != NULL) { + if (ent->d_name[0] == '.' 
|| + strstr(ent->d_name, suffix) == NULL) + continue; + + memset(dev_name, 0, sizeof(dev_name)); + ret = snprintf(dev_name, NAME_MAX, "%s%s", dev_dir, ent->d_name); + if (ret < 0 || ret >= NAME_MAX) { + _HFI_INFO("GPU dev name too long: %s%s\n", dev_dir, ent->d_name); + goto err; + } + + psm3_ze_dev_fds[i] = open(dev_name, O_RDWR); + if (psm3_ze_dev_fds[i] == -1) { + _HFI_INFO("Failed to open %s GPU dev FD: %s\n", dev_name, + strerror(errno)); + goto err; + } + _HFI_DBG("Opened %s GPU dev FD: %d\n", dev_name, + psm3_ze_dev_fds[i]); + i++; + psm3_num_ze_dev_fds++; + } + (void) closedir(dir); + _HFI_DBG("Opened %d GPU dev FDs\n", psm3_num_ze_dev_fds); + return PSM2_OK; + +err: + (void) closedir(dir); + return PSM2_INTERNAL_ERR; +} + +/* + * psm3_oneapi_ze_get_dev_fds - fetch device file descriptors + * + * Returns a pointer to ze_dev_fds while putting the number + * of fds into the in/out nfds parameter + * + */ + +static int *psm3_oneapi_ze_get_dev_fds(int *nfds) +{ + *nfds = psm3_num_ze_dev_fds; + return psm3_ze_dev_fds; +} + +/* + * psm3_oneapi_ze_sendmsg_fds - send device file descriptors over socket w/ sendmsg + * + * Prepares message of type SCM_RIGHTS, copies file descriptors as payload, + * and sends over socket via sendmsg while creating appropriate fd numbers + * for dest (effectively a dup(2) of our file descriptor) + * + * returns -errno on error or number of bytes sent (>0) on success + */ + +static int psm3_oneapi_ze_sendmsg_fds(int sock, int *fds, int nfds, psm2_epid_t epid) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int64_t peer_id = *(int64_t *)&epid; + char *ctrl_buf; + size_t ctrl_size; + int ret; + + ctrl_size = sizeof(*fds) * nfds; + ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); + if (!ctrl_buf) + return -ENOMEM; + + iov.iov_base = &peer_id; + iov.iov_len = sizeof(peer_id); + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = CMSG_SPACE(ctrl_size); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + cmsg = CMSG_FIRSTHDR(&msg); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(ctrl_size); + memcpy(CMSG_DATA(cmsg), fds, ctrl_size); + + ret = sendmsg(sock, &msg, 0); + if (ret < 0) + ret = -errno; + else if (! ret) + ret = -EAGAIN; + + psmi_free(ctrl_buf); + return ret; +} + +/* + * psm3_oneapi_ze_recvmsg_fds - receive device file descriptors from socket w/ recvmsg + * + * Prepares message buffer of type SCM_RIGHTS, receives message from socket + * via recvmsg, and copies device file descriptors to in/out parameter. 
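+ * The caller passes nfds equal to the count the peer sent; the SCM_RIGHTS
+ * control message is expected to be exactly CMSG_LEN(nfds * sizeof(int)) long.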
+ * The received file descriptors are usable in our process and need to + * be closed when done being used + * + * returns -errno on error or number of bytes received (>0) on success + */ + +static int psm3_oneapi_ze_recvmsg_fd(int sock, int *fds, int nfds, psm2_epid_t epid) +{ + struct msghdr msg; + struct cmsghdr *cmsg; + struct iovec iov; + int64_t peer_id = *(int64_t *)&epid; + char *ctrl_buf; + size_t ctrl_size; + int ret; + + ctrl_size = sizeof(*fds) * nfds; + ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); + if (!ctrl_buf) + return -ENOMEM; + + iov.iov_base = &peer_id; + iov.iov_len = sizeof(peer_id); + + memset(&msg, 0, sizeof(msg)); + msg.msg_control = ctrl_buf; + msg.msg_controllen = CMSG_SPACE(ctrl_size); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + ret = recvmsg(sock, &msg, 0); + if (ret < 0) { + ret = -errno; + } else if (ret != sizeof(peer_id)) { + _HFI_CONNDBG("recvmsg from: %s returns %d expect %u\n", + psm3_epid_fmt_addr(epid, 0), ret, + (unsigned)sizeof(peer_id) ); + ret = -EAGAIN; + goto out; + } + + psmi_assert(!(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))); + cmsg = CMSG_FIRSTHDR(&msg); + psmi_assert(cmsg && cmsg->cmsg_len == CMSG_LEN(ctrl_size) && + cmsg->cmsg_level == SOL_SOCKET && + cmsg->cmsg_type == SCM_RIGHTS && CMSG_DATA(cmsg)); + memcpy(fds, CMSG_DATA(cmsg), ctrl_size); +out: + psmi_free(ctrl_buf); + return ret; +} + +/* + * psm3_onapi_ze_init_ipc_socket - initialize ipc socket in ep + * + * Set up the AF_UNIX ipc socket in the ep for listen mode. Name it + * using our epid, and bind it. + * + */ + +static psm2_error_t psm3_onapi_ze_init_ipc_socket(struct ptl_am *ptl) +{ + psm2_error_t err = PSM2_OK; + int ret; + struct sockaddr_un sockaddr = {0}; + socklen_t len = sizeof(sockaddr); + + if ((ptl->gpu_specific.ze_ipc_socket = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + sockaddr.sun_family = AF_UNIX; + snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", + (long int) getuid(), psm3_epid_fmt_internal(ptl->epid, 0)); + ptl->gpu_specific.ze_listen_sockname = psmi_strdup(NULL, sockaddr.sun_path); + if (ptl->gpu_specific.ze_listen_sockname == NULL) { + err = PSM2_NO_MEMORY; + goto fail; + } + + if ((ret = bind(ptl->gpu_specific.ze_ipc_socket, (struct sockaddr *) &sockaddr, len)) < 0) { + _HFI_ERROR("error binding GPU dev FDs AF_UNIX sock to %s: %s\n", + sockaddr.sun_path, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + + if ((ret = listen(ptl->gpu_specific.ze_ipc_socket, 256)) < 0) { + _HFI_ERROR("error listening on GPU dev FDs AF_UNIX sock %s: %s\n", + sockaddr.sun_path, strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + return PSM2_OK; + +fail: + if (ptl->gpu_specific.ze_ipc_socket >= 0) + close(ptl->gpu_specific.ze_ipc_socket); + ptl->gpu_specific.ze_ipc_socket = -1; + if (ptl->gpu_specific.ze_listen_sockname) + psmi_free(ptl->gpu_specific.ze_listen_sockname); + ptl->gpu_specific.ze_listen_sockname = NULL; + return err; +} + +/* + * psm3_oneapi_ze_receive_dev_fds - receive the dev fds on the listen socket + * + * Set up the listen socket to be polled for POLLIN. When the event is + * received, accept for the new socket and then read the peer epid, + * and locate the epaddr for it. Then receive the dev fds to be stored + * in the am_epaddr. 
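+ * The poll() below uses a zero timeout so repeated calls do not block when
+ * no incoming connection is pending.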
+ * + * returns: + * PSM2_OK - GPU dev FDs received from a peer + * PSM2_OK_NO_PROGRESS - nothing received + * other - error + */ + +static psm2_error_t psm3_oneapi_ze_receive_dev_fds(struct ptl_am *ptl) +{ + psm2_error_t err = PSM2_OK; + struct pollfd fdset; + int newsock = -1; + + fdset.fd = ptl->gpu_specific.ze_ipc_socket; + fdset.events = POLLIN; + + if (poll(&fdset, 1, 0) <= 0) + return PSM2_OK_NO_PROGRESS; + + { + struct sockaddr_un sockaddr = {0}; + socklen_t len = sizeof(sockaddr); + int nfds = psm3_num_ze_dev_fds; + int nread; + psm2_epid_t epid; + psm2_epaddr_t epaddr; + am_epaddr_t *am_epaddr; + + newsock = accept(ptl->gpu_specific.ze_ipc_socket, (struct sockaddr *)&sockaddr, &len); + if (newsock < 0) { + _HFI_ERROR("GPU dev FDs AF_UNIX accept failed: %s\n", + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } else { + int ret; + // technically we could get less than we asked for and need to + // call recv again in future but our transfers are small enough + // we should get it all + if ((nread = recv(newsock, &epid, sizeof(epid), 0)) < 0) { + _HFI_ERROR("GPU dev FDs AF_UNIX recv failed: %s\n", + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + if (nread != sizeof(epid)) { + _HFI_ERROR("GPU dev FDs AF_UNIX recv incomplete: %d\n", nread); + err = PSM2_INTERNAL_ERR; + goto fail; + } + // we only poll for recv FDs after processing a am_shm connect + // so the epid should always be known + if ((epaddr = psm3_epid_lookup(ptl->ep, epid)) == NULL) { + _HFI_ERROR("Peer Unknown, unable to receive GPU dev FDs from: %s\n", + psm3_epid_fmt_addr(epid, 0)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + am_epaddr = (am_epaddr_t *)epaddr; + am_epaddr->gpu_specific.ze_num_peer_fds = nfds; + ret = psm3_oneapi_ze_recvmsg_fd(newsock, am_epaddr->gpu_specific.ze_peer_fds, nfds, ptl->epid); + if (ret <= 0) { + _HFI_ERROR("Unable to recvmsg %d GPU dev FDs from: %s: %s\n", + nfds, psm3_epid_fmt_addr(epid, 0), + strerror(-ret)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + _HFI_CONNDBG("%d GPU dev FDs Received from: %s\n", + nfds, psm3_epid_fmt_addr(epid, 0)); + } + } + +fail: + if (newsock >= 0) + close(newsock); + return err; +} + +/* + * psm3_oneapi_ze_send_dev_fds - do next step to send the dev fds to the peer's + * listen socket + * + * Check the connected state and proceed accordingly: + * - ZE_SOCK_NOT_CONNECTED + * We have not done anything yet, so connect and send our epid, + * followed by the dev fds. Set state to ZE_SOCK_DEV_FDS_SENT + * - ZE_SOCK_DEV_FDS_SENT + * The dev fds have been sent. Issue ioctl to see if the output + * queue has been emptied indicating that the peer has read the data. + * If so, set state to ZE_SOCK_DEV_FDS_SENT_AND_RECD. + * - ZE_SOCK_DEV_FDS_SENT_AND_RECD + * We are done, just return. 
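Editor's note: the ZE_SOCK_DEV_FDS_SENT step described above decides completion by asking the kernel whether the peer has drained our send queue. A small sketch of that SIOCOUTQ check in isolation (the helper name is illustrative):

#include <linux/sockios.h>  /* SIOCOUTQ */
#include <sys/ioctl.h>

/* Sketch of the "has the peer read our data yet?" test used for the
 * ZE_SOCK_DEV_FDS_SENT state: SIOCOUTQ reports how many bytes are still
 * queued in the socket's send buffer, so 0 means the peer has consumed
 * everything we sent and the fd exchange can be marked complete. */
static int send_queue_drained(int sock)
{
	int pending = 0;

	if (ioctl(sock, SIOCOUTQ, &pending) != 0)
		return -1;                      /* unknown; caller keeps polling */
	return pending == 0;
}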
+ * + * returns: + * PSM2_OK - next step completed + * PSM2_OK_NO_PROGRESS - nothing to do + * other - error + */ + +static psm2_error_t psm3_oneapi_ze_send_dev_fds(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ + switch (am_epaddr->gpu_specific.ze_sock_connected_state) { + case ZE_SOCK_DEV_FDS_SENT_AND_RECD: + return PSM2_OK_NO_PROGRESS; + break; + + case ZE_SOCK_DEV_FDS_SENT: + { + int pending; + + psmi_assert(am_epaddr->gpu_specific.ze_sock >= 0); + if_pf (ioctl(am_epaddr->gpu_specific.ze_sock, SIOCOUTQ, &pending) != 0) { + return psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, + "error sending dev FDs: %s\n", strerror(errno)); + } + if (pending == 0) { + am_epaddr->gpu_specific.ze_sock_connected_state = ZE_SOCK_DEV_FDS_SENT_AND_RECD; + _HFI_CONNDBG("GPU dev FDs Send Completed to: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0)); + close(am_epaddr->gpu_specific.ze_sock); + am_epaddr->gpu_specific.ze_sock = -1; + return PSM2_OK; + } + // be paranoid just in case 1st call to send_dev_fds for given + // epaddr gets here + if (! ptl->gpu_specific.ze_need_dev_fds_poll) + _HFI_CONNDBG("restart GPU dev FDs poll\n"); + ptl->gpu_specific.ze_need_dev_fds_poll = 1; + return PSM2_OK_NO_PROGRESS; + break; + } + + case ZE_SOCK_NOT_CONNECTED: + { + struct sockaddr_un sockaddr = {0}; + socklen_t len = sizeof(sockaddr); + psm2_epid_t peer_epid = am_epaddr->epaddr.epid; + int *fds, nfds; + + if (!ptl->gpu_specific.ze_need_dev_fds_poll) + _HFI_CONNDBG("restart GPU dev FDs poll\n"); + ptl->gpu_specific.ze_need_dev_fds_poll = 1; + + fds = psm3_oneapi_ze_get_dev_fds(&nfds); + + if ((am_epaddr->gpu_specific.ze_sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { + _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", + strerror(errno)); + goto fail; + } + + sockaddr.sun_family = AF_UNIX; + snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", + (long int) getuid(), psm3_epid_fmt_internal(peer_epid, 0)); + + if (connect(am_epaddr->gpu_specific.ze_sock, (struct sockaddr *) &sockaddr, len) < 0) { + _HFI_ERROR("GPU dev FDs connect to %s (via %s) failed: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path, strerror(errno)); + goto fail; + } else { + int ret; + ret = send(am_epaddr->gpu_specific.ze_sock, &ptl->epid, sizeof(ptl->epid), 0); + if (ret < 0) { + _HFI_ERROR("GPU dev FDs send to %s (via %s) failed: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path, strerror(errno)); + goto fail; + } + + ret = psm3_oneapi_ze_sendmsg_fds(am_epaddr->gpu_specific.ze_sock, fds, nfds, peer_epid); + if (ret <= 0) { + /* ret is -errno */ + _HFI_ERROR("GPU dev FDs sendmsg to %s (via %s) failed: %s\n", + psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path, strerror(-ret)); + goto fail; + } + am_epaddr->gpu_specific.ze_sock_connected_state = ZE_SOCK_DEV_FDS_SENT; + _HFI_CONNDBG("%d GPU dev FDs Posted Send to: %s (via %s)\n", + nfds, psm3_epid_fmt_addr(am_epaddr->epaddr.epid, 0), + sockaddr.sun_path); + return PSM2_OK; + } + /* NOTREACHED */ + break; + } + + default: + return PSM2_INTERNAL_ERR; + break; + } + /* NOTREACHED */ + return PSM2_INTERNAL_ERR; + +fail: + if (am_epaddr->gpu_specific.ze_sock >= 0) + close(am_epaddr->gpu_specific.ze_sock); + am_epaddr->gpu_specific.ze_sock = -1; + return PSM2_INTERNAL_ERR; +} + +// simple test if dev_fds bi-dir exchange completed for given epaddr +// 1 = yes, 0 = no +static int psm3_oneapi_ze_dev_fds_exchanged(struct am_epaddr *am_epaddr) +{ + return 
(am_epaddr->gpu_specific.ze_sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD + && am_epaddr->gpu_specific.ze_num_peer_fds) ; +} + +/* + * psm3_oneapi_ze_check_dev_fds_exchanged - check that dev fds have been bi-dir exchanged + * with given peer. Poll to try and move forward as needed. + * + * connect state ZE_SOCK_DEV_FDS_SENT_AND_RECD indicates peer has received + * our send of dev_fds + * + * num_peer_fds indicates if we received peer's fds. + * + * if both are satisfied, exchange is complete, return PSM2_OK + * + *Returns: + * PSM2_OK - both are done + * PSM2_OK_NO_PROGRESS - more work needed + * other - error + */ +static psm2_error_t psm3_oneapi_ze_check_dev_fds_exchanged(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ + psm2_error_t err; + psm2_error_t ret; + + psmi_assert(am_epaddr); + psmi_assert(! psm3_epid_zero_internal(am_epaddr->epaddr.epid)); + + if (psm3_oneapi_ze_dev_fds_exchanged(am_epaddr)) + return PSM2_OK; + + if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED + && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) + return PSM2_OK_NO_PROGRESS; + + // try to move forward 1 step + err = psm3_oneapi_ze_send_dev_fds(ptl, am_epaddr); + if (am_epaddr->gpu_specific.ze_sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD) + err = PSM2_OK; + else /* err will be NO_PROGRESS or worse */ + err = psm3_error_cmp(err, PSM2_OK_NO_PROGRESS); + + // only poll recv if we need to + ret = PSM2_OK_NO_PROGRESS; // keep KW happy + if (am_epaddr->gpu_specific.ze_num_peer_fds == 0) + ret = psm3_oneapi_ze_receive_dev_fds(ptl); + if (am_epaddr->gpu_specific.ze_num_peer_fds) + ret = PSM2_OK; + + /* worst err, NO_PROGRESS is worse than PSM2_OK */ + return psm3_error_cmp(ret, err); +} + +// check if all successful epid/epaddr in req have exchanged GPU dev FDs +// when called it assumes all the good epid have completed so it does not +// check failed epid and just treats them as done for this phase +// return: +// PSM2_OK - all that can be done are done +// PSM2_OK_NO_PROGRESS - more to be done +static psm2_error_t +psm3_oneapi_ze_shm_ep_connreq_poll_dev_fds(struct ptl_am *ptl, struct am_ptl_connection_req *req) +{ + int num_left = 0; + int i; + + for (i = 0; i < req->numep; i++) { + if (req->epid_mask[i] == AMSH_CMASK_NONE) + continue; + if (req->epid_mask[i] != AMSH_CMASK_DONE || req->errors[i]) + continue; + psmi_assert(req->epaddr[i]); + psmi_assert(! 
psm3_epid_zero_internal(req->epaddr[i]->epid)); + if (PSM2_OK != psm3_oneapi_ze_check_dev_fds_exchanged(ptl, (struct am_epaddr *)(req->epaddr[i]))) + num_left++; + } + if (num_left == 0) + return PSM2_OK; + else + return PSM2_OK_NO_PROGRESS; // not done everyone yet +} + +/* + * psm3_oneapi_ze_poll_dev_fds_exchanged - poll to make forward progress on + * GPU dev FDs exchange + * + * Loop through the epaddrs in am_ep and check_dev_fds_exchanged + * + * Returns: + * PSM2_OK - we found some work to do and made progress + * PSM2_OK_NO_PROGRESS - didn't find anything to do + * other - error + */ + +static psm2_error_t psm3_oneapi_ze_poll_dev_fds_exchange(struct ptl_am *ptl) +{ + psm2_error_t err = PSM2_OK_NO_PROGRESS; + psm2_error_t ret; + int i; + int num_left = 0; + + err = psm3_oneapi_ze_receive_dev_fds(ptl); + + for (i = 0; i <= ptl->max_ep_idx; i++) { + am_epaddr_t *am_epaddr = (am_epaddr_t *)ptl->am_ep[i].epaddr; + + if (!am_epaddr || psm3_epid_zero_internal(ptl->am_ep[i].epid)) + continue; + + if (psm3_oneapi_ze_dev_fds_exchanged(am_epaddr)) + continue; + + num_left++; // causes one extra poll if complete now below, but no harm + + // don't try if uni-dir REQ/REP is incomplete + if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED + && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) + continue; + + // try to move forward 1 step + ret = psm3_oneapi_ze_send_dev_fds(ptl, am_epaddr); + if (ret > PSM2_OK_NO_PROGRESS) + err = psm3_error_cmp(ret, err); + else if (ret == PSM2_OK && err == PSM2_OK_NO_PROGRESS) + err = ret; + } + if (num_left == 0 && ptl->gpu_specific.ze_need_dev_fds_poll) + _HFI_CONNDBG("stop GPU dev FDs poll\n"); + ptl->gpu_specific.ze_need_dev_fds_poll = (num_left != 0); + + return err; +} + +static void psm3_oneapi_ze_sock_detach(struct ptl_am *ptl) +{ + if (ptl->gpu_specific.ze_ipc_socket >= 0) + close(ptl->gpu_specific.ze_ipc_socket); + ptl->gpu_specific.ze_ipc_socket = -1; + if (ptl->gpu_specific.ze_listen_sockname) { + unlink(ptl->gpu_specific.ze_listen_sockname); + psmi_free(ptl->gpu_specific.ze_listen_sockname); + } + ptl->gpu_specific.ze_listen_sockname = NULL; +} +#endif /* not PSM_HAVE_PIDFD */ + +static psm2_error_t psm3_oneapi_ze_shm_init(struct ptl_am *ptl, + psm2_mq_stats_t *stats) +{ +#ifndef PSM_HAVE_PIDFD + psm2_error_t err; + + ptl->gpu_specific.ze_ipc_socket = -1; + if ((err = psm3_onapi_ze_init_ipc_socket(ptl)) != PSM2_OK) + return err; + if ((err = psm3_onapi_ze_init_fds()) != PSM2_OK) + return err; +#endif + +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) + // TBD - should we have generic names for these env variables + // PSM3_GPU_MEMCACHE_ENABLED, PSM3_GPU_MEMCACHE_SIZE? 
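Editor's note: all of the AF_UNIX and SCM_RIGHTS machinery above is compiled only when PSM_HAVE_PIDFD is not defined; otherwise psm3_oneapi_ze_shm_epaddr_add (below) simply opens a pidfd for the peer process. A hedged sketch of the two raw syscalls involved: pidfd_open matches the call in the code below, while pidfd_getfd is shown only as an assumption about how a peer's dmabuf fd could later be duplicated without any socket exchange (that path is outside this hunk):

#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* pidfd sketch (Linux >= 5.6).  pidfd_open mirrors the syscall used by
 * shm_epaddr_add below; pidfd_getfd is an assumption about how a remote
 * fd could be duplicated directly.  Raw syscalls are used here to match
 * the style of the call in the hunk below. */
static int open_peer_pidfd(pid_t pid)
{
	return (int)syscall(SYS_pidfd_open, pid, 0);
}

static int dup_peer_fd(int pidfd, int remote_fd)
{
	return (int)syscall(SYS_pidfd_getfd, pidfd, remote_fd, 0);
}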
+ union psmi_envvar_val env_memcache_enabled; + psm3_getenv("PSM3_ONEAPI_MEMCACHE_ENABLED", + "PSM oneapi ipc memhandle cache enabled (default is enabled)", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)1, &env_memcache_enabled); + if (env_memcache_enabled.e_uint) { + union psmi_envvar_val env_memcache_size; + psm3_getenv("PSM3_ONEAPI_MEMCACHE_SIZE", + "Size of the oneapi ipc memhandle cache ", + PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, + (union psmi_envvar_val)ONEAPI_MEMHANDLE_CACHE_SIZE, + &env_memcache_size); + return psm3_oneapi_ze_memhandle_cache_alloc( + (psm3_oneapi_ze_memhandle_cache_t *)&ptl->memhandle_cache, + env_memcache_size.e_uint, stats); +#endif + } + return PSM2_OK; +} + +static void psm3_oneapi_ze_shm_finalize(struct ptl_am *ptl) +{ +#ifndef PSM_HAVE_PIDFD + psm3_oneapi_ze_sock_detach(ptl); +#endif + if (ptl->memhandle_cache) + psm3_oneapi_ze_memhandle_cache_free(ptl->memhandle_cache); + ptl->memhandle_cache = NULL; + return; +} + +static psm2_error_t psm3_oneapi_ze_shm_epaddr_add(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ +#ifdef PSM_HAVE_PIDFD + am_epaddr->gpu_specific.ze_pidfd = syscall(SYS_pidfd_open, ptl->am_ep[am_epaddr->shmidx].pid, 0); + if (am_epaddr->gpu_specific.ze_pidfd < 0) { + _HFI_ERROR("pidfd_open failed: pid %u, ret %d (%s)\n", + ptl->am_ep[am_epaddr->shmidx].pid, + am_epaddr->gpu_specific.ze_pidfd, + strerror(errno)); + return PSM2_NO_MEMORY; + } +#else + am_epaddr->gpu_specific.ze_num_peer_fds = 0; + { + int i; + for (i=0; i < MAX_ZE_DEVICES; i++) + am_epaddr->gpu_specific.ze_peer_fds[i] = -1; + } + am_epaddr->gpu_specific.ze_sock_connected_state = ZE_SOCK_NOT_CONNECTED; + am_epaddr->gpu_specific.ze_sock = -1; +#endif + return PSM2_OK; +} + +static void psm3_oneapi_ze_shm_epaddr_free(struct am_epaddr *am_epaddr) +{ +#ifdef PSM_HAVE_PIDFD + if (am_epaddr->gpu_specific.ze_pidfd >= 0) + close(am_epaddr->gpu_specific.ze_pidfd); +#else + { + int i; + for (i=0; i < MAX_ZE_DEVICES; i++) + if (am_epaddr->gpu_specific.ze_peer_fds[i] >= 0) + close(am_epaddr->gpu_specific.ze_peer_fds[i]); + } + if (am_epaddr->gpu_specific.ze_sock >= 0) + close(am_epaddr->gpu_specific.ze_sock); +#endif +} + +static int psm3_oneapi_ze_shm_dev_fds_needed() +{ +#ifndef PSM_HAVE_PIDFD + return 1; +#else + return 0; +#endif +} + +static void psm3_oneapi_ze_shm_dev_fds_send(struct ptl_am *ptl, struct am_epaddr *am_epaddr) +{ +#ifndef PSM_HAVE_PIDFD + psm3_oneapi_ze_send_dev_fds(ptl, am_epaddr); +#endif +} + +static psm2_error_t psm3_oneapi_ze_shm_dev_fds_connreq_poll(struct ptl_am *ptl, struct am_ptl_connection_req *req) +{ +#ifndef PSM_HAVE_PIDFD + return psm3_oneapi_ze_shm_ep_connreq_poll_dev_fds(ptl, req); +#else + return PSM2_OK; +#endif +} + +static psm2_error_t psm3_oneapi_ze_shm_dev_fds_check_exchanged(struct ptl_am *ptl, struct am_ptl_connection_req *req, int index) +{ +#ifndef PSM_HAVE_PIDFD + // late connect establish, check once to + // see if have GPU dev fds, if not, this one + // missed the timelimit and timesout + if (req->op == AM_PTL_OP_CONNECT) + _HFI_CONNDBG("late established, special GPU dev FDs poll\n"); + if (req->op == AM_PTL_OP_CONNECT && + PSM2_OK != psm3_oneapi_ze_check_dev_fds_exchanged(ptl, (struct am_epaddr *)(req->epaddr[index]))) + return PSM2_OK_NO_PROGRESS; + else +#endif + return PSM2_OK; +} + +static psm2_error_t psm3_oneapi_ze_shm_dev_fds_poll(struct ptl_am *ptl, psm2_error_t res) +{ +#ifndef PSM_HAVE_PIDFD + // play err safe, callers ignore errors or expect just OK or NO_PROGRESS + if 
(ptl->gpu_specific.ze_need_dev_fds_poll + && psm3_oneapi_ze_poll_dev_fds_exchange(ptl) != PSM2_OK_NO_PROGRESS) + return PSM2_OK; +#endif + return res; +} + +// On Sender, place the IPC handle in the RTS +// We put offset in the basic "args" parameters and the actual +// IPC handle as payload due to it's size +// Callers expect payload_size >0 when using GPU IPC and key off non-zero +// payload size in RTS to identify a GPU IPC RTS +// Save in the req the needed information about IPC resources allocated here +// so psm3_oneapi_ze_process_cts and release them. +static psm2_error_t psm3_oneapi_ze_shm_build_rts(struct ptl_am *ptl, + psm2_mq_req_t req, int *narg_p, + psm2_amarg_t *args, void **payload_p, size_t *payload_size_p, + union am_gpu_rts_payload *info) +{ +#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) +#ifndef PSM_HAVE_PIDFD + int fd; + int *devfds; + int numfds; + int device_index = 0; + struct drm_prime_handle open_fd = {0, 0, 0}; +#endif + uint64_t handle_fd = 0; + size_t total; + void *buf_base_ptr; + uint64_t alloc_id; + void *buf = req->req_data.buf; + +#ifndef PSM_HAVE_PIDFD + devfds = psm3_oneapi_ze_get_dev_fds(&numfds); + device_index = psm3_oneapi_ze_cur_dev - psm3_oneapi_ze_devices; /* index (offset) in table */ + args[5].u32w0 = device_index; + fd = devfds[device_index]; +#endif + PSM3_ONEAPI_ZE_CALL(zeMemGetAddressRange, psm3_oneapi_ze_context, buf, &buf_base_ptr, &total); + + /* Offset in GPU buffer from which we copy data, we have to + * send it separetly because this offset is lost + * when zeMemGetIpcHandle is called */ + req->gpu_specific.ze_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); + args[2].u32w0 = (uint32_t)req->gpu_specific.ze_ipc_offset; + alloc_id = psm3_oneapi_ze_get_alloc_id(buf_base_ptr, &info->ze.ze_alloc_type); +#ifndef PSM_HAVE_PIDFD + args[5].u32w1 = (uint32_t)alloc_id; /* 32-bit for now */ +#else + args[5].u64w0 = alloc_id; +#endif + + PSM3_ONEAPI_ZE_CALL(zeMemGetIpcHandle, psm3_oneapi_ze_context, + (const void *)buf_base_ptr, &req->gpu_specific.ze_ipc_handle); +#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE + psm3_oneapi_ze_get_dmabuf_fd((const void *)buf, &handle_fd); +#else + memcpy(&handle_fd, &req->gpu_specific.ze_ipc_handle, sizeof(uint32_t)); +#endif + req->gpu_specific.ze_handle_attached = 1; +#ifndef PSM_HAVE_PIDFD + open_fd.fd = (uint32_t)handle_fd; + if (ioctl(fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &open_fd) < 0) { + _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_FD_TO_HANDLE: for fd %d: %s", open_fd.fd, strerror(errno)); + psm3_handle_error(ptl->ep, PSM2_INTERNAL_ERR, + "ioctl failed for DRM_IOCTL_PRIME_FD_TO_HANDLE for fd %d: errno=%d", + open_fd.fd, errno); + return PSM2_INTERNAL_ERR; + } + _HFI_VDBG("FD_TO_HANDLE: buf %p total 0x%lx base %p alloc_id %lu gem_handle %u\n", + buf, total, buf_base_ptr, alloc_id, open_fd.handle); + info->ze.ze_handle = open_fd.handle; + *narg_p = 6; + *payload_p = (void *)info; + *payload_size_p = sizeof(struct am_oneapi_ze_rts_payload); + // for DRM approach once we have the open_fd we could + // PutIpcHandle(ipc_handle) since open_fd has a reference + // however since that is a legacy mode, we focus on the + // prefered mode and have both delay the Put until CTS received +#else + info->ze.ze_handle = (uint32_t)handle_fd; + *narg_p = 6; + *payload_p = (void *)info; + *payload_size_p = sizeof(struct am_oneapi_ze_rts_payload); +#endif /* PSM_HAVE_PIDFD */ + return PSM2_OK; +#else // if no drm, error out as oneapi ipc handles don't work without drm + return PSM2_INTERNAL_ERR; +#endif // 
defined(HAVE_DRM) || defined(HAVE_LIBDRM) +} + +// On receiver, pull IPC information out of the RTS which our peer build using +// psm3_oneapi_ze_shm_build_rts. Information is saved to the req for subsequent +// processing after tag matching via psm3_oneapi_ze_shm_rtsmatch +static void psm3_oneapi_ze_shm_process_rts(psm2_mq_req_t req, void *buf, + size_t len, int narg, psm2_amarg_t *args) +{ + struct am_oneapi_ze_rts_payload *info; + + psmi_assert(narg == 6); + info = (struct am_oneapi_ze_rts_payload *)buf; + psmi_assert(len == sizeof(struct am_oneapi_ze_rts_payload)); + req->gpu_specific.ze_handle = info->ze_handle; + req->gpu_specific.ze_alloc_type = info->ze_alloc_type; + req->gpu_specific.ze_handle_attached = 1; + req->gpu_specific.ze_ipc_offset = args[2].u32w0; +#ifndef PSM_HAVE_PIDFD + req->gpu_specific.ze_device_index = args[5].u32w0; + req->gpu_specific.ze_alloc_id = args[5].u32w1; +#else + req->gpu_specific.ze_alloc_id = args[5].u64w0; +#endif +} + +// On receiver, use GPU IPC to copy data from the sender to this process +// This is called on the receiver after psm3_oneapi_ze_process_rts has parsed +// the incoming RTS and tag matching has matched the RTS with a receive buffer +// and populated the req with information about the matched receiver buffer +static int psm3_oneapi_ze_shm_rtsmatch(struct ptl_am *ptl, psm2_mq_req_t req) +{ + if (req->gpu_specific.ze_handle_attached) { + void *buf_ptr = psm3_oneapi_ze_memhandle_acquire( + ptl->memhandle_cache, + req->rts_sbuf - req->gpu_specific.ze_ipc_offset, req->gpu_specific.ze_handle, + req->rts_peer, +#ifndef PSM_HAVE_PIDFD + req->gpu_specific.ze_device_index, req->gpu_specific.ze_alloc_id, +#else + 0, req->gpu_specific.ze_alloc_id, +#endif + req->gpu_specific.ze_alloc_type); + psmi_assert_always(buf_ptr != NULL); + buf_ptr = (uint8_t *)buf_ptr + req->gpu_specific.ze_ipc_offset; + /* zeMemcpy into the receive side buffer + * based on its location */ + _HFI_VDBG("Copying src %p (offset 0x%x) dst %p msg_len %u\n", + buf_ptr, req->gpu_specific.ze_ipc_offset, + req->req_data.buf, req->req_data.recv_msglen); + if (req->is_buf_gpu_mem) { + /*PSM3_GPU_MEMCPY_DTOD*/ + psm3_oneapi_ze_memcpy_DtoD(req->req_data.buf, buf_ptr, + req->req_data.recv_msglen); + // can skip sychronize, it's a noop for oneapi_ze + //PSM3_GPU_SYNCHRONIZE_MEMCPY(); + //psm3_oneapi_ze_synchronize_memcpy(); + } else { + /*PSM3_GPU_MEMCPY_DTOH*/ + psm3_oneapi_ze_memcpy_DtoH(req->req_data.buf, buf_ptr, + req->req_data.recv_msglen); + } + psm3_oneapi_ze_memhandle_release(ptl->memhandle_cache, + (uint8_t *)buf_ptr - req->gpu_specific.ze_ipc_offset); + req->gpu_specific.ze_handle_attached = 0; + return 1; + } + return 0; +} + +// On sender, we have now received the CTS corresponding to an RTS +// we may have built in psm3_oneapi_ze_build_rts. All we need to do here is release +// the resources we allocated in psm3_oneapi_ze_build_rts. We saved the necessary +// information tracking those resources in the send req. 
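Editor's note: in the non-pidfd path of psm3_oneapi_ze_shm_build_rts above, the dmabuf fd obtained from the Level Zero IPC handle is converted into a GEM handle against the sender's DRM device fd, so only a 32-bit handle has to ride in the RTS payload; since the peer holds a duplicate of that same device fd from the earlier exchange, the handle appears to be meaningful on the receiving side as well. A self-contained sketch of the conversion (the function name is illustrative):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/drm.h>   /* struct drm_prime_handle, DRM_IOCTL_PRIME_FD_TO_HANDLE */

/* Sketch of the dmabuf-fd to GEM-handle conversion done in build_rts
 * above.  The kernel fills in req.handle, which is valid for any holder
 * of this DRM device file description. */
static int dmabuf_fd_to_gem_handle(int device_fd, int dmabuf_fd, uint32_t *gem_handle)
{
	struct drm_prime_handle req = { .fd = dmabuf_fd };

	if (ioctl(device_fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &req) < 0)
		return -1;
	*gem_handle = req.handle;               /* filled in by the kernel */
	return 0;
}

As the hunk's own comment notes, the DRM path could release the IPC handle as soon as the GEM handle exists, but both paths defer the put until the CTS arrives, keeping the legacy and pidfd flows symmetric.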
+// Returns: +// 0 - the req was not for a GPU IO +// 1 - the req was for a GPU IO and we have released the resources +static int psm3_oneapi_ze_shm_process_cts(psm2_mq_req_t req) +{ + if (req->gpu_specific.ze_handle_attached) { + psm3_oneapi_ze_put_ipc_handle(req->req_data.buf - req->gpu_specific.ze_ipc_offset, + req->gpu_specific.ze_ipc_handle); + req->gpu_specific.ze_handle_attached = 0; + return 1; + } + return 0; +} +// end of RTS and CTS processing functions for PSM3_DEVICES "shm" +//*************************************************************************** + +static psm2_error_t psm3_oneapi_ze_get_cuda_permitted(struct psm2_ep *ep, bool *enable) +{ + *enable = true; + return PSM2_OK; +} + +static psm2_error_t psm3_oneapi_ze_set_cuda_permitted(struct psm2_ep *ep, bool enable) +{ + return PSM2_OK; +} + +static bool psm3_oneapi_ze_is_memcpy_permitted(struct psm2_ep *ep) +{ + return true; +} + +struct psm3_gpu_hal psm3_oneapi_ze_hal = { + .type = "oneapi-ze", +#ifdef PSM_HAVE_RNDV_MOD +#if defined(RV_GPU_ABI_VER_MINOR_0) && defined(RV_GPU_ABI_VER_MAJOR_1) && defined(RV_GPU_ABI_VER_MINOR_1) + // RV GPU API <= 1.0 does not have track GPU alloc_id + // RV GPU API <= 1.1 requires munmap_unpin + // so if RV GPU API <= 1.1, do not allow GPUDirect + .rv_major_rev_fail = RV_GPU_ABI_VER_MAJOR_1, + .rv_minor_rev_fail = RV_GPU_ABI_VER_MINOR_1, +#else + /* not defined if compile against older RV header */ +#error "Intel GPU Support requires version 1.1 or newer rv_user_ioctls.h header" +#endif + + .rv_capability_expected = RV_CAP_INTEL_GPU, + .hal_cap_expected = PSM_HAL_CAP_INTEL_GPU, +#endif /* PSM_HAVE_RNDV_MOD */ + + .ghfp_initialize = psm3_oneapi_ze_initialize, + .ghfp_finalize = psm3_oneapi_ze_finalize, + .ghfp_ep_open = psm3_oneapi_ze_ep_open, + .ghfp_ep_close = psm3_oneapi_ze_ep_close, + .ghfp_identify = psm3_oneapi_ze_identify, + .ghfp_verify_GPU_capabilities = psm3_oneapi_ze_verify_GPU_capabilities, + .ghfp_p2p_supported = psm3_oneapi_ze_p2p_supported, + .ghfp_gpudirect_supported = psm3_oneapi_ze_gpudirect_supported, + .ghfp_using_rv_for_mrs = psm3_oneapi_ze_using_rv_for_mrs, + .ghfp_get_pci_addr = psm3_oneapi_ze_get_pci_addr, +#ifdef PSM_HAVE_RNDV_MOD + .ghfp_min_bar_size = psm3_oneapi_ze_min_bar_size, + .ghfp_check_phys_addr = psm3_oneapi_ze_check_phys_addr, + .ghfp_roundup_gdrcopy = psm3_oneapi_ze_roundup_gdrcopy, +#ifdef PSM_HAVE_REG_MR + .ghfp_roundup_rv_reg_mr = psm3_oneapi_ze_roundup_rv_reg_mr, + .ghfp_init_rv_reg_mr_params = psm3_oneapi_ze_init_rv_reg_mr_params, +#endif + .ghfp_init_rv_pin_mmap_params = psm3_oneapi_ze_init_rv_pin_mmap_params, + .ghfp_rv_reg_mmap_cleanup = psm3_oneapi_ze_rv_reg_mmap_cleanup, +#endif /* PSM_HAVE_RNDV_MOD */ +#ifdef PSM_HAVE_REG_MR + .ghfp_cmp_mr = psm3_oneapi_ze_cmp_mr, + .ghfp_init_mr = psm3_oneapi_ze_init_mr, +#endif + .ghfp_fetch_ctxt = psm3_oneapi_ze_fetch_ctxt, + .ghfp_refresh_ctxt = psm3_oneapi_ze_refresh_ctxt, + .ghfp_register_hostmem = psm3_oneapi_ze_register_hostmem, + .ghfp_unregister_hostmem = psm3_oneapi_ze_unregister_hostmem, + .ghfp_is_gpu_mem = psm3_oneapi_ze_is_gpu_mem, + .ghfp_prepare_HtoD_memcpys = psm3_oneapi_ze_prepare_HtoD_memcpys, + .ghfp_prepare_DtoH_memcpys = psm3_oneapi_ze_prepare_DtoH_memcpys, + .ghfp_shutdown_HtoD_memcpys = psm3_oneapi_ze_shutdown_HtoD_memcpys, + .ghfp_shutdown_DtoH_memcpys = psm3_oneapi_ze_shutdown_DtoH_memcpys, + .ghfp_memcpy_HtoD_start = psm3_oneapi_ze_memcpy_HtoD_start, + .ghfp_memcpy_DtoH_start = psm3_oneapi_ze_memcpy_DtoH_start, + .ghfp_memcpy_done = psm3_oneapi_ze_memcpy_done, + 
.ghfp_hostbuf_lazy_init = psm3_oneapi_ze_hostbuf_lazy_init, + .ghfp_hostbuf_reset = psm3_oneapi_ze_hostbuf_reset, + .ghfp_hostbuf_destroy = psm3_oneapi_ze_hostbuf_destroy, + .ghfp_memcpy_DtoD = psm3_oneapi_ze_memcpy_DtoD, + .ghfp_memcpy_HtoD = psm3_oneapi_ze_memcpy_HtoD, + .ghfp_memcpy_DtoH = psm3_oneapi_ze_memcpy_DtoH, + .ghfp_memcpy = psm3_oneapi_ze_memcpy, + .ghfp_synchronize_memcpy = psm3_oneapi_ze_synchronize_memcpy, + .ghfp_mark_buf_synchronous = psm3_oneapi_ze_mark_buf_synchronous, + .ghfp_host_alloc = psm3_oneapi_ze_host_alloc, + .ghfp_host_free = psm3_oneapi_ze_host_free, + .ghfp_gpu_addr_send_mr = psm3_oneapi_ze_gpu_addr_send_mr, + .ghfp_gpu_addr_recv_mr = psm3_oneapi_ze_gpu_addr_recv_mr, + // functions for PSM3_DEVICES "shm" RTS/CTS processing to enable + // use of GPU specific scale-up transfers within the given server + .ghfp_shm_init = psm3_oneapi_ze_shm_init, + .ghfp_shm_finalize = psm3_oneapi_ze_shm_finalize, + .ghfp_shm_epaddr_add = psm3_oneapi_ze_shm_epaddr_add, + .ghfp_shm_epaddr_free = psm3_oneapi_ze_shm_epaddr_free, + .ghfp_shm_dev_fds_needed = psm3_oneapi_ze_shm_dev_fds_needed, + .ghfp_shm_dev_fds_send = psm3_oneapi_ze_shm_dev_fds_send, + .ghfp_shm_dev_fds_connreq_poll = psm3_oneapi_ze_shm_dev_fds_connreq_poll, + .ghfp_shm_dev_fds_check_exchanged = psm3_oneapi_ze_shm_dev_fds_check_exchanged, + .ghfp_shm_dev_fds_poll = psm3_oneapi_ze_shm_dev_fds_poll, + .ghfp_shm_build_rts = psm3_oneapi_ze_shm_build_rts, + .ghfp_shm_process_rts = psm3_oneapi_ze_shm_process_rts, + .ghfp_shm_rtsmatch = psm3_oneapi_ze_shm_rtsmatch, + .ghfp_shm_process_cts = psm3_oneapi_ze_shm_process_cts, + .ghfp_get_cuda_permitted = psm3_oneapi_ze_get_cuda_permitted, + .ghfp_set_cuda_permitted = psm3_oneapi_ze_set_cuda_permitted, + .ghfp_is_memcpy_permitted = psm3_oneapi_ze_is_memcpy_permitted, +}; + +#endif /* PSM_ONEAPI */ diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.c b/prov/psm3/psm3/hal_sockets/sockets_ep.c old mode 100755 new mode 100644 index ce7ddb61bc3..8943cbf511d --- a/prov/psm3/psm3/hal_sockets/sockets_ep.c +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.c @@ -893,7 +893,7 @@ psm3_ep_open_tcp_internal(psm2_ep_t ep, int unit, int port, } #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* This function is only called when GPUDirect is enabled */ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) { @@ -908,9 +908,8 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) // GPU Direct is enabled and we need a GPU Cache loc_info.rdma_mode = RV_RDMA_MODE_GPU_ONLY; -#ifdef PSM_ONEAPI - psm3_oneapi_ze_can_use_zemem(); -#endif + + PSM3_GPU_USING_RV_FOR_MRS(); // need portnum for rdma_mode KERNEL or (USER|GPU) loc_info.port_num = ep->portnum; @@ -932,17 +931,14 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) #endif if (loc_info.capability & RV_CAP_GPU_DIRECT) psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT); - if (loc_info.capability & RV_CAP_NVIDIA_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_NVIDIA_GPU); - if (loc_info.capability & RV_CAP_INTEL_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_INTEL_GPU); + PSM3_GPU_RV_SET_HAL_CAP(loc_info.capability); // sockets does not support PSM_HAL_CAP_GPUDIRECT_SDMA nor RDMA ep->rv_mr_cache_size = loc_info.mr_cache_size; ep->rv_gpu_cache_size = loc_info.gpu_cache_size; return PSM2_OK; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* RNDV_MOD */ psm2_error_t @@ -954,7 +950,7 @@ psm3_ep_open_sockets(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid ep->rdmamode = 
0; // no rendezvous RDMA for sockets // no MR cache, leave ep->mr_cache_mode as set by caller (NONE) #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); #endif #endif @@ -997,9 +993,9 @@ psm3_ep_open_sockets(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid _HFI_PRDBG("Using unit_id[%d] %s.\n", ep->unit_id, ep->dev_name); #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Open rv only when GPUDirect is enabled */ - if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() && + if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect() && open_rv(ep, job_key) != PSM2_OK) { _HFI_ERROR("Unable to open rv for port %d of %s.\n", port, ep->dev_name); @@ -1007,7 +1003,7 @@ psm3_ep_open_sockets(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid ep->dev_name = NULL; return PSM2_INTERNAL_ERR; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* RNDV_MOD */ ep->wiremode = 0; // TCP vs UDP are separate EPID protocols ep->addr_index = addr_index; @@ -1041,7 +1037,7 @@ psm3_sockets_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // defaults for SDMA thresholds. // sockets does not support Send DMA, so set large to disable. proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking = ~0U; #endif #endif @@ -1360,7 +1356,7 @@ void psm3_ep_free_sockets(psm2_ep_t ep) } } #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (ep->rv) { psm3_rv_close(ep->rv); ep->rv = NULL; diff --git a/prov/psm3/psm3/hal_sockets/sockets_ep.h b/prov/psm3/psm3/hal_sockets/sockets_ep.h index 51fcd06f792..2cd4b6b467a 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_ep.h +++ b/prov/psm3/psm3/hal_sockets/sockets_ep.h @@ -65,7 +65,7 @@ #include #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #include #include #endif diff --git a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c index 645dfd3ebd2..3dbe1fdb2f4 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c +++ b/prov/psm3/psm3/hal_sockets/sockets_gdrcpy.c @@ -51,8 +51,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) #include "psm_user.h" + +#ifdef PSM_HAVE_GPU #include "psm2_hal.h" #include #include @@ -66,21 +67,14 @@ psm3_sockets_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep) { +#ifdef RNDV_MOD void *host_addr_buf; uintptr_t pageaddr; uint64_t pagelen; -#ifdef PSM_ONEAPI - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, - (const void *)buf, (void **)&pageaddr, &pagelen); -#else - pageaddr = buf & GPU_PAGE_MASK; - pagelen = (uint64_t) (PSMI_GPU_PAGESIZE + - ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); -#endif + PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, &pageaddr, &pagelen); _HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%"PRIu64" flags=0x%x ep=%p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep); -#ifdef RNDV_MOD ep = ep->mctxt_master; host_addr_buf = psm3_rv_pin_and_mmap(ep->rv, pageaddr, pagelen, IBV_ACCESS_IS_GPU_ADDR); if_pf (! 
host_addr_buf) { @@ -92,16 +86,12 @@ psm3_sockets_gdr_convert_gpu_to_host_addr(unsigned long buf, return NULL; } //_HFI_ERROR("pinned buf=%p size=%zu pageaddr=%p pagelen=%u flags=0x%x ep=%p, @ %p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep, host_addr_buf); + return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); #else psmi_assert_always(0); // unimplemented, should not get here - host_addr_buf = NULL; + return NULL; #endif /* RNDV_MOD */ -#ifdef PSM_ONEAPI - return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); -#else - return (void *)((uintptr_t)host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK)); -#endif } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_SOCKETS */ diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.c b/prov/psm3/psm3/hal_sockets/sockets_hal.c index dd9ec3735dc..3251784a34f 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.c +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.c @@ -63,17 +63,13 @@ #include "sockets_hal_inline_i.h" #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -#define SOCKET_GPU_THRESH_RNDV (~(uint32_t)0) -#endif - static int psm3_hfp_sockets_initialize(psmi_hal_instance_t *phi, int devid_enabled[PTL_MAX_INIT]) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // testing on HED-2629 suggests turning off RNDV can help // latency for messages in size 8-256 KB - psm3_gpu_thresh_rndv = SOCKET_GPU_THRESH_RNDV; + psm3_gpu_thresh_rndv = (~(uint32_t)0); #endif /* we initialize a few HAL software specific capabilities which * are known before context_open can open RV or parse HAL specific @@ -100,40 +96,19 @@ static const char* psm3_hfp_sockets_identify(void) { static char buf[100]; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) /* rv module only applicable to sockets for CUDA builds */ -#ifdef RNDV_MOD -/* we test NVIDIA_GPU_DIRECT since that define - * controls the rv module ioctl header file interface - */ -#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) -#ifdef NVIDIA_GPU_DIRECT - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d cuda", -#else - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d oneapi-ze", -#endif - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version(), - psm3_rv_get_gpu_user_major_bldtime_version(), - psm3_rv_get_gpu_user_minor_bldtime_version()); -#else /* NVIDIA_GPU_DIRECT || INTEL_GPU_DIRECT */ - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d", +/* rv module only applicable to sockets for GPU builds */ +#if defined(RNDV_MOD) && defined(PSM_HAVE_GPU) + snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%u.%u" PSM3_GPU_FMT_RV_GPU_VER, psmi_hal_get_hal_instance_name(), psmi_hal_get_hal_instance_description(), psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version()); -#endif /* NVIDIA_GPU_DIRECT || INTEL_GPU_DIRECT */ -#else /* RNDV_MOD */ - snprintf(buf, sizeof(buf), "HAL: %s (%s)", - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description()); -#endif /* RNDV_MOD */ -#else /* PSM_CUDA || PSM_ONEAPI */ + psm3_rv_get_user_minor_bldtime_version() + PSM3_GPU_OUT_RV_GPU_VER); +#else snprintf(buf, sizeof(buf), "HAL: %s (%s)", psmi_hal_get_hal_instance_name(), psmi_hal_get_hal_instance_description()); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif return buf; } @@ -181,15 +156,14 @@ static void 
psm3_hfp_sockets_mq_init_defaults(struct psm2_mq *mq) // even without RDMA, the receiver controlled pacing helps scalability mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) - mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; +#ifdef PSM_HAVE_GPU + mq->ips_gpu_window_rv_str = psm3_gpu_rndv_nic_window_default; #endif // we parse inet and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP (void) psm3_sockets_parse_inet(1); #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU (void)psmi_parse_gpudirect_rv_gpu_cache_size(1); #endif #endif @@ -203,7 +177,7 @@ static void psm3_hfp_sockets_ep_open_opts_get_defaults(struct psm3_ep_open_opts opts->imm_size = 128; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psm3_hfp_sockets_gdr_open(void) { } @@ -257,16 +231,10 @@ static hfp_sockets_t psm3_sockets_hi = { .phi = { .hal_index = PSM_HAL_INDEX_SOCKETS, #ifdef USE_UDP - .description = "Sockets" + .description = "Sockets" PSM3_GPU_TYPES, #else - .description = "TCP Sockets" -#endif -#ifdef PSM_CUDA - " (cuda)" -#elif defined(PSM_ONEAPI) - " (oneapi-ze)" + .description = "TCP Sockets" PSM3_GPU_TYPES, #endif - , .nic_sys_class_path = "/sys/class/net", .nic_sys_port_path_fmt = PSM3_PORT_PATH_TYPE_NO_PORT, .params = {0}, @@ -286,7 +254,7 @@ static hfp_sockets_t psm3_sockets_hi = { .hfp_mq_init_defaults = psm3_hfp_sockets_mq_init_defaults, .hfp_ep_open_opts_get_defaults = psm3_hfp_sockets_ep_open_opts_get_defaults, .hfp_context_initstats = psm3_hfp_sockets_context_initstats, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_open = psm3_hfp_sockets_gdr_open, #endif @@ -328,10 +296,10 @@ static hfp_sockets_t psm3_sockets_hi = { .hfp_ips_ibta_init = psm3_hfp_sockets_ips_ibta_init, .hfp_ips_path_rec_init = psm3_hfp_sockets_ips_path_rec_init, .hfp_ips_ptl_pollintr = psm3_hfp_sockets_ips_ptl_pollintr, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_close = psm3_hfp_sockets_gdr_close, .hfp_gdr_convert_gpu_to_host_addr = psm3_hfp_sockets_gdr_convert_gpu_to_host_addr, -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ .hfp_get_port_index2pkey = psm3_hfp_sockets_get_port_index2pkey, .hfp_poll_type = psm3_hfp_sockets_poll_type, .hfp_spio_transfer_frame = psm3_hfp_sockets_spio_transfer_frame, diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal.h b/prov/psm3/psm3/hal_sockets/sockets_hal.h old mode 100755 new mode 100644 index 6b8f260cb29..de825ffab47 --- a/prov/psm3/psm3/hal_sockets/sockets_hal.h +++ b/prov/psm3/psm3/hal_sockets/sockets_hal.h @@ -100,11 +100,11 @@ psm3_sockets_recvhdrq_init(const struct ips_epstate *epstate, psm2_error_t psm3_sockets_udp_recvhdrq_progress(struct ips_recvhdrq *recvq, bool force); psm2_error_t psm3_sockets_tcp_recvhdrq_progress(struct ips_recvhdrq *recvq, bool force); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void* psm3_sockets_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #define FD_STATE_NONE 0 #define FD_STATE_READY 1 diff --git a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h index 9b703674147..7ee5798b547 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h +++ 
b/prov/psm3/psm3/hal_sockets/sockets_hal_inline_i.h @@ -449,7 +449,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_ips_ptl_pollintr( next_timeout, pollok, pollcyc, pollintr); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static PSMI_HAL_INLINE void psm3_hfp_sockets_gdr_close(void) { } @@ -460,7 +460,7 @@ static PSMI_HAL_INLINE void* psm3_hfp_sockets_gdr_convert_gpu_to_host_addr(unsig return psm3_sockets_gdr_convert_gpu_to_host_addr(buf, size, flags, ep); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #include "sockets_spio.c" @@ -469,7 +469,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_spio_transfer_frame(struct uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -490,7 +490,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_spio_transfer_frame(struct return psm3_sockets_udp_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -499,7 +499,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_spio_transfer_frame(struct return psm3_sockets_tcp_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -510,7 +510,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_transfer_frame(struct ips_p uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -518,7 +518,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_sockets_transfer_frame(struct ips_p return psm3_hfp_sockets_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); diff --git a/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c b/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c old mode 100755 new mode 100644 index de893265802..8091d5196da --- a/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c +++ b/prov/psm3/psm3/hal_sockets/sockets_recvhdrq.c @@ -236,7 +236,7 @@ psm3_sockets_tcp_preprocess_packet(psm2_ep_t ep, int fd, struct ips_recvhdrq_eve goto out; } -#if !defined(PSM_CUDA) && !defined(PSM_ONEAPI) +#ifndef PSM_HAVE_GPU psm2_mq_req_t req = psm3_mq_req_match(rcv_ev->proto->mq, (psm2_epaddr_t) &epstaddr->ipsaddr->msgctl->master_epaddr, (psm2_mq_tag_t *) rcv_ev->p_hdr->tag, 0); diff --git a/prov/psm3/psm3/hal_sockets/sockets_spio.c b/prov/psm3/psm3/hal_sockets/sockets_spio.c index 8d4fe1d65c7..64fe044b2fb 100644 --- a/prov/psm3/psm3/hal_sockets/sockets_spio.c +++ b/prov/psm3/psm3/hal_sockets/sockets_spio.c @@ -71,7 +71,7 @@ /*---------------------------------------------------------------------------*/ /* TCP specific code */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // set iov for remaining GPU payload data. It copies device memory to sockets_ep.sbuf // in word boundary and then set iov to use the sockets_ep.sbuf with proper offset. 
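Editor's note: the PAYLOAD_IOV macro defined just below stages GPU payloads through the per-endpoint host bounce buffer before sockets can transmit them. Stripped of the word-alignment and partial-send ("remaining") handling the real macro performs, the pattern is roughly the following sketch (set_payload_iov is illustrative; PSM3_GPU_MEMCPY_DTOH is the provider's own device-to-host copy macro):

#include <stdint.h>
#include <sys/uio.h>

/* Plain-C sketch of what PAYLOAD_IOV arranges for a GPU payload: device
 * memory cannot be handed to sendmsg() directly, so it is first staged
 * into the endpoint's host bounce buffer and the iovec is then pointed
 * at that staging copy. */
static void set_payload_iov(struct iovec *iov, void *payload, uint32_t len,
			    uint8_t *staging_buf, int is_gpu_payload)
{
	if (is_gpu_payload) {
		PSM3_GPU_MEMCPY_DTOH(staging_buf, payload, len);
		iov->iov_base = staging_buf;
	} else {
		iov->iov_base = payload;
	}
	iov->iov_len = len;
}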
#define PAYLOAD_IOV(iov, payload, payload_len, remaining, buf, is_gpu_payload) \ @@ -101,7 +101,7 @@ #endif // prepare msghdr for a message -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define MSG_IOV(msg, header, payload, payload_len, remaining, buf, is_gpu_payload) \ if (likely(remaining > payload_len)) { \ msg.msg_iov[msg.msg_iovlen].iov_len = remaining - payload_len; \ @@ -323,7 +323,7 @@ psm3_sockets_tcp_sendpacing(struct ips_proto *proto, struct ips_flow *flow) static __inline__ psm2_error_t psm3_sockets_tcp_aux_send(psm2_ep_t ep, struct ips_flow *flow, struct ips_message_header *header, uint32_t *payload, uint32_t payload_len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -344,7 +344,7 @@ psm3_sockets_tcp_aux_send(psm2_ep_t ep, struct ips_flow *flow, msg.msg_iov[0].iov_len = sizeof(*header); if (payload_len) { PAYLOAD_IOV(msg.msg_iov[1], payload, payload_len, payload_len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , ep->sockets_ep.sbuf, is_gpu_payload #endif ); @@ -383,7 +383,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f struct ips_scb *scb, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -477,7 +477,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f _HFI_VDBG("Send DISCONN msg opcode=%x via aux_socket\n", opcode); flow->send_remaining = 0; return psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -490,7 +490,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f len = flow->send_remaining ? flow->send_remaining : sizeof(*ips_lrh) + length; msg.msg_iovlen = 0; MSG_IOV(msg, ips_lrh, payload, length, len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , ep->sockets_ep.sbuf, is_gpu_payload #endif ); @@ -552,7 +552,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f psm3_sockaddr_fmt((struct sockaddr *)&flow->ipsaddr->sockets.remote_pri_addr, 0), length); _HFI_PDBG_DUMP_ALWAYS((uint8_t*)ips_lrh, sizeof(*ips_lrh)); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { PSM3_GPU_MEMCPY_DTOH(ep->sockets_ep.sbuf, payload, length); @@ -566,7 +566,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f } if_pf (opcode == OPCODE_DISCONNECT_REPLY) { return psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -707,7 +707,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f // (UDP), it will fill receiver buffer from beginning flow->send_remaining = 0; ret = psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -739,7 +739,7 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f _HFI_VDBG("Invalid tcp_fd on %s! 
Try to use aux socket.\n", ep->dev_name); flow->send_remaining = 0; ret = psm3_sockets_tcp_aux_send(ep, flow, ips_lrh, payload, length -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -751,9 +751,9 @@ psm3_sockets_tcp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f } #ifndef PSM_TCP_ACK - // return PSM2_OK for ctrl msg and PSM2_TCP_DATA_SENT for data msg + // return PSM2_OK for ctrl msg and PSM2_RELIABLE_DATA_SENT for data msg if (ret == PSM2_OK && !isCtrlMsg) { - return PSM2_TCP_DATA_SENT; + return PSM2_RELIABLE_DATA_SENT; } #endif return ret; @@ -774,7 +774,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * struct msghdr msg = ep->sockets_ep.snd_msg; msg.msg_iovlen = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // this is used for GPU support. It maintains the position in sbuf // to which we copy data from device uint8_t *buf = ep->sockets_ep.sbuf; @@ -796,7 +796,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * ret = psm3_sockets_tcp_spio_transfer_frame(proto, flow, scb, ips_scb_buffer(scb), scb->payload_size, PSMI_TRUE, scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM, scb->cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , IS_TRANSFER_BUF_GPU_MEM(scb) #endif ); @@ -839,7 +839,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * len = sizeof(*ips_lrh) + scb->payload_size; } MSG_IOV(msg, ips_lrh, ips_scb_buffer(scb), scb->payload_size, len -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , buf, IS_TRANSFER_BUF_GPU_MEM(scb) #endif ); @@ -864,7 +864,7 @@ psm3_sockets_tcp_spio_transfer_frames(struct ips_proto *proto, struct ips_flow * if (likely(scb->payload_size > 0)) { PAYLOAD_IOV(iovs[msg.msg_iovlen], ips_scb_buffer(scb), scb->payload_size, scb->payload_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , buf, IS_TRANSFER_BUF_GPU_MEM(scb) #endif ); @@ -983,7 +983,7 @@ psm3_sockets_udp_gso_send(int fd, struct ips_proto *proto, psm3_sockaddr_in_t *addr, struct ips_scb *scb, uint8_t *payload, uint32_t length, uint32_t frag_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -1027,7 +1027,7 @@ psm3_sockets_udp_gso_send(int fd, struct ips_proto *proto, len + sizeof(*ips_lrh) + HFI_CRC_SIZE_IN_BYTES); _HFI_VDBG("copy payload %p %u\n", payload, len); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { PSM3_GPU_MEMCPY_DTOH(sbuf_gso + sizeof(*ips_lrh), payload, len); @@ -1099,7 +1099,7 @@ psm3_sockets_udp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f struct ips_scb *scb, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -1143,7 +1143,7 @@ psm3_sockets_udp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f &flow->ipsaddr->sockets.remote_pri_addr, scb, (uint8_t*)payload, scb->chunk_size_remaining, scb->frag_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ,is_gpu_payload #endif )) { @@ -1159,7 +1159,7 @@ psm3_sockets_udp_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *f memcpy(sbuf, ips_lrh, sizeof(*ips_lrh)); // copy payload to send buffer, length could be zero, be safe _HFI_VDBG("copy payload %p %u\n", payload, length); -#if 
defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { PSM3_GPU_MEMCPY_DTOH(sbuf + sizeof(*ips_lrh), payload, length); diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.c b/prov/psm3/psm3/hal_verbs/verbs_ep.c index f4e30d6c5e9..9a070f797d3 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.c +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.c @@ -231,19 +231,38 @@ psm3_ep_open_verbs(psm2_ep_t ep, int unit, int port, int addr_index, psm2_uuid_t // HFI_TF_NFLOWS (32) limits receiver side concurrent tidflows (aka inbound // RDMA w/immed). // For USER RC Eager without SRQ we can have num_recv_wqes/FRACTION per - // QP in which case theoretical need could be huge. We add 4000 as a - // swag to cover most cases and user can always tune higher as needed + // QP, and we calculate the total size based on the total QPs required. + // The CQ size for the UD QP is covered by hfi_num_recv_wqes. // For USER RC Eager with SRQ worse case is num_recv_wqes so we // add that to allow up to num_recv_wqes on UD QP and SRQ each and keep // the HFI_TF_NFLOWS+1000 as headroom. if (! ep->verbs_ep.hfi_num_recv_cqes) { ep->verbs_ep.hfi_num_recv_cqes = ep->verbs_ep.hfi_num_recv_wqes+HFI_TF_NFLOWS+1000; +#ifdef USE_RC if ((ep->rdmamode&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { - if (ep->verbs_ep.srq) + if (ep->verbs_ep.srq) { ep->verbs_ep.hfi_num_recv_cqes += ep->verbs_ep.hfi_num_recv_wqes; - else - ep->verbs_ep.hfi_num_recv_cqes += 4000; + } else { + int tot_cnt = psm3_get_myrank_count(); + int loc_cnt = psm3_get_mylocalrank_count(); + uint32_t rem_cnt; + uint32_t cqes_per_qp; + + /* + * Check to see if MPI is used. If yes, we will calculate the total + * number of RC QPs. Otherwise, we use a arbitrary large number to + * accomodate up to 128 remote connections + */ + if (tot_cnt > 0 && loc_cnt > 0) + rem_cnt = (uint32_t)(tot_cnt - loc_cnt); + else + rem_cnt = 128; + + cqes_per_qp = ep->verbs_ep.hfi_num_recv_wqes / VERBS_RECV_QP_FRACTION; + ep->verbs_ep.hfi_num_recv_cqes += rem_cnt * cqes_per_qp; + } } +#endif } ep->verbs_ep.recv_cq = ibv_create_cq(ep->verbs_ep.context, ep->verbs_ep.hfi_num_recv_cqes, @@ -354,7 +373,7 @@ psm3_verbs_parse_params(psm2_ep_t ep) "Number of recv CQEs to allocate\n" "(0 will calculate as PSM3_NUM_RECV_WQES+1032 for PSM3_RDMA=0-2\n" "for PSM3_RDMA=3 with SRQ, allow an additional PSM3_NUM_RECV_WQES\n" - "for PSM3_RDMA=3 without SRQ, allow an additional 4000) [0]", + "for PSM3_RDMA=3 without SRQ, calculate based on total QPs) [0]", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)0, &envvar_val); @@ -408,7 +427,7 @@ psm3_verbs_parse_params(psm2_ep_t ep) // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) // * psm3_mq_max_window_rv(mq, 0) // and automatically increase with warning if not? -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = psmi_parse_gpudirect_rv_gpu_cache_size(0); // TBD - we could check gpu_cache_size >= minimum based on: // (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) @@ -458,7 +477,7 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // PSM3_* env for SDMA are parsed later in psm3_ips_proto_init. 
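Editor's note: earlier in this verbs_ep.c hunk, the recv-CQ sizing replaces the old flat 4000-entry headroom with an estimate based on the number of remote RC QPs. A worked sketch of that arithmetic with illustrative values (4096 recv WQEs, 64 ranks of which 8 are local, and VERBS_RECV_QP_FRACTION assumed to be 4):

#include <stdint.h>

/* Illustrative numbers only; the formula mirrors the calculation in the
 * hunk above for PSM3_RDMA=3 without SRQ. */
static uint32_t recv_cqes_example(void)
{
	uint32_t num_recv_wqes = 4096;
	uint32_t base = num_recv_wqes + 32 /* HFI_TF_NFLOWS */ + 1000;    /* 5128 */
	uint32_t rem_cnt = 64 - 8;                /* total ranks minus local ranks */
	uint32_t cqes_per_qp = num_recv_wqes / 4; /* VERBS_RECV_QP_FRACTION, assumed 4 */

	return base + rem_cnt * cqes_per_qp;      /* 5128 + 56 * 1024 = 62472 */
}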
proto->iovec_thresh_eager = 8192; proto->iovec_thresh_eager_blocking = 8192; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU proto->iovec_gpu_thresh_eager = 128; proto->iovec_gpu_thresh_eager_blocking = 128; #endif @@ -469,37 +488,64 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // at this point ep->mtu is our HW capability found during open // and adjusted to allow for PSM headers so ep->mtu reflects maximum // PSM payload (not yet adjusted for optional cksum_sz) - /* See if user specifies a lower MTU to use */ - if (!psm3_getenv("PSM3_MTU", - "Upper bound on packet MTU (<=0 uses port MTU): 1-5,256,512,1024,2048,4096]", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)-1, &env_mtu)) { + char help[128]; + + if ((ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + snprintf(help, sizeof(help), "Upper bound on PSM3 payload (<=0 uses port MTU): 1-7, 1024-PSM3_MQ_RNDV_NIC_THRESH(%u)", ep->mq->rndv_nic_thresh); + } else { + snprintf(help, sizeof(help), "Upper bound on packet MTU (<=0 uses port MTU): 1-5,256,512,1024,2048,4096,8192"); + } + /* See if user specifies a MTU to use */ + if (!psm3_getenv("PSM3_MTU", help, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, + (union psmi_envvar_val)-1, &env_mtu)) { + uint32_t mtu; // in bytes // use OPA_MTU_MAX so we don't round down to min MTU when // OPA enum values mistakenly used here. - if (env_mtu.e_int >= IBTA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) //enum - env_mtu.e_int = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); - else if (env_mtu.e_int < IBTA_MTU_MIN) // pick default - env_mtu.e_int = 8192; // default high, will use wire MTU - else // wash through enum to force round up to next valid MTU - env_mtu.e_int = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int)); + if (env_mtu.e_int >= IBTA_MTU_MIN && env_mtu.e_int <= OPA_MTU_MAX) { //enum + mtu = opa_mtu_enum_to_int((enum opa_mtu)env_mtu.e_int); + } else if (env_mtu.e_int < IBTA_MTU_MIN) { // pick default + mtu = ep->mtu + MAX_PSM_HEADER; // use wire MTU + } else if ((ep->rdmamode & IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { // use as local PSM3 MTU + // Under RDMA3 mode, UD is used for ctr msg only that shall be smaller than + // wire MTU. It's safe to increase PSM3 MTU beyond wire MTU because RC will be + // used, and the NIC driver will segment a msg into multiple packets to ensure + // each pkt size is within wire MTU. 
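Editor's note: the code that follows applies PSM3_MTU for RDMA mode 3 exactly as the comment above describes. As a compact, self-contained restatement of those clamping rules (clamp_psm3_mtu and the example numbers are illustrative; ROUNDDOWNP2 is expressed as a mask, and the requested value is assumed positive):

#include <stdint.h>

/* Sketch: clamp the requested PSM3_MTU into [IBTA min MTU, rendezvous
 * threshold], round down to a 64-byte multiple, then reserve room for
 * the PSM header.  Example: PSM3_MTU=60000 with a 64 KB rendezvous
 * threshold rounds to 59968, so the eager payload is 59968 minus
 * MAX_PSM_HEADER. */
static uint32_t clamp_psm3_mtu(uint32_t requested, uint32_t rndv_thresh,
                               uint32_t ibta_min_mtu, uint32_t max_psm_header)
{
	uint32_t mtu = requested;

	if (mtu > rndv_thresh)
		mtu = rndv_thresh;          /* PSM3_MTU only applies to eager messages */
	if (mtu < ibta_min_mtu)
		mtu = ibta_min_mtu;
	mtu &= ~63u;                        /* round down to a multiple of 64 */
	return mtu - max_psm_header;        /* usable PSM payload */
}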
+ + mtu = env_mtu.e_int; + // only apply PSM3_MTU on eager messages + if (env_mtu.e_int > ep->mq->rndv_nic_thresh) + mtu = ep->mq->rndv_nic_thresh; + if (env_mtu.e_int < opa_mtu_enum_to_int(IBTA_MTU_MIN)) + mtu = opa_mtu_enum_to_int(IBTA_MTU_MIN); + // round down to nearest multiple of 64 + mtu = ROUNDDOWNP2(mtu, 64); + proto->epinfo.ep_mtu = mtu - MAX_PSM_HEADER; + } else { // walk through enum to force round up to next valid MTU + mtu = opa_mtu_enum_to_int(opa_mtu_int_to_enum(env_mtu.e_int)); + } + // only allow MTU decrease // PSM3_MTU specified ends up being used as max verbs payload // so decrease by PSM HEADER size (and cksum below) - if (ep->mtu > env_mtu.e_int - MAX_PSM_HEADER) - ep->mtu = env_mtu.e_int - MAX_PSM_HEADER; + if (ep->mtu > mtu - MAX_PSM_HEADER) + ep->mtu = mtu - MAX_PSM_HEADER; } + /* allow space for optional software managed checksum (for debug) */ ep->mtu -= cksum_sz; - // ep->mtu is our final choice of local PSM payload we can support - proto->epinfo.ep_mtu = ep->mtu; + // if proto->epinfo.ep_mtu is not set, use ep->mtu as our final choice + // of local PSM payload we can support + if (!proto->epinfo.ep_mtu) + proto->epinfo.ep_mtu = ep->mtu; if (PSM2_OK != psm_verbs_alloc_send_pool(ep, ep->verbs_ep.pd, &ep->verbs_ep.send_pool, // save 1 send WQE just to be paranoid (should be unnecessary) min(ep->verbs_ep.hfi_num_send_wqes, ep->verbs_ep.qp_cap.max_send_wr-1), // want to end up with multiple of cache line (64) - // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // proto->epinfo.ep_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU // be conservative (+BUFFER_HEADROOM) - ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM + proto->epinfo.ep_mtu + MAX_PSM_HEADER + BUFFER_HEADROOM )) { _HFI_ERROR( "Unable to allocate UD send buffer pool\n"); goto fail; @@ -516,9 +562,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) if (PSM2_OK != psm_verbs_alloc_recv_pool(ep, 0, ep->verbs_ep.qp, &ep->verbs_ep.recv_pool, min(ep->verbs_ep.hfi_num_recv_wqes, ep->verbs_ep.qp_cap.max_recv_wr), // want to end up with multiple of cache line (64) - // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // proto->epinfo.ep_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU // be conservative (+BUFFER_HEADROOM) - ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM + proto->epinfo.ep_mtu + MAX_PSM_HEADER + BUFFER_HEADROOM )) { _HFI_ERROR( "Unable to allocate UD recv buffer pool\n"); goto fail; @@ -529,9 +575,9 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) ep->verbs_ep.hfi_num_recv_wqes, (proto->ep->rdmamode == IPS_PROTOEXP_FLAG_RDMA_USER)? 
0 // want to end up with multiple of cache line (64) - // ep->mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU + // proto->epinfo.ep_mtu+MAX_PSM_HEADERS will be power of 2 verbs MTU // be conservative (+BUFFER_HEADROOM) - : (ep->mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) + : (proto->epinfo.ep_mtu + MAX_PSM_HEADER + BUFFER_HEADROOM) )) { _HFI_ERROR( "Unable to allocate SRQ recv buffer pool\n"); goto fail; @@ -545,10 +591,10 @@ psm3_verbs_ips_proto_init(struct ips_proto *proto, uint32_t cksum_sz) // no send segmentation, max_segs will constrain ep->chunk_max_segs = 1; - ep->chunk_max_size = ep->mtu; + ep->chunk_max_size = proto->epinfo.ep_mtu; #ifdef PSM_BYTE_FLOW_CREDITS // let flow_credits be the control - proto->flow_credit_bytes = ep->mtu * proto->max_credits; + proto->flow_credit_bytes = proto->epinfo.ep_mtu * proto->max_credits; _HFI_DBG("initial flow_credits %d bytes %d\n", proto->flow_credits, proto->flow_credit_bytes); #else @@ -874,25 +920,9 @@ psm2_error_t psm_verbs_alloc_send_pool(psm2_ep_t ep, struct ibv_pd *pd, _HFI_ERROR( "can't alloc send buffers"); goto fail; } -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - // By registering memory with Cuda, we make - // cuMemcpy run faster for copies from - // GPU to the send buffer. - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, - pool->send_buffers, - pool->send_total*pool->send_buffer_size, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // By registering memory with Level Zero, we make - // zeCommandListAppendMemoryCopy run faster for copies from - // GPU to the send buffer. - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, - ze_driver, pool->send_buffers, + // This can allows faster copies from GPU to the send buffer + PSM3_GPU_REGISTER_HOSTMEM( pool->send_buffers, pool->send_total*pool->send_buffer_size); -#endif _HFI_PRDBG("send pool: buffers: %p size %u\n", pool->send_buffers, pool->send_buffer_size); pool->send_bufs = (struct verbs_sbuf *)psmi_calloc(ep, NETWORK_BUFFERS, @@ -993,25 +1023,9 @@ psm2_error_t psm_verbs_alloc_recv_pool(psm2_ep_t ep, uint32_t for_srq, _HFI_ERROR( "can't alloc recv buffers"); goto fail; } -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - // By registering memory with Cuda, we make - // cuMemcpy run faster for copies from - // recv buffer to GPU - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, - pool->recv_buffers, - pool->recv_total*pool->recv_buffer_size, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // By registering memory with Level Zero, we make - // zeCommandListAppendMemoryCopy run faster for copies from - // recv buffer to GPU - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, - ze_driver, pool->recv_buffers, - pool->recv_total*pool->recv_buffer_size); -#endif + // This can allow faster copies from recv buffer to GPU + PSM3_GPU_REGISTER_HOSTMEM(pool->recv_buffers, + pool->recv_total*pool->recv_buffer_size); //printf("recv pool: buffers: %p size %u\n", pool->recv_buffers, pool->recv_buffer_size); #ifdef USE_RC pool->recv_bufs = (struct verbs_rbuf *)psmi_calloc(ep, NETWORK_BUFFERS, @@ -1104,38 +1118,7 @@ void psm_verbs_free_send_pool(psm3_verbs_send_pool_t pool) pool->send_bufs = NULL; } if (pool->send_buffers) { -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED 
in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, pool->send_buffers); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister(pool->send_buffers); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - // ze_driver, pool->send_buffers); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - pool->send_buffers); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } -#endif + PSM3_GPU_UNREGISTER_HOSTMEM(pool->send_buffers); psmi_free(pool->send_buffers); pool->send_buffers = NULL; } @@ -1156,37 +1139,7 @@ void psm_verbs_free_recv_pool(psm3_verbs_recv_pool_t pool) } #endif if (pool->recv_buffers) { -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, pool->recv_buffers); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister(pool->recv_buffers); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, - // ze_driver, pool->recv_buffers); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - pool->recv_buffers); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } -#endif + PSM3_GPU_UNREGISTER_HOSTMEM(pool->recv_buffers); psmi_free(pool->recv_buffers); pool->recv_buffers = NULL; } @@ -1667,7 +1620,7 @@ extern int ips_protoexp_rdma_write_completion( uint64_t wr_id); psm2_error_t psm3_verbs_completion_update(psm2_ep_t ep, int drain) { - #define CQE_BATCH 10 // reap a few at a time, hopefully faster this way + #define CQE_BATCH 32 // reap a few at a time, hopefully faster this way //#define CQE_BATCH 8 or 18 // reap a few at a time, hopefully faster this way // 18*COALLESE > default reap threshold so we // should get away with one poll_q @@ -1677,6 +1630,9 @@ psm3_verbs_completion_update(psm2_ep_t ep, int drain) // alloca(sizeof(ibv_wc) & batch) struct ibv_wc wc[CQE_BATCH]; int ne; +#ifdef USE_RC + struct ips_epaddr *ipsaddr; +#endif PSMI_LOCK_ASSERT(ep->mq->progress_lock); // TBD - when coallescing completions we'll tend to fall through to poll_cq @@ -1738,6 +1694,12 @@ psm3_verbs_completion_update(psm2_ep_t ep, int drain) ips_protoexp_rdma_write_completion( wc[i].wr_id & 
~VERBS_SQ_WR_ID_MASK); break; + case IBV_WC_RDMA_READ: + ipsaddr = (struct ips_epaddr *)wc[i].wr_id; + + ipsaddr->verbs.remote_seq_outstanding = 0; + _HFI_VDBG("Got remote_recv_psn=%d\n", ipsaddr->verbs.remote_recv_psn); + break; #endif default: _HFI_ERROR("unexpected send completion on %s port %u opcode %d QP %u\n", @@ -2197,15 +2159,15 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) // we always fill in everything we might need in loc_info // in some modes, some of the fields are not used by RV loc_info.mr_cache_size = ep->rv_mr_cache_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* gpu_cache_size ignored unless RV_RDMA_MODE_GPU */ loc_info.gpu_cache_size = ep->rv_gpu_cache_size; #endif loc_info.rdma_mode = IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode)? RV_RDMA_MODE_KERNEL: RV_RDMA_MODE_USER; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { - // when Cuda is enabled we will have larger window_sz and +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { + // when GPU is enabled we will have larger window_sz and // need to upsize the caches we will use for priority MRs if (ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED) { // priority window_sz reg_mr for CPU @@ -2214,9 +2176,9 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) if (psmi_parse_gpudirect()) { // When GPU Direct is enabled we need a GPU Cache loc_info.rdma_mode |= RV_RDMA_MODE_GPU; -#ifdef PSM_ONEAPI - psm3_oneapi_ze_can_use_zemem(); -#endif + + PSM3_GPU_USING_RV_FOR_MRS(); + if ((ep->rdmamode & IPS_PROTOEXP_FLAG_ENABLED) && (psmi_parse_gpudirect_rdma_send_limit(1) || psmi_parse_gpudirect_rdma_recv_limit(1))) { @@ -2267,7 +2229,7 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) } // parallel hal_gen1/gen1_hal_inline_i.h handling HFI1_CAP_GPUDIRECT_OT #ifndef RV_CAP_GPU_DIRECT -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. 
Must use GPU enabled rv headers" #else // lifted from rv_user_ioctls.h @@ -2281,15 +2243,12 @@ static psm2_error_t open_rv(psm2_ep_t ep, psm2_uuid_t const job_key) psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT_SDMA); psmi_hal_add_cap(PSM_HAL_CAP_GPUDIRECT_RDMA); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (loc_info.capability & RV_CAP_NVIDIA_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_NVIDIA_GPU); - if (loc_info.capability & RV_CAP_INTEL_GPU) - psmi_hal_add_cap(PSM_HAL_CAP_INTEL_GPU); +#ifdef PSM_HAVE_GPU + PSM3_GPU_RV_SET_HAL_CAP(loc_info.capability); #endif ep->verbs_ep.rv_index = loc_info.rv_index; ep->rv_mr_cache_size = loc_info.mr_cache_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = loc_info.gpu_cache_size; #endif ep->verbs_ep.rv_q_depth = loc_info.q_depth; @@ -2442,6 +2401,25 @@ static psm2_error_t verbs_open_dev(psm2_ep_t ep, int unit, int port, int addr_in psm3_gid128_fmt(ep->gid, 2)); } +#if defined(USE_RDMA_READ) +#if defined(USE_RC) + { + struct ibv_device_attr dev_attr; + // get RDMA capabilities of device + if (ibv_query_device(ep->verbs_ep.context, &dev_attr)) { + _HFI_ERROR("Unable to query device %s: %s\n", ep->dev_name, + strerror(errno)); + err = PSM2_INTERNAL_ERR; + goto fail; + } + ep->verbs_ep.max_qp_rd_atom = dev_attr.max_qp_rd_atom; + ep->verbs_ep.max_qp_init_rd_atom = dev_attr.max_qp_init_rd_atom; + _HFI_PRDBG("got device attr: rd_atom %u init_rd_atom %u\n", + dev_attr.max_qp_rd_atom, dev_attr.max_qp_init_rd_atom); + // TBD could have an env variable to reduce requested values + } +#endif // USE_RC +#endif #ifdef RNDV_MOD if (IPS_PROTOEXP_FLAG_KERNEL_QP(ep->rdmamode) || ep->mr_cache_mode == MR_CACHE_MODE_KERNEL ) { @@ -2774,6 +2752,9 @@ psm2_error_t modify_rc_qp_to_init(psm2_ep_t ep, struct ibv_qp *qp) //attr.qkey = ep->verbs_ep.qkey; //flags |= IBV_QP_QKEY; // only allowed for UD attr.qp_access_flags = 0; +#ifdef USE_RDMA_READ + attr.qp_access_flags |= IBV_ACCESS_REMOTE_READ; +#endif attr.qp_access_flags |= IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_LOCAL_WRITE; //attr.qp_access_flags |= IBV_ACCESS_REMOTE_ATOMIC; flags |= IBV_QP_ACCESS_FLAGS; @@ -2804,11 +2785,15 @@ psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, // TBD - we already factored in req vs pr to update pr no need // for modify_cq_qp_to_rtr to repeat it // pr_mtu is max PSM payload in bytes and req_attr_mtu is IB enum - attr.path_mtu = MIN(ibv_mtu_int_to_enum(path_rec->pr_mtu), req_attr->mtu); + attr.path_mtu = MIN(ibv_mtu_int_to_enum(ep->mtu), req_attr->mtu); attr.dest_qp_num = req_attr->qpn; attr.rq_psn = initpsn; flags |= (IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | IBV_QP_RQ_PSN); +#ifdef USE_RDMA_READ + attr.max_dest_rd_atomic = min(ep->verbs_ep.max_qp_rd_atom, + req_attr->initiator_depth); +#endif _HFI_PRDBG("set max_dest_rd_atomic to %u\n", attr.max_dest_rd_atomic); attr.min_rnr_timer = 12; // TBD well known flags |= (IBV_QP_MIN_RNR_TIMER | IBV_QP_MAX_DEST_RD_ATOMIC); @@ -2818,7 +2803,7 @@ psm2_error_t modify_rc_qp_to_rtr(psm2_ep_t ep, struct ibv_qp *qp, ep->dev_name, strerror(errno)); return PSM2_INTERNAL_ERR; } - _HFI_PRDBG("moved %d to RTR\n", qp->qp_num); + _HFI_PRDBG("moved %d to RTR with MTU=%d\n", qp->qp_num, attr.path_mtu); return PSM2_OK; } @@ -2836,6 +2821,10 @@ psm2_error_t modify_rc_qp_to_rts(psm2_ep_t ep, struct ibv_qp *qp, attr.sq_psn = initpsn; // value we told other side flags |= IBV_QP_SQ_PSN; +#ifdef USE_RDMA_READ + attr.max_rd_atomic = min(ep->verbs_ep.max_qp_init_rd_atom, + req_attr->responder_resources); +#endif _HFI_PRDBG("set 
max_rd_atomic to %u\n", attr.max_rd_atomic); flags |= IBV_QP_MAX_QP_RD_ATOMIC; @@ -2886,9 +2875,9 @@ unsigned psm3_verbs_parse_rdmamode(int reload) if (psm3_rv_available()) { default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // GPUDIRECT causes default_value of RDMA=1 - if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect()) + if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect()) default_value = IPS_PROTOEXP_FLAG_RDMA_KERNEL; #endif #endif @@ -2950,8 +2939,8 @@ unsigned psm3_verbs_parse_mr_cache_mode(unsigned rdmamode, int reload) // PSM_HAL_CAP_GPUDIRECT_* flags not known until after HAL device open, // so we test SDMA and RDMA here as prereqs for GPUDIRECT_SDMA and RDMA. if (! (rdmamode & IPS_PROTOEXP_FLAG_ENABLED) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && (PSMI_IS_GPU_DISABLED || ! psmi_parse_gpudirect() +#ifdef PSM_HAVE_GPU + && (! PSM3_GPU_IS_ENABLED || ! psmi_parse_gpudirect() //verbs always has these HAL capabilities set //|| (!psmi_hal_has_cap(PSM_HAL_CAP_SDMA) // && !psmi_hal_has_cap(PSM_HAL_CAP_RDMA))) @@ -2962,9 +2951,9 @@ unsigned psm3_verbs_parse_mr_cache_mode(unsigned rdmamode, int reload) } else if (IPS_PROTOEXP_FLAG_KERNEL_QP(rdmamode)) { // RDMA enabled in kernel mode. Must use rv MR cache envval.e_uint = MR_CACHE_MODE_RV; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_HAVE_RNDV_MOD - } else if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect()) { + } else if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect()) { // GPU Direct (RDMA, send DMA and/or gdrcopy) must // use kernel MR cache in RV envval.e_uint = MR_CACHE_MODE_KERNEL; diff --git a/prov/psm3/psm3/hal_verbs/verbs_ep.h b/prov/psm3/psm3/hal_verbs/verbs_ep.h index c1da6b73e53..e85e5776f36 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_ep.h +++ b/prov/psm3/psm3/hal_verbs/verbs_ep.h @@ -107,7 +107,7 @@ // if 1, post as we recv them #define VERBS_SEND_CQ_REAP 256 // check for completions when this many unreaped #define VERBS_PORT 1 // default port if not specified -#define VERBS_RECV_CQE_BATCH 1 // how many CQEs to ask for at a time +#define VERBS_RECV_CQE_BATCH 32 // how many CQEs to ask for at a time #define UD_ADDITION (40) // extra bytes at start of UD recv buffer // defined in verbs API to accomidate IB GRH #define BUFFER_HEADROOM 0 // how much extra to allocate in buffers @@ -310,19 +310,25 @@ struct psm3_verbs_ep { uint32_t qkey; //uint8_t link_layer; // IBV_LINK_LAYER_ETHERNET or other uint8_t active_rate; +#if defined(USE_RDMA_READ) +#if defined(USE_RC) + uint8_t max_qp_rd_atom; + uint8_t max_qp_init_rd_atom; +#endif // USE_RC +#endif struct psm3_verbs_send_pool send_pool; struct psm3_verbs_send_allocator send_allocator; uint32_t send_rdma_outstanding; // number of outstanding RDMAs uint32_t send_reap_thresh; // TBD if should be here or in pool struct psm3_verbs_recv_pool recv_pool; +#ifdef USE_RC + struct psm3_verbs_recv_pool srq_recv_pool; +#endif #if VERBS_RECV_CQE_BATCH > 1 struct ibv_wc recv_wc_list[VERBS_RECV_CQE_BATCH]; int recv_wc_count; // number left in recv_wc_list int recv_wc_next; // next index #else -#ifdef USE_RC - struct psm3_verbs_recv_pool srq_recv_pool; -#endif // if asked to revisit a packet we save it here rbuf_t revisit_buf; uint32_t revisit_payload_size; diff --git a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c index ab0942e5497..38a8dfce702 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c +++ b/prov/psm3/psm3/hal_verbs/verbs_gdrcpy.c @@ -51,8 +51,9 @@ OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) #include "psm_user.h" + +#ifdef PSM_HAVE_GPU #include "psm2_hal.h" #include #include @@ -66,10 +67,11 @@ psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep) { +#ifdef RNDV_MOD void *host_addr_buf; uintptr_t pageaddr; uint64_t pagelen; -#ifdef RNDV_MOD + // when PSM3_MR_ACCESS is enabled, we use the same access flags for // gdrcopy as we use for user space GPU MRs. This can improve MR cache // hit rate. Note the actual mmap is always for CPU read/write access. @@ -79,19 +81,10 @@ psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, // both tend to be smaller buffers, this may provide a better hit rate. int access = IBV_ACCESS_IS_GPU_ADDR |(ep->mr_access?IBV_ACCESS_LOCAL_WRITE|IBV_ACCESS_REMOTE_WRITE:0); -#endif -#ifdef PSM_ONEAPI - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, - (const void *)buf, (void **)&pageaddr, &pagelen); -#else - pageaddr = buf & GPU_PAGE_MASK; - pagelen = (uint64_t) (PSMI_GPU_PAGESIZE + - ((buf + size - 1) & GPU_PAGE_MASK) - pageaddr); -#endif + PSM3_GPU_ROUNDUP_GDRCOPY(buf, size, &pageaddr, &pagelen); _HFI_VDBG("buf=%p size=%zu pageaddr=%p pagelen=%"PRIu64" flags=0x%x ep=%p\n", (void *)buf, size, (void *)pageaddr, pagelen, flags, ep); -#ifdef RNDV_MOD ep = ep->mctxt_master; host_addr_buf = psm3_rv_pin_and_mmap(ep->rv, pageaddr, pagelen, access); if_pf (! host_addr_buf) { @@ -104,16 +97,12 @@ psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, return NULL; } //_HFI_ERROR("pinned buf=%p size=%zu pageaddr=%p pagelen=%u access=0x%x flags=0x%x ep=%p, @ %p\n", (void *)buf, size, (void *)pageaddr, pagelen, access, flags, ep, host_addr_buf); + return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); #else psmi_assert_always(0); // unimplemented, should not get here - host_addr_buf = NULL; + return NULL; #endif /* RNDV_MOD */ -#ifdef PSM_ONEAPI - return (void *)((uintptr_t)host_addr_buf + (buf - pageaddr)); -#else - return (void *)((uintptr_t)host_addr_buf + (buf & GPU_PAGE_OFFSET_MASK)); -#endif } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_VERBS */ diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.c b/prov/psm3/psm3/hal_verbs/verbs_hal.c index 69d27478b48..4cc441d4402 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.c +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.c @@ -96,32 +96,12 @@ static const char* psm3_hfp_verbs_identify(void) static char buf[100]; #ifdef RNDV_MOD -/* we test NVIDIA_GPU_DIRECT here instead of PSM_CUDA since that define - * controls the rv module ioctl header file interface - */ -#ifdef NVIDIA_GPU_DIRECT - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d cuda", + snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%u.%u" PSM3_GPU_FMT_RV_GPU_VER, psmi_hal_get_hal_instance_name(), psmi_hal_get_hal_instance_description(), psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version(), - psm3_rv_get_gpu_user_major_bldtime_version(), - psm3_rv_get_gpu_user_minor_bldtime_version()); -#elif defined(INTEL_GPU_DIRECT) - snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d gpu v%d.%d oneapi-ze", - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version(), - psm3_rv_get_gpu_user_major_bldtime_version(), - psm3_rv_get_gpu_user_minor_bldtime_version()); -#else 
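
The USE_RDMA_READ changes above size the RC QPs' RDMA Read depth from both ends: ibv_query_device() supplies the local max_qp_rd_atom / max_qp_init_rd_atom, the connect request carries them as responder_resources / initiator_depth, and the RTR/RTS transitions take the minimum of the local limit and the peer's advertised value. A minimal sketch of that negotiation, assuming the peer's values have already arrived; the peer_* parameters and the helper name are placeholders, not PSM3 symbols, and in PSM3 the results are applied as part of the full INIT->RTR and RTR->RTS ibv_modify_qp() transitions rather than standalone.

    #include <infiniband/verbs.h>

    static int example_negotiate_rd_atomic(struct ibv_context *ctx,
                                           uint8_t peer_responder_resources,
                                           uint8_t peer_initiator_depth,
                                           uint8_t *max_dest_rd_atomic, /* for RTR */
                                           uint8_t *max_rd_atomic)      /* for RTS */
    {
            struct ibv_device_attr dev_attr;

            if (ibv_query_device(ctx, &dev_attr))
                    return -1;

            /* Reads we can serve as responder: bounded by our device limit and
             * by how many the peer may have outstanding toward us. */
            *max_dest_rd_atomic = dev_attr.max_qp_rd_atom < peer_initiator_depth ?
                                  dev_attr.max_qp_rd_atom : peer_initiator_depth;

            /* Reads we may issue as initiator: bounded by our limit and by the
             * peer's advertised responder resources. */
            *max_rd_atomic = dev_attr.max_qp_init_rd_atom < peer_responder_resources ?
                             dev_attr.max_qp_init_rd_atom : peer_responder_resources;
            return 0;
    }
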
- snprintf(buf, sizeof(buf), "HAL: %s (%s) built against rv interface v%d.%d", - psmi_hal_get_hal_instance_name(), - psmi_hal_get_hal_instance_description(), - psm3_rv_get_user_major_bldtime_version(), - psm3_rv_get_user_minor_bldtime_version()); -#endif + psm3_rv_get_user_minor_bldtime_version() + PSM3_GPU_OUT_RV_GPU_VER); #else /* RNDV_MOD */ snprintf(buf, sizeof(buf), "HAL: %s (%s)", psmi_hal_get_hal_instance_name(), @@ -174,15 +154,14 @@ static void psm3_hfp_verbs_mq_init_defaults(struct psm2_mq *mq) mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous } mq->hfi_thresh_tiny = PSM_MQ_NIC_MAX_TINY; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) - mq->ips_gpu_window_rv_str = PSM_GPU_NIC_RNDV_WINDOW_STR; +#ifdef PSM_HAVE_GPU + mq->ips_gpu_window_rv_str = psm3_gpu_rndv_nic_window_default; #endif // we parse mr_cache_mode and rv_gpu_cache_size here so we can cache it // once per EP open, even if multi-rail or multi-QP (void)psm3_verbs_parse_mr_cache_mode(rdmamode, 1); #ifdef RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU (void)psmi_parse_gpudirect_rv_gpu_cache_size(1); #endif #endif @@ -196,7 +175,7 @@ static void psm3_hfp_verbs_ep_open_opts_get_defaults(struct psm3_ep_open_opts *o opts->imm_size = VERBS_SEND_MAX_INLINE; // PSM header size is 56 } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psm3_hfp_verbs_gdr_open(void) { } @@ -249,13 +228,7 @@ static hfp_verbs_t psm3_verbs_hi = { /* start of public psmi_hal_instance_t data */ .phi = { .hal_index = PSM_HAL_INDEX_VERBS, - .description = "RDMA Verbs" -#ifdef PSM_CUDA - " (cuda)" -#elif defined(PSM_ONEAPI) - " (oneapi-ze)" -#endif - , + .description = "RDMA Verbs" PSM3_GPU_TYPES, .nic_sys_class_path = "/sys/class/infiniband", .nic_sys_port_path_fmt = PSM3_PORT_PATH_TYPE_IB, .params = {0}, @@ -274,7 +247,7 @@ static hfp_verbs_t psm3_verbs_hi = { .hfp_mq_init_defaults = psm3_hfp_verbs_mq_init_defaults, .hfp_ep_open_opts_get_defaults = psm3_hfp_verbs_ep_open_opts_get_defaults, .hfp_context_initstats = psm3_hfp_verbs_context_initstats, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_open = psm3_hfp_verbs_gdr_open, #endif @@ -316,10 +289,10 @@ static hfp_verbs_t psm3_verbs_hi = { .hfp_ips_ibta_init = psm3_hfp_verbs_ips_ibta_init, .hfp_ips_path_rec_init = psm3_hfp_verbs_ips_path_rec_init, .hfp_ips_ptl_pollintr = psm3_hfp_verbs_ips_ptl_pollintr, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_close = psm3_hfp_verbs_gdr_close, .hfp_gdr_convert_gpu_to_host_addr = psm3_hfp_verbs_gdr_convert_gpu_to_host_addr, -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ .hfp_get_port_index2pkey = psm3_hfp_verbs_get_port_index2pkey, .hfp_poll_type = psm3_hfp_verbs_poll_type, .hfp_spio_transfer_frame = psm3_hfp_verbs_spio_transfer_frame, diff --git a/prov/psm3/psm3/hal_verbs/verbs_hal.h b/prov/psm3/psm3/hal_verbs/verbs_hal.h index ae18c675a28..1c5d6f75cab 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal.h @@ -87,11 +87,11 @@ psm3_verbs_recvhdrq_init(const struct ips_epstate *epstate, psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void* psm3_verbs_gdr_convert_gpu_to_host_addr(unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* _PSM_HAL_VERBS_HAL_H */ #endif /* PSM_VERBS */ diff --git 
a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h index 8ef06d9ae97..7cf2a25a707 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h +++ b/prov/psm3/psm3/hal_verbs/verbs_hal_inline_i.h @@ -238,12 +238,12 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( const struct ips_connect_reqrep *req) { #ifdef RNDV_MOD - ipsaddr->verbs.remote_gid = req->verbs.gid; - ipsaddr->verbs.remote_rv_index = req->verbs.rv_index; + ipsaddr->verbs.remote_gid = req->verbs.rv.gid; + ipsaddr->verbs.remote_rv_index = req->verbs.rv.rv_index; if (ipsaddr->verbs.rv_conn) { psmi_assert(IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode)); psmi_assert(proto->ep->rv); - if (! psm3_nonzero_gid(&req->verbs.gid)) { + if (! psm3_nonzero_gid(&req->verbs.rv.gid)) { _HFI_ERROR("mismatched PSM3_RDMA config, remote end not in mode 1\n"); return PSM2_INTERNAL_ERR; // TBD - if we wanted to allow mismatched config to run in UD mode @@ -266,7 +266,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( ipsaddr->pathgrp->pg_path[0][IPS_PATH_LOW_PRIORITY]->verbs.pr_connecting = 1; } } - // } else if (psm3_nonzero_gid(&req->verbs.gid)) { + // } else if (psm3_nonzero_gid(&req->verbs.rv.gid)) { // We could fail here, but we just let remote end decide // _HFI_ERROR("mismatched PSM3_RDMA config, remote end in mode 1\n"); // return PSM2_INTERNAL_ERR; @@ -305,6 +305,9 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_set_req_params( } } + ipsaddr->verbs.remote_recv_seq_addr = req->verbs.urc.recv_addr; + ipsaddr->verbs.remote_recv_seq_rkey = req->verbs.urc.recv_rkey; + if (modify_rc_qp_to_init(proto->ep, ipsaddr->verbs.rc_qp)) { _HFI_ERROR("qp_to_init failed\n"); return PSM2_INTERNAL_ERR; @@ -383,27 +386,41 @@ static PSMI_HAL_INLINE void psm3_hfp_verbs_ips_proto_build_connect_message( // only supply gid if we want to use kernel rv if (IPS_PROTOEXP_FLAG_KERNEL_QP(proto->ep->rdmamode) && proto->ep->rv) { - req->verbs.gid = proto->ep->verbs_ep.lgid; - req->verbs.rv_index = proto->ep->verbs_ep.rv_index; + req->verbs.rv.gid = proto->ep->verbs_ep.lgid; + req->verbs.rv.rv_index = proto->ep->verbs_ep.rv_index; } else #endif { - memset(&req->verbs.gid, 0, sizeof(req->verbs.gid)); - req->verbs.rv_index = 0; + memset(&req->verbs.rv.gid, 0, sizeof(req->verbs.rv.gid)); + req->verbs.rv.rv_index = 0; } #if defined(USE_RC) if (ipsaddr->verbs.rc_qp) { psmi_assert(IPS_PROTOEXP_FLAG_USER_RC_QP(proto->ep->rdmamode)); req->initpsn = proto->runid_key;// pid, not ideal, better than const req->verbs.qp_attr.qpn = ipsaddr->verbs.rc_qp->qp_num; - req->verbs.qp_attr.mtu = opa_mtu_int_to_enum(req->mtu); + req->verbs.qp_attr.mtu = opa_mtu_int_to_enum(proto->ep->mtu); req->verbs.qp_attr.srq = 0; req->verbs.qp_attr.resv = 0; req->verbs.qp_attr.target_ack_delay = 0; // TBD; - from local device req->verbs.qp_attr.resv2 = 0; +#ifdef USE_RDMA_READ + // Send our RDMA Read capabilities + req->verbs.qp_attr.responder_resources = proto->ep->verbs_ep.max_qp_rd_atom; + req->verbs.qp_attr.initiator_depth = proto->ep->verbs_ep.max_qp_init_rd_atom; +#else req->verbs.qp_attr.responder_resources = 0; req->verbs.qp_attr.initiator_depth = 0; +#endif memset(&req->verbs.qp_attr.resv3, 0, sizeof(req->verbs.qp_attr.resv3)); + + if (IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + req->verbs.urc.recv_addr = (uintptr_t)ipsaddr->verbs.recv_seq_mr->addr; + req->verbs.urc.recv_rkey = ipsaddr->verbs.recv_seq_mr->rkey; + } else { + 
req->verbs.urc.recv_addr = 0; + req->verbs.urc.recv_rkey = 0; + } } else #endif // USE_RC memset(&req->verbs.qp_attr, 0, sizeof(req->verbs.qp_attr)); @@ -489,6 +506,28 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_init_connections( ipsaddr->verbs.use_allocator = &proto->ep->verbs_ep.send_allocator; ipsaddr->verbs.use_qp = proto->ep->verbs_ep.qp; ipsaddr->verbs.use_max_inline_data = proto->ep->verbs_ep.qp_cap.max_inline_data; + + if (IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC) { + struct ips_flow *flow = &ipsaddr->flows[EP_FLOW_GO_BACK_N_PIO]; + + ipsaddr->verbs.recv_seq_mr = ibv_reg_mr(proto->ep->verbs_ep.pd, + &flow->recv_seq_num, sizeof(flow->recv_seq_num), + IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_READ); + if (!ipsaddr->verbs.recv_seq_mr) { + _HFI_ERROR("Unable register recv_seq_num MR on %s: %s\n", + proto->ep->dev_name, strerror(errno)); + goto fail; + } + + ipsaddr->verbs.remote_recv_psn_mr = ibv_reg_mr(proto->ep->verbs_ep.pd, + &ipsaddr->verbs.remote_recv_psn, sizeof(ipsaddr->verbs.remote_recv_psn), + IBV_ACCESS_LOCAL_WRITE); + if (!ipsaddr->verbs.remote_recv_psn_mr) { + _HFI_ERROR("Unable register remote_recv_psn MR on %s: %s\n", + proto->ep->dev_name, strerror(errno)); + goto fail; + } + } #endif #ifdef RNDV_MOD @@ -542,6 +581,14 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ipsaddr_init_connections( rc_qp_destroy(ipsaddr->verbs.rc_qp); ipsaddr->verbs.rc_qp = NULL; } + if (ipsaddr->verbs.recv_seq_mr) { + ibv_dereg_mr(ipsaddr->verbs.recv_seq_mr); + ipsaddr->verbs.recv_seq_mr = NULL; + } + if (ipsaddr->verbs.remote_recv_psn_mr) { + ibv_dereg_mr(ipsaddr->verbs.remote_recv_psn_mr); + ipsaddr->verbs.remote_recv_psn_mr = NULL; + } #endif return err; } @@ -650,7 +697,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_ips_ptl_pollintr( next_timeout, pollok, pollcyc, pollintr); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static PSMI_HAL_INLINE void psm3_hfp_verbs_gdr_close(void) { } @@ -661,7 +708,7 @@ static PSMI_HAL_INLINE void* psm3_hfp_verbs_gdr_convert_gpu_to_host_addr(unsigne return psm3_verbs_gdr_convert_gpu_to_host_addr(buf, size, flags, ep); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #include "verbs_spio.c" @@ -670,7 +717,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_spio_transfer_frame(struct ip uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -678,7 +725,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_spio_transfer_frame(struct ip return psm3_verbs_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); @@ -689,7 +736,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_transfer_frame(struct ips_pro uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -697,7 +744,7 @@ static PSMI_HAL_INLINE psm2_error_t psm3_hfp_verbs_transfer_frame(struct ips_pro return psm3_verbs_spio_transfer_frame(proto, flow, scb, payload, length, isCtrlMsg, cksum_valid, cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , is_gpu_payload #endif ); diff --git a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c 
b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c index f38aa505fc8..9f5d867ba54 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c +++ b/prov/psm3/psm3/hal_verbs/verbs_recvhdrq.c @@ -222,7 +222,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) break; else if_pf (err < 0) { if (errno == EAGAIN || errno == EWOULDBLOCK - || errno == EBUSY || errno = EINTR) + || errno == EBUSY || errno == EINTR) break; _HFI_ERROR("failed ibv_poll_cq '%s' (%d) on %s port %u epid %s\n", strerror(errno), errno, ep->dev_name, ep->portnum, psm3_epid_fmt_internal(ep->epid, 0)); @@ -360,7 +360,7 @@ psm2_error_t psm3_verbs_recvhdrq_progress(struct ips_recvhdrq *recvq) break; } #if VERBS_RECV_CQE_BATCH > 1 - } while(! done); + } while(ep->verbs_ep.recv_wc_count || !done); #else } #endif diff --git a/prov/psm3/psm3/hal_verbs/verbs_spio.c b/prov/psm3/psm3/hal_verbs/verbs_spio.c index f12478ef70c..d1cf283b162 100644 --- a/prov/psm3/psm3/hal_verbs/verbs_spio.c +++ b/prov/psm3/psm3/hal_verbs/verbs_spio.c @@ -67,6 +67,44 @@ #include "ips_proto_internal.h" #include "ips_proto_params.h" +#ifdef USE_RC +static inline psm2_error_t +psm3_verbs_get_remote_psn(psm2_ep_t ep, struct ips_epaddr *ipsaddr) { + psm2_error_t ret = PSM2_OK; + + struct ibv_send_wr wr; + struct ibv_send_wr *bad_wr; + struct ibv_sge list; + + // set local location to store received data + list.addr = (uintptr_t)ipsaddr->verbs.remote_recv_psn_mr->addr; + list.length = sizeof(ipsaddr->verbs.remote_recv_psn); + list.lkey = ipsaddr->verbs.remote_recv_psn_mr->lkey; + + wr.next = NULL; // just post 1 + wr.wr_id = (uintptr_t)ipsaddr; + wr.sg_list = &list; + wr.num_sge = 1; // size of sg_list + wr.opcode = IBV_WR_RDMA_READ; + + // set remote location where to read data from + wr.wr.rdma.remote_addr = ipsaddr->verbs.remote_recv_seq_addr; + wr.wr.rdma.rkey = ipsaddr->verbs.remote_recv_seq_rkey; + wr.send_flags = IBV_SEND_SIGNALED; + + if_pf (ibv_post_send(ipsaddr->verbs.rc_qp, &wr, &bad_wr)) { + if (errno != EBUSY && errno != EAGAIN && errno != ENOMEM) + _HFI_ERROR("failed to get remote psn num on %s port %u: %s\n", + ep->dev_name, ep->portnum, strerror(errno)); + return PSM2_EP_NO_RESOURCES; + } + ipsaddr->verbs.remote_seq_outstanding = 1; + _HFI_VDBG("posted remote_recv_psn RDMA READ: from 0x%"PRIx64" to 0x%"PRIx64" len %u rkey 0x%x\n", + wr.wr.rdma.remote_addr, list.addr, list.length, wr.wr.rdma.rkey); + return ret; +} +#endif + // TBD we could get also get scb->cksum out of scb // when called: // scb->ips_lrh has fixed size PSM header including OPA LRH @@ -100,7 +138,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, struct ips_scb *scb, uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ) @@ -148,6 +186,36 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, #endif // PSM_FI PSMI_LOCK_ASSERT(proto->mq->progress_lock); psmi_assert_always(! 
cksum_valid); // no software checksum yet + +#ifdef USE_RC + if (!isCtrlMsg && flow->ipsaddr->verbs.use_qp->qp_type == IBV_QPT_RC && proto->max_credits < IPS_PROTO_FLOW_CREDITS_RC_MAX) { + if (flow->ipsaddr->verbs.remote_seq_outstanding) { + psm3_verbs_completion_update(proto->ep, 1); + if (flow->ipsaddr->verbs.remote_seq_outstanding) + return PSM2_EP_NO_RESOURCES; + } + + // NOTE: the remote_recv_psn is the actual received pkt psn + 1 (see ips_proto_is_expected_or_nak()) + // and the scb psn_num is the pkt we are going to send out. So we have below diff calculation + int diff = scb->seq_num.psn_num - flow->ipsaddr->verbs.remote_recv_psn; + + _HFI_VDBG("pkt psn=%d remote recv psn=%d diff=%d cc_count=%d\n", + scb->seq_num.psn_num, flow->ipsaddr->verbs.remote_recv_psn, diff, + flow->ipsaddr->verbs.cc_count); + if (diff < 0) + diff += proto->psn_mask + 1; + if (diff >= proto->max_credits || (flow->ipsaddr->verbs.cc_count && diff >= proto->min_credits)) { + psm3_verbs_get_remote_psn(proto->ep, flow->ipsaddr); + // cc_count is congestion control count. right now we use it to indicate whether is + // under congestion control. The count can potentially used in dynamic CC adjustment + // in the future + flow->ipsaddr->verbs.cc_count += 1; + return PSM2_EP_NO_RESOURCES; + } + + flow->ipsaddr->verbs.cc_count = 0; + } +#endif // allocate a send buffer // if we have no buffers, we can return PSM2_EP_NO_RESOURCES and caller // will try again later @@ -161,9 +229,17 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, } if_pf (! sbuf) { _HFI_VDBG("out of send buffers\n"); + // try to poll send completion and see if we can free some sbuf + psm3_verbs_completion_update(proto->ep, 1); return PSM2_EP_NO_RESOURCES; } _HFI_VDBG("got sbuf %p index %lu\n", sbuf_to_buffer(sbuf), send_buffer_index(sbuf_pool(ep, sbuf), sbuf_to_buffer(sbuf))); + + uint8_t is_reliable = USE_QP->qp_type == IBV_QPT_RC && scb == STAILQ_FIRST(&flow->scb_unacked); + if (is_reliable) { + // no explicit ack for RC because RC already has its own ack + ips_lrh->bth[2] &= __cpu_to_be32(~IPS_SEND_FLAG_ACKREQ); + } // TBD - we should be able to skip sending some headers such as OPA lrh and // perhaps bth (does PSM use bth to hold PSNs?) // copy scb->ips_lrh to send buffer @@ -171,7 +247,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, memcpy(sbuf_to_buffer(sbuf), ips_lrh, sizeof(*ips_lrh)); if (!send_dma) { // copy payload to send buffer, length could be zero, be safe -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (is_gpu_payload) { _HFI_VDBG("copy gpu payload %p %u\n", payload, length); PSM3_GPU_MEMCPY_DTOH(sbuf_to_buffer(sbuf) + sizeof(*ips_lrh), @@ -287,7 +363,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, psm3_ep_verbs_unalloc_sbuf(USE_ALLOCATOR, sbuf, prev_sbuf); ret = PSM2_EP_NO_RESOURCES; } - _HFI_VDBG("done ud_transfer_frame: len %u, remote qpn %u\n", + _HFI_VDBG("done spio_transfer_frame: len %u, remote qpn %u\n", list[0].length +list[1].length, #ifdef USE_RC (USE_QP->qp_type != IBV_QPT_UD)? flow->ipsaddr->verbs.remote_qpn : @@ -297,7 +373,7 @@ psm3_verbs_spio_transfer_frame(struct ips_proto *proto, struct ips_flow *flow, err = psm3_verbs_completion_update(proto->ep, 0); if_pf (err != PSM2_OK) return err; - return ret; + return is_reliable ? 
PSM2_RELIABLE_DATA_SENT : ret; #undef USE_ALLOCATOR #undef USE_QP #undef USE_MAX_INLINE diff --git a/prov/psm3/psm3/include/linux-i386/sysdep.h b/prov/psm3/psm3/include/linux-i386/sysdep.h index 3d5d944964b..f8e2046f8c6 100644 --- a/prov/psm3/psm3/include/linux-i386/sysdep.h +++ b/prov/psm3/psm3/include/linux-i386/sysdep.h @@ -56,34 +56,6 @@ #ifndef _HFI_i386_SYSDEP_H #define _HFI_i386_SYSDEP_H -typedef struct cpuid { - unsigned eax, ebx, ecx, edx; -} cpuid_t; - -static __inline__ void -get_cpuid(const unsigned func, const unsigned subfunc, cpuid_t *id) -{ - unsigned a, b, c, d; - - asm (" \ - mov %4, %%eax \n\ - mov %5, %%ecx \n\ - cpuid \n\ - mov %%eax, %0 \n\ - mov %%ebx, %1 \n\ - mov %%ecx, %2 \n\ - mov %%edx, %3 \n\ - " : "=g" (a), "=g" (b), "=g" (c), "=g" (d) - : "g" (func), "g" (subfunc) - : "%eax", "%ebx", "%ecx", "%edx" - ); - - id->eax = a; - id->ebx = b; - id->ecx = c; - id->edx = d; -} - static __inline__ uint64_t get_cycles(void) { uint64_t v; diff --git a/prov/psm3/psm3/include/utils_debug.h b/prov/psm3/psm3/include/utils_debug.h index b7b6655f2e6..aba4b020fb5 100644 --- a/prov/psm3/psm3/include/utils_debug.h +++ b/prov/psm3/psm3/include/utils_debug.h @@ -172,7 +172,7 @@ extern char psm3_mylabel[]; void psm3_set_mylabel(char *); extern FILE *psm3_dbgout; extern void psm3_dump_buf(uint8_t *buf, uint32_t len); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #endif @@ -268,7 +268,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_PDBG_ON unlikely(psm3_dbgmask & __HFI_PKTDBG) #define _HFI_PDBG_ALWAYS(fmt, ...) _HFI_DBG_ALWAYS(fmt, ##__VA_ARGS__) #define _HFI_PDBG_DUMP_ALWAYS(buf, len) psm3_dump_buf(buf, len) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define _HFI_PDBG_DUMP_GPU_ALWAYS(buf, len) psm3_dump_gpu_buf(buf, len) #endif @@ -321,7 +321,7 @@ extern void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len); #define _HFI_MMDBG_ON 0 #define _HFI_MMDBG_ALWAYS(fmt, ...) #define _HFI_PDBG_DUMP_ALWAYS(buf, len) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define _HFI_PDBG_DUMP_GPU_ALWAYS(buf, len) #endif #define _HFI_INFO_ON 0 diff --git a/prov/psm3/psm3/include/utils_env.h b/prov/psm3/psm3/include/utils_env.h index 770f04cc44a..9a83a8f2472 100644 --- a/prov/psm3/psm3/include/utils_env.h +++ b/prov/psm3/psm3/include/utils_env.h @@ -146,6 +146,11 @@ MOCKABLE(psm3_getenv)(const char *name, const char *descr, int level, union psmi_envvar_val *newval); MOCK_DCL_EPILOGUE(psm3_getenv); +// NOTE: This function writes the entire output union pointed to by newval. as a +// result, the backing storage for the pointer must be at least the size of the +// full union type, not simply the size of the type indicated by the type +// parameter. 
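
A short usage sketch of the contract described in the NOTE above: the out-parameter must be a full union psmi_envvar_val, never a narrower object, because the whole union is written regardless of the requested type. The variable name and bounds below are invented for illustration; the call shape mirrors the existing psm3_getenv_range() callers and assumes the PSM3 internal headers.

    #include "psm_user.h"   /* psm3_getenv_range() and union psmi_envvar_val */

    static uint32_t example_read_knob(void)
    {
            union psmi_envvar_val env_example;   /* full union, as the NOTE requires */

            psm3_getenv_range("PSM3_EXAMPLE_KNOB",        /* hypothetical variable */
                    "Example tunable (illustration only)", NULL,
                    PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT,
                    (union psmi_envvar_val)16,
                    (union psmi_envvar_val)1, (union psmi_envvar_val)UINT32_MAX,
                    NULL, NULL, &env_example);

            /* Passing the address of a bare uint32_t instead would be undersized
             * storage: the callee writes the whole union regardless of
             * PSMI_ENVVAR_TYPE_UINT, so wider members would land past the end
             * of the object. */
            return env_example.e_uint;
    }
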
+// int MOCKABLE(psm3_getenv_range)(const char *name, const char *descr, const char *help, unsigned level_flags, int type, union psmi_envvar_val defval, union psmi_envvar_val min, diff --git a/prov/psm3/psm3/include/utils_user.h b/prov/psm3/psm3/include/utils_user.h index e40800aedba..8e225d5e94f 100644 --- a/prov/psm3/psm3/include/utils_user.h +++ b/prov/psm3/psm3/include/utils_user.h @@ -159,14 +159,6 @@ static __inline__ uint32_t psm3_next_power2(uint64_t x) #define HFI_KHDR_TINYLEN_MASK 0xf #define HFI_KHDR_TINYLEN_SHIFT 16 - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -extern int is_driver_gpudirect_enabled; - -#define PSMI_IS_DRIVER_GPUDIRECT_ENABLED likely(is_driver_gpudirect_enabled) -#define PSMI_IS_DRIVER_GPUDIRECT_DISABLED unlikely(!is_driver_gpudirect_enabled) -#endif - /* hfi kdeth header format */ struct hfi_kdeth { __le32 kdeth0; @@ -268,4 +260,15 @@ static __inline__ uint64_t nanosecs_to_cycles(uint64_t ns) return (ns * 1000ULL) / psm3_pico_per_cycle; } +/* concatenate two symbols, giving the caller the opportunity to do macro + * expansion of either argument. in particular, this is required for CUDA, + * which #define-maps legacy functions to alternate versions (by appending + * _v2 suffixes). + * + * without this, macro authors will get different results depending on whether + * they immediately use a passed symbol in a concatenation (will not expand), + * or pass it to a nested macro (will expand). + */ +#define PSM3_CONCAT(a, b) a##b + #endif /* UTILS_USER_H */ diff --git a/prov/psm3/psm3/psm.c b/prov/psm3/psm3/psm.c index e46f868f054..06d0a7a11c5 100644 --- a/prov/psm3/psm3/psm.c +++ b/prov/psm3/psm3/psm.c @@ -53,7 +53,6 @@ /* Copyright (c) 2003-2016 Intel Corporation. All rights reserved. */ -#include #include #include "psm_user.h" #include "psm2_hal.h" @@ -101,509 +100,6 @@ char *psm3_affinity_shm_name; uint64_t *psm3_shared_affinity_ptr; uint64_t *psm3_shared_affinity_nic_refcount_ptr; -uint32_t psm3_cpu_model; - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -int is_gdr_copy_enabled; -uint32_t gdr_copy_limit_send; -uint32_t gdr_copy_limit_recv; -int is_gpudirect_enabled = 0; -int _device_support_gpudirect = -1; // -1 indicates "unset". See device_support_gpudirect(). -int is_driver_gpudirect_enabled; -uint32_t psm3_gpu_thresh_rndv = PSM3_GPU_THRESH_RNDV; -uint64_t psm3_gpu_cache_evict; // in bytes -#endif - -#ifdef PSM_CUDA -int is_cuda_enabled; -int my_gpu_device = 0; -int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported(). -int _device_support_unified_addr = -1; // -1 indicates "unchecked". See verify_device_support_unified_addr(). - -/* CUDA Driver Library */ -void *psmi_cuda_lib; -int cuda_lib_version; -/* CUDA Runtime (cudart) Library */ -void *psmi_cudart_lib; -int cuda_runtime_ver; -#endif - -#ifdef PSM_ONEAPI -int is_oneapi_ze_enabled; -int my_gpu_device = 0; -int _gpu_p2p_supported = -1; // -1 indicates "unset". see gpu_p2p_supported(). 
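
The PSM3_CONCAT() comment added to utils_user.h above is the classic two-level paste issue: '##' does not macro-expand its operands, so a macro that pastes its argument directly and one that routes the argument through PSM3_CONCAT() produce different tokens once a name has been #define-mapped to a _v2 variant, as CUDA does for some entry points. A tiny illustration; cuFoo, DIRECT_PASTE and EXPANDED_PASTE are invented for the example and are not CUDA or PSM3 names.

    #define cuFoo cuFoo_v2                        /* stand-in for a CUDA-style remap */

    #define DIRECT_PASTE(fn)   psmi_count_##fn              /* fn NOT expanded first */
    #define EXPANDED_PASTE(fn) PSM3_CONCAT(psmi_count_, fn) /* fn expanded first */

    /* DIRECT_PASTE(cuFoo)   expands to psmi_count_cuFoo
     * EXPANDED_PASTE(cuFoo) expands to psmi_count_cuFoo_v2 */
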
- -ze_context_handle_t ze_context = NULL; -ze_driver_handle_t ze_driver = NULL; -struct ze_dev_ctxt ze_devices[MAX_ZE_DEVICES]; -int num_ze_devices = 0; -struct ze_dev_ctxt *cur_ze_dev = NULL; - -/* ZE Loader(zel) And Runtime(ze) Library */ -void *psmi_oneapi_ze_lib; -ze_api_version_t zel_api_version = 0; -zel_version_t zel_lib_version = { }; -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -CUresult (*psmi_cuInit)(unsigned int Flags ); -CUresult (*psmi_cuCtxDetach)(CUcontext c); -CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); -CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); -CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); -CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); -CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); -CUresult (*psmi_cuDeviceGetCount)(int* count); -CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); -CUresult (*psmi_cuStreamDestroy)(CUstream phStream); -CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); -CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); -CUresult (*psmi_cuEventDestroy)(CUevent hEvent); -CUresult (*psmi_cuEventQuery)(CUevent hEvent); -CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); -CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); -CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); -CUresult (*psmi_cuMemFreeHost)(void* p); -CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); -CUresult (*psmi_cuMemHostUnregister)(void* p); -CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); -CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); -CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); -CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); -CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); -CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); -CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); -CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); -CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); -CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); -CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); -CUresult (*psmi_cuGetErrorString)(CUresult error, const char **pStr); -cudaError_t (*psmi_cudaRuntimeGetVersion)(int* runtimeVersion); - -uint64_t psmi_count_cuInit; -uint64_t psmi_count_cuCtxDetach; -uint64_t psmi_count_cuCtxGetCurrent; -uint64_t psmi_count_cuCtxSetCurrent; -uint64_t psmi_count_cuPointerGetAttribute; -uint64_t psmi_count_cuPointerSetAttribute; -uint64_t psmi_count_cuDeviceCanAccessPeer; -uint64_t psmi_count_cuDeviceGet; -uint64_t psmi_count_cuDeviceGetAttribute; -uint64_t 
psmi_count_cuDriverGetVersion; -uint64_t psmi_count_cuDeviceGetCount; -uint64_t psmi_count_cuStreamCreate; -uint64_t psmi_count_cuStreamDestroy; -uint64_t psmi_count_cuStreamSynchronize; -uint64_t psmi_count_cuEventCreate; -uint64_t psmi_count_cuEventDestroy; -uint64_t psmi_count_cuEventQuery; -uint64_t psmi_count_cuEventRecord; -uint64_t psmi_count_cuEventSynchronize; -uint64_t psmi_count_cuMemHostAlloc; -uint64_t psmi_count_cuMemFreeHost; -uint64_t psmi_count_cuMemHostRegister; -uint64_t psmi_count_cuMemHostUnregister; -uint64_t psmi_count_cuMemcpy; -uint64_t psmi_count_cuMemcpyDtoD; -uint64_t psmi_count_cuMemcpyDtoH; -uint64_t psmi_count_cuMemcpyHtoD; -uint64_t psmi_count_cuMemcpyDtoHAsync; -uint64_t psmi_count_cuMemcpyHtoDAsync; -uint64_t psmi_count_cuIpcGetMemHandle; -uint64_t psmi_count_cuIpcOpenMemHandle; -uint64_t psmi_count_cuIpcCloseMemHandle; -uint64_t psmi_count_cuMemGetAddressRange; -uint64_t psmi_count_cuDevicePrimaryCtxGetState; -uint64_t psmi_count_cuDevicePrimaryCtxRetain; -uint64_t psmi_count_cuCtxGetDevice; -uint64_t psmi_count_cuDevicePrimaryCtxRelease; -uint64_t psmi_count_cuGetErrorString; -uint64_t psmi_count_cudaRuntimeGetVersion; - -int psmi_cuda_lib_load() -{ - psm2_error_t err = PSM2_OK; - char *dlerr; - - PSM2_LOG_MSG("entering"); - _HFI_DBG("Loading CUDA library.\n"); - - psmi_cuda_lib = dlopen("libcuda.so.1", RTLD_LAZY); - if (!psmi_cuda_lib) { - dlerr = dlerror(); - _HFI_ERROR("Unable to open libcuda.so.1. Error %s\n", - dlerr ? dlerr : "no dlerror()"); - goto fail; - } - - psmi_cuDriverGetVersion = dlsym(psmi_cuda_lib, "cuDriverGetVersion"); - - if (!psmi_cuDriverGetVersion) { - _HFI_ERROR - ("Unable to resolve symbols in CUDA libraries.\n"); - goto fail; - } - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuGetErrorString);// for PSMI_CUDA_CALL - - PSMI_CUDA_CALL(cuDriverGetVersion, &cuda_lib_version); - if (cuda_lib_version < 7000) { - _HFI_ERROR("Please update CUDA driver, required minimum version is 7.0\n"); - goto fail; - } - - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuInit); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetCurrent); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxDetach); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxSetCurrent); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerGetAttribute); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuPointerSetAttribute); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceCanAccessPeer); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetAttribute); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGet); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDeviceGetCount); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamCreate); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamDestroy); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuStreamSynchronize); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventCreate); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventDestroy); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventQuery); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventRecord); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuEventSynchronize); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostAlloc); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemFreeHost); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostRegister); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemHostUnregister); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpy); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoD); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoH); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoD); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyDtoHAsync); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemcpyHtoDAsync); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcGetMemHandle); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuIpcOpenMemHandle); - PSMI_CUDA_DLSYM(psmi_cuda_lib, 
cuIpcCloseMemHandle); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuMemGetAddressRange); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxGetState); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRetain); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuDevicePrimaryCtxRelease); - PSMI_CUDA_DLSYM(psmi_cuda_lib, cuCtxGetDevice); - - /* CUDA Runtime */ - psmi_cudart_lib = dlopen("libcudart.so", RTLD_LAZY); - if (!psmi_cudart_lib) { - dlerr = dlerror(); - _HFI_ERROR("Unable to open libcudart.so. Error %s\n", - dlerr ? dlerr : "no dlerror()"); - goto fail; - } - PSMI_CUDA_DLSYM(psmi_cudart_lib, cudaRuntimeGetVersion); - - PSM2_LOG_MSG("leaving"); - return err; -fail: - if (psmi_cuda_lib) - dlclose(psmi_cuda_lib); - if (psmi_cudart_lib) - dlclose(psmi_cudart_lib); - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to load CUDA library.\n"); - return err; -} - -static void psmi_cuda_stats_register() -{ -#define PSMI_CUDA_COUNT_DECLU64(func) \ - PSMI_STATS_DECLU64(#func, NULL, &psmi_count_##func) - - struct psmi_stats_entry entries[] = { - PSMI_CUDA_COUNT_DECLU64(cuInit), - PSMI_CUDA_COUNT_DECLU64(cuCtxDetach), - PSMI_CUDA_COUNT_DECLU64(cuCtxGetCurrent), - PSMI_CUDA_COUNT_DECLU64(cuCtxSetCurrent), - PSMI_CUDA_COUNT_DECLU64(cuPointerGetAttribute), - PSMI_CUDA_COUNT_DECLU64(cuPointerSetAttribute), - PSMI_CUDA_COUNT_DECLU64(cuDeviceCanAccessPeer), - PSMI_CUDA_COUNT_DECLU64(cuDeviceGet), - PSMI_CUDA_COUNT_DECLU64(cuDeviceGetAttribute), - PSMI_CUDA_COUNT_DECLU64(cuDriverGetVersion), - PSMI_CUDA_COUNT_DECLU64(cuDeviceGetCount), - PSMI_CUDA_COUNT_DECLU64(cuStreamCreate), - PSMI_CUDA_COUNT_DECLU64(cuStreamDestroy), - PSMI_CUDA_COUNT_DECLU64(cuStreamSynchronize), - PSMI_CUDA_COUNT_DECLU64(cuEventCreate), - PSMI_CUDA_COUNT_DECLU64(cuEventDestroy), - PSMI_CUDA_COUNT_DECLU64(cuEventQuery), - PSMI_CUDA_COUNT_DECLU64(cuEventRecord), - PSMI_CUDA_COUNT_DECLU64(cuEventSynchronize), - PSMI_CUDA_COUNT_DECLU64(cuMemHostAlloc), - PSMI_CUDA_COUNT_DECLU64(cuMemFreeHost), - PSMI_CUDA_COUNT_DECLU64(cuMemHostRegister), - PSMI_CUDA_COUNT_DECLU64(cuMemHostUnregister), - PSMI_CUDA_COUNT_DECLU64(cuMemcpy), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoD), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoH), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyHtoD), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyDtoHAsync), - PSMI_CUDA_COUNT_DECLU64(cuMemcpyHtoDAsync), - PSMI_CUDA_COUNT_DECLU64(cuIpcGetMemHandle), - PSMI_CUDA_COUNT_DECLU64(cuIpcOpenMemHandle), - PSMI_CUDA_COUNT_DECLU64(cuIpcCloseMemHandle), - PSMI_CUDA_COUNT_DECLU64(cuMemGetAddressRange), - PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxGetState), - PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRetain), - PSMI_CUDA_COUNT_DECLU64(cuCtxGetDevice), - PSMI_CUDA_COUNT_DECLU64(cuDevicePrimaryCtxRelease), - PSMI_CUDA_COUNT_DECLU64(cuGetErrorString), - PSMI_CUDA_COUNT_DECLU64(cudaRuntimeGetVersion), - }; -#undef PSMI_CUDA_COUNT_DECLU64 - - psm3_stats_register_type("PSM_Cuda_call_statistics", - "Count of CUDA calls per API entry point for the whole process.\n" - "When using an NVIDIA GPU, PSM3 may call lower level CUDA " - "APIs to access or transfer application buffers in GPU memory.", - PSMI_STATSTYPE_GPU, - entries, PSMI_HOWMANY(entries), NULL, - &psmi_count_cuInit, NULL); /* context must != NULL */ -} -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI -ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); -ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, ze_driver_handle_t *phDrivers); -ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); -ze_result_t 
(*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); -#ifndef PSM3_NO_ONEAPI_IMPORT -ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); -ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); -ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); -#endif -ze_result_t (*psmi_zeContextCreate)(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); -ze_result_t (*psmi_zeContextDestroy)(ze_context_handle_t hContext); -ze_result_t (*psmi_zeCommandQueueCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); -ze_result_t (*psmi_zeCommandQueueDestroy)(ze_command_queue_handle_t hCommandQueue); -ze_result_t (*psmi_zeCommandQueueExecuteCommandLists)(ze_command_queue_handle_t hCommandQueue, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence); -ze_result_t (*psmi_zeCommandQueueSynchronize)(ze_command_queue_handle_t hCommandQueue, uint64_t timeout); -ze_result_t (*psmi_zeCommandListCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_list_desc_t *desc, ze_command_list_handle_t *phCommandList); -ze_result_t (*psmi_zeCommandListDestroy)(ze_command_list_handle_t hCommandList); -ze_result_t (*psmi_zeCommandListClose)(ze_command_list_handle_t hCommandList); -ze_result_t (*psmi_zeCommandListReset)(ze_command_list_handle_t hCommandList); -ze_result_t (*psmi_zeCommandListCreateImmediate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_queue_desc_t *desc, ze_command_list_handle_t *phCommandList); -ze_result_t (*psmi_zeCommandListAppendMemoryCopy)(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); -ze_result_t (*psmi_zeCommandListAppendSignalEvent)(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent); -ze_result_t (*psmi_zeDeviceCanAccessPeer)(ze_device_handle_t hDevice, ze_device_handle_t hPeerDevice, ze_bool_t *value); -ze_result_t (*psmi_zeDeviceGetCommandQueueGroupProperties)(ze_device_handle_t hDevice, uint32_t *pCount, ze_command_queue_group_properties_t *pCommandQueueGroupProperties); -ze_result_t (*psmi_zeMemAllocHost)(ze_context_handle_t hContext, const ze_host_mem_alloc_desc_t *host_desc, size_t size, size_t alignment, void **pptr); -ze_result_t (*psmi_zeMemAllocDevice)(ze_context_handle_t hContext, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t hDevice, void **pptr); -ze_result_t (*psmi_zeMemFree)(ze_context_handle_t hContext, void *ptr); -ze_result_t (*psmi_zeMemGetIpcHandle)(ze_context_handle_t hContext, const void *ptr, ze_ipc_mem_handle_t *pIpcHandle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -ze_result_t (*psmi_zeMemGetIpcHandleFromFileDescriptorExp)(ze_context_handle_t hContext, uint64_t handle, ze_ipc_mem_handle_t *pIpcHandle); -ze_result_t (*psmi_zeMemGetFileDescriptorFromIpcHandleExp)(ze_context_handle_t hContext, ze_ipc_mem_handle_t ipcHandle, uint64_t *pHandle); -ze_result_t (*psmi_zeMemPutIpcHandle)(ze_context_handle_t hContext, ze_ipc_mem_handle_t handle); -#endif -ze_result_t (*psmi_zeMemOpenIpcHandle)(ze_context_handle_t hContext,ze_device_handle_t hDevice, 
ze_ipc_mem_handle_t handle, ze_ipc_memory_flags_t flags, void **pptr); -ze_result_t (*psmi_zeMemCloseIpcHandle)(ze_context_handle_t hContext, const void *ptr); -ze_result_t (*psmi_zeMemGetAddressRange)(ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); -ze_result_t (*psmi_zeMemGetAllocProperties)(ze_context_handle_t hContext, const void *ptr, ze_memory_allocation_properties_t *pMemAllocProperties, ze_device_handle_t *phDevice); -ze_result_t (*psmi_zeEventPoolCreate)(ze_context_handle_t hContext, const ze_event_pool_desc_t *desc, uint32_t numDevices, ze_device_handle_t *phDevices, ze_event_pool_handle_t *phEventPool); -ze_result_t (*psmi_zeEventPoolDestroy)(ze_event_pool_handle_t hEventPool); -ze_result_t (*psmi_zeEventCreate)(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent); -ze_result_t (*psmi_zeEventDestroy)(ze_event_handle_t hEvent); -ze_result_t (*psmi_zeEventQueryStatus)(ze_event_handle_t hEvent); -ze_result_t (*psmi_zeEventHostSynchronize)(ze_event_handle_t hEvent, uint64_t timeout); -ze_result_t (*psmi_zeEventHostReset)(ze_event_handle_t hEvent); -ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, zel_component_version_t *versions); - -uint64_t psmi_count_zeInit; -uint64_t psmi_count_zeDriverGet; -uint64_t psmi_count_zeDeviceGet; -uint64_t psmi_count_zeDevicePciGetPropertiesExt; -#ifndef PSM3_NO_ONEAPI_IMPORT -uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; -uint64_t psmi_count_zexDriverImportExternalPointer; -uint64_t psmi_count_zexDriverReleaseImportedPointer; -#endif -uint64_t psmi_count_zeContextCreate; -uint64_t psmi_count_zeContextDestroy; -uint64_t psmi_count_zeCommandQueueCreate; -uint64_t psmi_count_zeCommandQueueDestroy; -uint64_t psmi_count_zeCommandQueueExecuteCommandLists; -uint64_t psmi_count_zeCommandQueueSynchronize; -uint64_t psmi_count_zeCommandListCreate; -uint64_t psmi_count_zeCommandListDestroy; -uint64_t psmi_count_zeCommandListClose; -uint64_t psmi_count_zeCommandListReset; -uint64_t psmi_count_zeCommandListCreateImmediate; -uint64_t psmi_count_zeCommandListAppendMemoryCopy; -uint64_t psmi_count_zeCommandListAppendSignalEvent; -uint64_t psmi_count_zeDeviceCanAccessPeer; -uint64_t psmi_count_zeDeviceGetCommandQueueGroupProperties; -uint64_t psmi_count_zeMemAllocHost; -uint64_t psmi_count_zeMemAllocDevice; -uint64_t psmi_count_zeMemFree; -uint64_t psmi_count_zeMemGetIpcHandle; -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -uint64_t psmi_count_zeMemGetIpcHandleFromFileDescriptorExp; -uint64_t psmi_count_zeMemGetFileDescriptorFromIpcHandleExp; -uint64_t psmi_count_zeMemPutIpcHandle; -#endif -uint64_t psmi_count_zeMemOpenIpcHandle; -uint64_t psmi_count_zeMemCloseIpcHandle; -uint64_t psmi_count_zeMemGetAddressRange; -uint64_t psmi_count_zeMemGetAllocProperties; -uint64_t psmi_count_zeEventPoolCreate; -uint64_t psmi_count_zeEventPoolDestroy; -uint64_t psmi_count_zeEventCreate; -uint64_t psmi_count_zeEventDestroy; -uint64_t psmi_count_zeEventQueryStatus; -uint64_t psmi_count_zeEventHostSynchronize; -uint64_t psmi_count_zeEventHostReset; -uint64_t psmi_count_zelLoaderGetVersions; - -int psmi_oneapi_ze_load() -{ - psm2_error_t err = PSM2_OK; - char *dlerr; - - PSM2_LOG_MSG("entering"); - _HFI_VDBG("Loading OneAPI Level Zero library.\n"); - - psmi_oneapi_ze_lib = dlopen("libze_loader.so", RTLD_LAZY); - if (!psmi_oneapi_ze_lib) { - dlerr = dlerror(); - _HFI_ERROR( - "Unable to open libze_loader.so. Error %s\n", - dlerr ? 
dlerr : "no dlerror()"); - goto fail; - } - - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeInit); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGet); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGet); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDevicePciGetPropertiesExt); -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDriverGetExtensionFunctionAddress); -#endif - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeContextCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeContextDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueExecuteCommandLists); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandQueueSynchronize); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListClose); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListReset); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListCreateImmediate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListAppendMemoryCopy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeCommandListAppendSignalEvent); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceCanAccessPeer); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeDeviceGetCommandQueueGroupProperties); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemAllocHost); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemAllocDevice); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemFree); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetIpcHandle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetIpcHandleFromFileDescriptorExp); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetFileDescriptorFromIpcHandleExp); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemPutIpcHandle); -#endif - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemOpenIpcHandle); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemCloseIpcHandle); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetAddressRange); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeMemGetAllocProperties); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventPoolCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventPoolDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventCreate); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventDestroy); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventQueryStatus); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventHostSynchronize); - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zeEventHostReset); - - /* ze loader API */ - PSMI_ONEAPI_ZE_DLSYM(psmi_oneapi_ze_lib, zelLoaderGetVersions); - - PSM2_LOG_MSG("leaving"); - return err; -fail: - if (psmi_oneapi_ze_lib) - dlclose(psmi_oneapi_ze_lib); - err = psm3_handle_error(PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Unable to load OneAPI Level Zero library.\n"); - return err; -} - -static void psmi_oneapi_ze_stats_register() -{ -#define PSMI_ONEAPI_ZE_COUNT_DECLU64(func) \ - PSMI_STATS_DECLU64(#func, NULL, &psmi_count_##func) - - struct psmi_stats_entry ze_entries[] = { - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeInit), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGet), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGet), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDevicePciGetPropertiesExt), -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDriverGetExtensionFunctionAddress), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverImportExternalPointer), - 
PSMI_ONEAPI_ZE_COUNT_DECLU64(zexDriverReleaseImportedPointer), -#endif - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeContextCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeContextDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueExecuteCommandLists), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandQueueSynchronize), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListClose), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListReset), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListCreateImmediate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendMemoryCopy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeCommandListAppendSignalEvent), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceCanAccessPeer), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeDeviceGetCommandQueueGroupProperties), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocHost), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemAllocDevice), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemFree), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetIpcHandle), -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetIpcHandleFromFileDescriptorExp), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetFileDescriptorFromIpcHandleExp), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemPutIpcHandle), -#endif - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemOpenIpcHandle), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemCloseIpcHandle), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAddressRange), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeMemGetAllocProperties), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventPoolDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventCreate), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventDestroy), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventQueryStatus), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventHostSynchronize), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zeEventHostReset), - PSMI_ONEAPI_ZE_COUNT_DECLU64(zelLoaderGetVersions) - }; -#undef PSMI_ONEAPI_ZE_COUNT_DECLU64 - - psm3_stats_register_type("PSM_OneAPI_ZE_call_statistics", - "Count of OneAPI Level Zero calls per API entry point for the whole process.\n" - "When using an Intel(r) GPU, PSM3 may call Level Zero " - "APIs to access or transfer application buffers in GPU memory.", - PSMI_STATSTYPE_GPU, - ze_entries, PSMI_HOWMANY(ze_entries), NULL, - &psmi_count_zeInit, NULL); /* context must != NULL */ -} -#endif // PSM_ONEAPI - /* * Bit field that contains capability set. * Each bit represents different capability. @@ -639,410 +135,6 @@ int MOCKABLE(psm3_isinitialized)() } MOCK_DEF_EPILOGUE(psm3_isinitialized); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -static void psmi_gpu_init(void) -{ - int ret; - - union psmi_envvar_val env_enable_gdr_copy; - psm3_getenv("PSM3_GDRCOPY", - "Enable (set envvar to 1) for gdr copy support in PSM (Enabled by default)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env_enable_gdr_copy); - is_gdr_copy_enabled = env_enable_gdr_copy.e_int; - - union psmi_envvar_val env_gpu_thresh_rndv; - ret = psm3_getenv_range("PSM3_GPU_THRESH_RNDV", - "RNDV protocol is used for GPU send message sizes greater than the threshold", - NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)psm3_gpu_thresh_rndv, - (union psmi_envvar_val)0, (union psmi_envvar_val)UINT32_MAX, - NULL, NULL, &env_gpu_thresh_rndv); - if (ret > 0) - /* - * For backward compatibility, check if the old variable name is set. 
- * Priority order: New name > old name > default value. - */ - psm3_getenv("PSM3_CUDA_THRESH_RNDV", - "[Deprecated, use PSM3_GPU_THRESH_RNDV]" - " RNDV protocol is used for GPU send message sizes greater than the threshold", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)psm3_gpu_thresh_rndv, - &env_gpu_thresh_rndv); - - psm3_gpu_thresh_rndv = env_gpu_thresh_rndv.e_uint; - - - union psmi_envvar_val env_gdr_copy_limit_send; - psm3_getenv("PSM3_GDRCOPY_LIMIT_SEND", - "GDR Copy is turned off on the send side" - " for message sizes greater than the limit" -#ifndef OPA - " or larger than 1 MTU\n", -#else - "\n", -#endif - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)GDR_COPY_LIMIT_SEND, &env_gdr_copy_limit_send); - gdr_copy_limit_send = env_gdr_copy_limit_send.e_int; - - if (gdr_copy_limit_send < 8 || gdr_copy_limit_send > psm3_gpu_thresh_rndv) - gdr_copy_limit_send = max(GDR_COPY_LIMIT_SEND, psm3_gpu_thresh_rndv); - - union psmi_envvar_val env_gdr_copy_limit_recv; - psm3_getenv("PSM3_GDRCOPY_LIMIT_RECV", - "GDR Copy is turned off on the recv side" - " for message sizes greater than the limit\n", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)GDR_COPY_LIMIT_RECV, &env_gdr_copy_limit_recv); - gdr_copy_limit_recv = env_gdr_copy_limit_recv.e_int; - - if (gdr_copy_limit_recv < 8) - gdr_copy_limit_recv = GDR_COPY_LIMIT_RECV; - - if (!is_gdr_copy_enabled) - gdr_copy_limit_send = gdr_copy_limit_recv = 0; -} -#endif /* PSM_CUDA || PSM_ONEAPI */ - -#ifdef PSM_CUDA -int psmi_cuda_initialize() -{ - psm2_error_t err = PSM2_OK; - - PSM2_LOG_MSG("entering"); - _HFI_DBG("Enabling CUDA support.\n"); - - psmi_cuda_stats_register(); - - err = psmi_cuda_lib_load(); - if (err != PSM2_OK) - goto fail; - - PSMI_CUDA_CALL(cuInit, 0); - - PSMI_CUDA_CALL(cudaRuntimeGetVersion, &cuda_runtime_ver); - -#ifdef PSM_HAVE_RNDV_MOD - psm2_get_gpu_bars(); -#endif - - psmi_gpu_init(); - - PSM2_LOG_MSG("leaving"); - return err; -fail: - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, "Unable to initialize PSM3 CUDA support.\n"); - return err; -} -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -static void psmi_oneapi_find_copy_only_engine(ze_device_handle_t dev, - struct ze_dev_ctxt *ctxt) -{ - uint32_t count = 0; - ze_command_queue_group_properties_t *props = NULL; - int i; - int done = 0; - - /* Set the default */ - ctxt->ordinal = 0; - ctxt->index = 0; - ctxt->num_queues = 1; - PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, - &count, NULL); - props = psmi_calloc(PSMI_EP_NONE, UNDEFINED, count, sizeof(*props)); - if (!props) { - _HFI_ERROR("Failed to allocate mem for CmdQ Grp\n"); - return; - } - PSMI_ONEAPI_ZE_CALL(zeDeviceGetCommandQueueGroupProperties, dev, - &count, props); - - // pick the last command queue group which supports copy but not compute. - // For PVC this will be the xeLink copy engine which will also - // have numQueues >1 (TBD - perhaps only select if it has numQueues>1). - // This ordinal is then supplied to create Command Queues and Command Lists. - for (i = count - 1; i >= 0; i--) { - _HFI_DBG("GPU Queue Group %d: copy=%d Compute=%d num_queues=%d\n", i, - (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) != 0, - (props[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) != 0, - (int)props[i].numQueues); - if (! 
done && (props[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY) && - !(props[i].flags & - ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { - ctxt->ordinal = i; - ctxt->num_queues = props[i].numQueues; - done = 1; - if (_HFI_DBG_ON) { - _HFI_DBG_ALWAYS("Selected GPU copy engine %d\n", i); - } else { - break; - } - } - } - psmi_free(props); -} - -// create command queue for use in psmi_oneapi_ze_memcpy for sync memcpy -static void psmi_oneapi_cmd_create(ze_device_handle_t dev, struct ze_dev_ctxt *ctxt) -{ - ze_command_queue_desc_t ze_cq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .flags = 0, - //.mode set below - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - }; - - psmi_oneapi_find_copy_only_engine(dev, ctxt); - ze_cq_desc.ordinal = ctxt->ordinal; - ze_cq_desc.index = ctxt->index; - - if (psm3_oneapi_immed_sync_copy) { - ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, ze_context, - dev, &ze_cq_desc, &ctxt->cl); - } else { - ze_command_list_desc_t ze_cl_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .flags = 0 - }; - ze_cq_desc.mode = ZE_COMMAND_QUEUE_MODE_DEFAULT; - - PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, ze_context, - dev, &ze_cq_desc, &ctxt->cq); - - ze_cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, ze_context, - dev, &ze_cl_desc, &ctxt->cl); - } - ctxt->dev = dev; - - if (psm3_oneapi_parallel_dtod_copy_thresh < UINT_MAX) { - // create resources for dual copy mechanism - ze_event_pool_desc_t pool_desc = { - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, - .count = 2 - }; - ze_event_desc_t event_desc = { - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, - .signal = ZE_EVENT_SCOPE_FLAG_HOST, - .wait = ZE_EVENT_SCOPE_FLAG_HOST, - }; - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, - ze_context, &pool_desc, 0, NULL, &ctxt->event_pool); - - event_desc.index = 0; - PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, - &ctxt->copy_status0); - - event_desc.index = 1; - PSMI_ONEAPI_ZE_CALL(zeEventCreate, ctxt->event_pool, &event_desc, - &ctxt->copy_status1); - - psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq0, - &ctxt->async_cl0); - psmi_oneapi_async_cmd_create(ctxt, &ctxt->async_cq1, - &ctxt->async_cl1); - } -} - -void psmi_oneapi_cmd_create_all(void) -{ - int i; - struct ze_dev_ctxt *ctxt; - ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; - - if (!ze_context) - PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, - &ze_context); - - for (i = 0; i < num_ze_devices; i++) { - ctxt = &ze_devices[i]; - - if (!ctxt->cl) { - psmi_oneapi_cmd_create(ctxt->dev, ctxt); - _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", - i, ctxt->dev); - } - } - if (num_ze_devices > 0) - cur_ze_dev = &ze_devices[0]; -} - -void psmi_oneapi_cmd_destroy_all(void) -{ - int i; - struct ze_dev_ctxt *ctxt; - - for (i = 0; i < num_ze_devices; i++) { - ctxt = &ze_devices[i]; - - if (ctxt->async_cl1 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl1); - ctxt->async_cl1 = NULL; - } - if (ctxt->async_cq1 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq1); - ctxt->async_cq1 = NULL; - } - if (ctxt->async_cl0 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->async_cl0); - ctxt->async_cl0 = NULL; - } - if (ctxt->async_cq0 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->async_cq0); - ctxt->async_cq0 = NULL; - } - if (ctxt->copy_status1 != NULL) { - 
PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status1); - ctxt->copy_status1 = NULL; - } - if (ctxt->copy_status0 != NULL) { - PSMI_ONEAPI_ZE_CALL(zeEventDestroy, ctxt->copy_status0); - ctxt->copy_status0 = NULL; - } - if (ctxt->event_pool != NULL) { - PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, ctxt->event_pool); - ctxt->event_pool = NULL; - } - if (ctxt->cl) { - PSMI_ONEAPI_ZE_CALL(zeCommandListDestroy, ctxt->cl); - ctxt->cl = NULL; - } - if (ctxt->cq) { - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, ctxt->cq); - ctxt->cq = NULL; - } - } - cur_ze_dev = NULL; - - /* Also destroy ze_context */ - if (ze_context) { - PSMI_ONEAPI_ZE_CALL(zeContextDestroy, ze_context); - ze_context = NULL; - } -} - -int psmi_oneapi_ze_initialize() -{ - psm2_error_t err = PSM2_OK; - uint32_t ze_driver_count = 1; - uint32_t ze_device_count = 0; - ze_device_handle_t devices[MAX_ZE_DEVICES]; - zel_component_version_t *zel_comps = NULL; - size_t num_zel_comps; - int i; - union psmi_envvar_val env; - - PSM2_LOG_MSG("entering"); - _HFI_DBG("Init Level Zero library.\n"); - - psmi_oneapi_ze_stats_register(); - err = psmi_oneapi_ze_load(); - if (err != PSM2_OK) - goto fail; - - psm3_getenv("PSM3_ONEAPI_IMMED_SYNC_COPY", - "Use Immediate CommandList for synchronous copy to/from GPU]", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env); - psm3_oneapi_immed_sync_copy = env.e_int; - - psm3_getenv("PSM3_ONEAPI_IMMED_ASYNC_COPY", - "Use Immediate CommandList for asynchronous pipeline copy to/from GPU]", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env); - psm3_oneapi_immed_async_copy = env.e_int; - - psm3_getenv("PSM3_ONEAPI_PARALLEL_DTOD_COPY_THRESH", - "Use parallel CommandLists for GPU to GPU copy larger than threshold", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)(256*1024-1), &env); - // no benefit below 128K-1, plus the copy is spilt at a 64K boundary - psm3_oneapi_parallel_dtod_copy_thresh = max(128*1024-1, env.e_uint); - - - PSMI_ONEAPI_ZE_CALL(zeInit, ZE_INIT_FLAG_GPU_ONLY); - - /* Need to query count before alloc array */ - PSMI_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, NULL); - if (num_zel_comps > 0) { - zel_comps = (zel_component_version_t *)psmi_calloc( - PSMI_EP_NONE, UNDEFINED, sizeof(zel_component_version_t), - num_zel_comps); - PSMI_ONEAPI_ZE_CALL(zelLoaderGetVersions, &num_zel_comps, zel_comps); - - /* Loop looking for "loader" name */ - for (i = 0; i < num_zel_comps; i++) { - if (!strncmp(zel_comps[i].component_name, "loader", sizeof("loader"))){ - zel_lib_version = zel_comps[i].component_lib_version; - zel_api_version = zel_comps[i].spec_version; - break; - } - } - psmi_free(zel_comps); - if (i == num_zel_comps) { - _HFI_DBG("WARNING: 'loader' not found among the %zd components reported" - " by zelLoaderGetVersions, unable to report Level-Zero version", - num_zel_comps); - } - } else { - _HFI_DBG("WARNING: no components reported by zelLoaderGetVersions," - " unable to report Level-Zero version"); - } - - PSMI_ONEAPI_ZE_CALL(zeDriverGet, &ze_driver_count, &ze_driver); -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, ze_driver, "zexDriverImportExternalPointer", (void **)&psmi_zexDriverImportExternalPointer); - PSMI_ONEAPI_ZE_CALL(zeDriverGetExtensionFunctionAddress, ze_driver, "zexDriverReleaseImportedPointer", (void **)&psmi_zexDriverReleaseImportedPointer); -#endif - - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &ze_device_count, NULL); - if (ze_device_count > 
MAX_ZE_DEVICES) - ze_device_count = MAX_ZE_DEVICES; - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &ze_device_count, devices); - - ze_context_desc_t ctxtDesc = { ZE_STRUCTURE_TYPE_CONTEXT_DESC, NULL, 0 }; - PSMI_ONEAPI_ZE_CALL(zeContextCreate, ze_driver, &ctxtDesc, &ze_context); - _HFI_DBG("ze_driver %p %u devices first device %p ze_context %p\n", - ze_driver, ze_device_count, devices[0], ze_context); - - for (i = 0; i < ze_device_count; i++) { - ze_devices[i].dev_index = i; - psmi_oneapi_cmd_create(devices[i], &ze_devices[i]); - _HFI_DBG("Initialized cmd queues for ze_device[%d] %p\n", - i, ze_devices[i].dev); - } - - num_ze_devices = ze_device_count; - if (num_ze_devices > 0) - cur_ze_dev = &ze_devices[0]; - - err = psmi_oneapi_putqueue_alloc(); - if (err != PSM2_OK) - goto fail; - - psmi_gpu_init(); - -#ifndef PSM_HAVE_PIDFD - psm3_num_ze_dev_fds = 0; -#endif - - PSM2_LOG_MSG("leaving"); - return err; -fail: - err = psm3_handle_error(PSMI_EP_NORETURN, - PSM2_INTERNAL_ERR, - "Unable to initialize PSM3 OneAPI Level Zero support.\n"); - return err; -} -#endif // PSM_ONEAPI - static void psmi_free_subnets(void) @@ -1260,29 +352,6 @@ psm2_error_t psm3_init(int *major, int *minor) psm3_verno_client_val = min(PSMI_VERNO_MAKE(*major, *minor), psm3_verno); - /* Check to see if we need to set Architecture flags to something - * besides big core Xeons */ - cpuid_t id; - psm3_cpu_model = CPUID_MODEL_UNDEFINED; - - /* First check to ensure Genuine Intel */ - get_cpuid(0x0, 0, &id); - if(id.ebx == CPUID_GENUINE_INTEL_EBX - && id.ecx == CPUID_GENUINE_INTEL_ECX - && id.edx == CPUID_GENUINE_INTEL_EDX) - { - /* Use cpuid with EAX=1 to get processor info */ - get_cpuid(0x1, 0, &id); - psm3_cpu_model = CPUID_GENUINE_INTEL; - } - - if( (psm3_cpu_model == CPUID_GENUINE_INTEL) && - (id.eax & CPUID_FAMILY_MASK) == CPUID_FAMILY_XEON) - { - psm3_cpu_model = ((id.eax & CPUID_MODEL_MASK) >> 4) | - ((id.eax & CPUID_EXMODEL_MASK) >> 12); - } - psmi_refcount++; /* psm3_dbgmask lives in libhfi.so */ psm3_getenv("PSM3_TRACEMASK", @@ -1450,90 +519,16 @@ psm2_error_t psm3_init(int *major, int *minor) if (psm3_device_is_enabled(devid_enabled, PTL_DEVID_AMSH)) { if (psm3_dsa_init()) { err = PSM2_INTERNAL_ERR; - goto fail_hal; + goto fail_dsa; } } #endif -#ifdef PSM_CUDA - union psmi_envvar_val env_enable_cuda; - psm3_getenv("PSM3_CUDA", - "Enable (set envvar to 1) for cuda support in PSM (Disabled by default)", - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, &env_enable_cuda); - // order important, always parse gpudirect - is_cuda_enabled = psmi_parse_gpudirect() || env_enable_cuda.e_int; - - if (PSMI_IS_GPU_ENABLED) { - err = psmi_cuda_initialize(); - if (err != PSM2_OK) -#ifdef PSM_DSA - goto fail_undsa; -#else - goto fail_hal; -#endif - } -#else /* PSM_CUDA */ - /* PSM3_CUDA is not allowed for this build, so we check it's - * presence but don't want to use psm3_getenv since we don't - * want it to appear in PSM3_VERBOSE_ENV help text - */ - int enable_cuda = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_CUDA"), &enable_cuda, - INT_MIN, INT_MAX) == -2 - || enable_cuda) { - _HFI_INFO("WARNING: PSM built without CUDA enabled, PSM3_CUDA unavailable\n"); +#ifdef PSM_HAVE_GPU + if ( (err = PSM3_GPU_INITIALIZE()) != PSM2_OK) { + goto fail_gpu; } -#endif /* PSM_CUDA */ - -#ifdef PSM_ONEAPI - union psmi_envvar_val env_enable_oneapi; - psm3_getenv("PSM3_ONEAPI_ZE", - "Enable (set envvar to 1) for OneAPI Level Zero (ZE) support in PSM (Disabled by default)", - PSMI_ENVVAR_LEVEL_USER, 
PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, &env_enable_oneapi); - is_oneapi_ze_enabled = psmi_parse_gpudirect() || env_enable_oneapi.e_int; - - if (PSMI_IS_GPU_ENABLED) { - err = psmi_oneapi_ze_initialize(); - if (err != PSM2_OK) { -#ifdef PSM_DSA - goto fail_undsa; -#else - goto fail_hal; #endif - } - } -#else /* PSM_ONEAPI */ - /* PSM3_ONEAPI_ZE is not allowed for this build, so we check it's - * presence but don't want to use psm3_getenv since we don't - * want it to appear in PSM3_VERBOSE_ENV help text - */ - int enable_oneapi = 0; - if (psm3_parse_str_int(psm3_env_get("PSM3_ONEAPI_ZE"), &enable_oneapi, - INT_MIN, INT_MAX) == -2 - || enable_oneapi) { - _HFI_INFO("WARNING: PSM built without ONEAPI_ZE enabled, PSM3_ONEAPI_ZE unavailable\n"); - } -#endif /* PSM_ONEAPI */ - -#if !defined(PSM_CUDA) && ! defined(PSM_ONEAPI) - /* PSM3_GPUDIRECT is not allowed for this build, so we check it's - * presence but don't want to use psm3_getenv since we don't - * want it to appear in PSM3_VERBOSE_ENV help text - * Note we check here, rather than in ips_proto_init, because - * PSM3_GPUDIERECT can enable GPU for ptl_am (shm) as well as ips, - * so if a user attempted a non-GPU build single node run with - * PSM3_GPUDIRECT=1 and expected GPU handling in shm, they would not - * get the behavior they expected - */ - unsigned int gpudirect = 0; - if (psm3_parse_str_uint(psm3_env_get("PSM3_GPUDIRECT"), &gpudirect, - 0, UINT_MAX) == -2 - || gpudirect) { - _HFI_INFO("WARNING: PSM built with neither ONEAPI_ZE nor CUDA enabled, PSM3_GPUDIRECT unavailable\n"); - } -#endif /* !defined(PSM_CUDA) && ! defined(PSM_ONEAPI) */ update: *major = (int)psm3_verno_major; @@ -1544,15 +539,17 @@ psm2_error_t psm3_init(int *major, int *minor) PSM2_LOG_MSG("leaving"); return err; -#if defined(PSM_DSA) && (defined(PSM_CUDA) || defined(PSM_ONEAPI)) -fail_undsa: +#ifdef PSM_HAVE_GPU +fail_gpu: +#if defined(PSM_DSA) psm3_dsa_fini(); #endif -#if defined(PSM_DSA) || defined(PSM_CUDA) || defined(PSM_ONEAPI) -fail_hal: +#endif +#if defined(PSM_DSA) +fail_dsa: +#endif psm3_hwloc_topology_destroy(); // always safe to call psm3_hal_finalize(); -#endif fail_epid: psm3_epid_fini(); fail_unref: @@ -1611,6 +608,8 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, 2, /* PSM2_INFO_QUERY_PORT_SPEED */ 0, /* PSM2_INFO_QUERY_NUM_ADDR_PER_UNIT */ 4, /* PSM2_INFO_QUERY_UNIT_ADDR_NAME */ + 0, /* PSM2_INFO_QUERY_GPU_THRESH_RNDV */ + 0, /* PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT */ }; psm2_error_t rv = PSM2_INTERNAL_ERR; @@ -1637,15 +636,7 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, rv = PSM2_OK; break; case PSM2_INFO_QUERY_FEATURE_MASK: - { -#ifdef PSM_CUDA - *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_CUDA; -#elif defined(PSM_ONEAPI) - *((uint32_t*)out) = PSM2_INFO_QUERY_FEATURE_ONEAPI; -#else - *((uint32_t*)out) = 0; -#endif /* PSM_CUDA */ - } + *((uint32_t*)out) = PSM3_GPU_QUERY_FEATURE_MASK(); rv = PSM2_OK; break; case PSM2_INFO_QUERY_UNIT_NAME: @@ -1776,6 +767,18 @@ psm2_error_t psm3_info_query(psm2_info_query_t q, void *out, rv = PSM2_OK; } break; + case PSM2_INFO_QUERY_GPU_THRESH_RNDV: +#ifdef PSM_HAVE_GPU + *((uint32_t*)out) = psm3_gpu_thresh_rndv; + rv = PSM2_OK; +#endif + break; + case PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT: +#ifdef PSM_HAVE_GPU + *((uint32_t*)out) = psm3_gpu_mq_rndv_shm_gpu_thresh_default; + rv = PSM2_OK; +#endif + break; default: return PSM2_IQ_INVALID_QUERY; } @@ -1921,24 +924,8 @@ psm2_error_t psm3_finalize(void) psm3_hwloc_topology_destroy(); // always safe to 
call psm3_hal_finalize(); -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &psmi_count_cuInit); -#elif defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { - psm3_stats_deregister_type(PSMI_STATSTYPE_GPU, &psmi_count_zeInit); - /* - * Trying to destroy command list, queue, and context will result in - * segfaults here. - */ - /*psmi_oneapi_putqueue_free(); - psmi_oneapi_cmd_destroy(); - if (ze_context) { - PSMI_ONEAPI_ZE_CALL(zeContextDestroy, ze_context); - ze_context = NULL; - } */ - } -#endif // PSM_CUDA + + PSM3_GPU_FINALIZE(); psmi_refcount = PSMI_FINALIZED; PSM2_LOG_MSG("leaving"); diff --git a/prov/psm3/psm3/psm2.h b/prov/psm3/psm3/psm2.h index cadb561dbd4..ce007280daf 100644 --- a/prov/psm3/psm3/psm2.h +++ b/prov/psm3/psm3/psm2.h @@ -314,8 +314,8 @@ enum psm2_error { /*! PSM2 is finalized */ PSM2_IS_FINALIZED = 13, - /*! TCP data send is successful */ - PSM2_TCP_DATA_SENT = 14, + /*! data was sent reliably */ + PSM2_RELIABLE_DATA_SENT = 14, /*! Endpoint was closed */ PSM2_EP_WAS_CLOSED = 20, @@ -1325,6 +1325,15 @@ void *psm3_epaddr_getctxt(psm2_epaddr_t epaddr); * option value: Context associated with PSM2 endpoint address. */ +/* PSM2 endpoint CUDA_PERMITTED flag */ +#define PSM2_CORE_OPT_EP_CUDA_PERMITTED 0x103 + /**< [@b uint32_t ] Set/Get the CUDA_PERMITTED flag associated with a PSM2 + * endpoint (psm2_ep_t). + * + * component object: PSM2 endpoint (@ref psm2_ep_t). + * option value: Boolean flag. + */ + /* PSM2_COMPONENT_IB options */ /* Default service level to use to communicate with remote endpoints */ #define PSM2_IB_OPT_DF_SL 0x201 @@ -1717,6 +1726,14 @@ typedef enum psm2_info_query_et Output parameter: char*, description: name of the device's address. */ PSM2_INFO_QUERY_UNIT_ADDR_NAME, +/*! Required input arguments 0 + Output parameter: uint32_t*, description: configured PSM3_GPU_THRESH_RNDV */ + PSM2_INFO_QUERY_GPU_THRESH_RNDV, + +/*! Required input arguments 0 + Output parameter: uint32_t*, description: default for PSM3_MQ_RNDV_SHM_GPU_THRESH */ + PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT, + PSM2_INFO_QUERY_LAST, /* must appear last, and the info query constants are used as an index. */ } psm2_info_query_t; @@ -1772,14 +1789,14 @@ psm2_error_t psm3_info_query(psm2_info_query_t, void *out, * Used to support interrupt driven progress with CPU release when * >1 process per core * - * @param[in] int timeout timeout in milliseconds. <0 is infinite timeout + * @param[in] int timeout_ms timeout in milliseconds. <0 is infinite timeout * * @returns PSM2_OK if wait completed and some progress may have been made * @returns PSM2_TIMEOUT if wait timeout exceeded with no progress made * @returns PSM2_INTERNAL_ERR if wait mode not allowed for given HAL * @returns PSM2_PARAM_ERR if not allowed for use with current PSM settings/mode */ -psm2_error_t psm3_wait(int timeout); +psm2_error_t psm3_wait(int timeout_ms); /** @brief PSM2 env initialization * @@ -1905,6 +1922,7 @@ int psm3_getenv_str(const char *name, const char *descr, int visible, * @param[in] unint32_t parameter copy length */ void psm3_memcpy(void *dest, const void *src, uint32_t len); +void psm3_ep_memcpy(psm2_ep_t ep, void *dest, const void *src, uint32_t len); /*! 
@} */ diff --git a/prov/psm3/psm3/psm2_hal.c b/prov/psm3/psm3/psm2_hal.c index 31a1cf67ecf..7b17fe757ba 100644 --- a/prov/psm3/psm3/psm2_hal.c +++ b/prov/psm3/psm3/psm2_hal.c @@ -97,7 +97,7 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_mq_init_defaults); REJECT_IMPROPER_HI(hfp_ep_open_opts_get_defaults); REJECT_IMPROPER_HI(hfp_context_initstats); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU REJECT_IMPROPER_HI(hfp_gdr_open); #endif @@ -147,10 +147,10 @@ void psm3_hal_register_instance(psmi_hal_instance_t *psm_hi) REJECT_IMPROPER_HI(hfp_ips_ibta_init); REJECT_IMPROPER_HI(hfp_ips_path_rec_init); REJECT_IMPROPER_HI(hfp_ips_ptl_pollintr); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU REJECT_IMPROPER_HI(hfp_gdr_close); REJECT_IMPROPER_HI(hfp_gdr_convert_gpu_to_host_addr); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ REJECT_IMPROPER_HI(hfp_get_port_index2pkey); REJECT_IMPROPER_HI(hfp_poll_type); diff --git a/prov/psm3/psm3/psm2_hal.h b/prov/psm3/psm3/psm2_hal.h index 91d187dcd56..a1f83899ad4 100644 --- a/prov/psm3/psm3/psm2_hal.h +++ b/prov/psm3/psm3/psm2_hal.h @@ -253,16 +253,6 @@ typedef struct _psmi_hal_params char **unit_driver; } psmi_hal_params_t; - -#define PSM_HAL_ALG_ACROSS 0 -#define PSM_HAL_ALG_WITHIN 1 -#define PSM_HAL_ALG_ACROSS_ALL 2 -#define PSM_HAL_ALG_CPU_CENTRIC 3 -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -#define PSM_HAL_ALG_GPU_CENTRIC 4 -#endif - - typedef enum { PSMI_HAL_POLL_TYPE_NONE = 0, PSMI_HAL_POLL_TYPE_URGENT = 1, @@ -314,7 +304,7 @@ struct _psmi_hal_instance /* Initialize PSM3_PRINT_STATS stats for given ep */ void (*hfp_context_initstats)(psm2_ep_t ep); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void (*hfp_gdr_open)(void); #endif @@ -414,12 +404,12 @@ struct _psmi_hal_instance int next_timeout, uint64_t *pollok, uint64_t *pollcyc, uint64_t *pollintr); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Direct GPU Copy */ void (*hfp_gdr_close)(void); void* (*hfp_gdr_convert_gpu_to_host_addr)(unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ /* Given an open context and index, return an error, or the * corresponding pkey for the index as programmed by the SM */ /* Returns an int, so -1 indicates an error. */ @@ -432,7 +422,7 @@ struct _psmi_hal_instance uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); @@ -441,7 +431,7 @@ struct _psmi_hal_instance uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); @@ -556,7 +546,7 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_mq_init_defaults(...) PSMI_HAL_DISPATCH_FUNC(mq_init_defaults,__VA_ARGS__) #define psmi_hal_ep_open_opts_get_defaults(...) PSMI_HAL_DISPATCH_FUNC(ep_open_opts_get_defaults,__VA_ARGS__) #define psmi_hal_context_initstats(...) PSMI_HAL_DISPATCH_FUNC(context_initstats,__VA_ARGS__) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define psmi_hal_gdr_open(...) PSMI_HAL_DISPATCH_FUNC(gdr_open,__VA_ARGS__) #endif #define psmi_hal_finalize_(...) 
PSMI_HAL_DISPATCH_FUNC(finalize_,__VA_ARGS__) @@ -603,10 +593,10 @@ int psm3_hal_pre_init_cache_func(enum psmi_hal_pre_init_cache_func_krnls k, ...) #define psmi_hal_ips_ibta_init(...) PSMI_HAL_DISPATCH(ips_ibta_init,__VA_ARGS__) #define psmi_hal_ips_path_rec_init(...) PSMI_HAL_DISPATCH(ips_path_rec_init,__VA_ARGS__) #define psmi_hal_ips_ptl_pollintr(...) PSMI_HAL_DISPATCH(ips_ptl_pollintr,__VA_ARGS__) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define psmi_hal_gdr_close(...) PSMI_HAL_DISPATCH(gdr_close,__VA_ARGS__) #define psmi_hal_gdr_convert_gpu_to_host_addr(...) PSMI_HAL_DISPATCH(gdr_convert_gpu_to_host_addr,__VA_ARGS__) -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #define psmi_hal_get_port_index2pkey(...) PSMI_HAL_DISPATCH(get_port_index2pkey,__VA_ARGS__) #define psmi_hal_poll_type(...) PSMI_HAL_DISPATCH(poll_type,__VA_ARGS__) diff --git a/prov/psm3/psm3/psm2_hal_inline_t.h b/prov/psm3/psm3/psm2_hal_inline_t.h index 68e7276f425..ad86a97e9b9 100644 --- a/prov/psm3/psm3/psm2_hal_inline_t.h +++ b/prov/psm3/psm3/psm2_hal_inline_t.h @@ -125,13 +125,13 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(ips_ptl_pollintr) (psm2_ep_t ep, struct ips_recvhdrq *recvq, int fd_pipe, int next_timeout, uint64_t *pollok, uint64_t *pollcyc, uint64_t *pollintr); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static PSMI_HAL_INLINE void PSMI_HAL_CAT_INL_SYM(gdr_close) (void); static PSMI_HAL_INLINE void* PSMI_HAL_CAT_INL_SYM(gdr_convert_gpu_to_host_addr) (unsigned long buf, size_t size, int flags, psm2_ep_t ep); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(get_port_index2pkey) (psm2_ep_t ep, int index); static PSMI_HAL_INLINE int PSMI_HAL_CAT_INL_SYM(poll_type) @@ -143,7 +143,7 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(spio_transfer_frame) uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); @@ -153,7 +153,7 @@ static PSMI_HAL_INLINE psm2_error_t PSMI_HAL_CAT_INL_SYM(transfer_frame) uint32_t *payload, uint32_t length, uint32_t isCtrlMsg, uint32_t cksum_valid, uint32_t cksum -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint32_t is_gpu_payload #endif ); diff --git a/prov/psm3/psm3/psm2_hal_loopback.c b/prov/psm3/psm3/psm2_hal_loopback.c index 6789ad18f59..5612a95fd90 100644 --- a/prov/psm3/psm3/psm2_hal_loopback.c +++ b/prov/psm3/psm3/psm2_hal_loopback.c @@ -200,7 +200,7 @@ static int psm3_hfp_loopback_get_port_lid(int unit, int port, int addr_index) static void psm3_hfp_loopback_mq_init_defaults(struct psm2_mq *mq) { mq->ips_cpu_window_rv_str = NULL; // no rendezvous -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mq->ips_gpu_window_rv_str = NULL; // no rendezvous #endif mq->rndv_nic_thresh = (~(uint32_t)0); // disable rendezvous @@ -222,11 +222,11 @@ static int psm3_hfp_loopback_get_default_pkey(void) return 0x8001; // not used (only used in ptl_ips), pick a safe value } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psm3_hfp_loopback_gdr_open(void) { /* disable GPU Direct copy, no driver to help us */ - is_gdr_copy_enabled = gdr_copy_limit_send = gdr_copy_limit_recv = 0; + psm3_gpu_is_gdr_copy_enabled = psm3_gpu_gdr_copy_limit_send = psm3_gpu_gdr_copy_limit_recv = 0; } #endif @@ -257,7 +257,7 @@ hfp_loopback_t psm3_loopback_hi = { 
.hfp_mq_init_defaults = psm3_hfp_loopback_mq_init_defaults, .hfp_ep_open_opts_get_defaults = psm3_hfp_loopback_ep_open_opts_get_defaults, .hfp_context_initstats = NULL, // ptl_ips only -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_open = psm3_hfp_loopback_gdr_open, #endif @@ -302,10 +302,10 @@ hfp_loopback_t psm3_loopback_hi = { .hfp_ips_ibta_init = NULL, .hfp_ips_path_rec_init = NULL, .hfp_ips_ptl_pollintr = NULL, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU .hfp_gdr_close = NULL, .hfp_gdr_convert_gpu_to_host_addr = NULL, -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ .hfp_get_port_index2pkey = NULL, .hfp_poll_type = NULL, .hfp_spio_transfer_frame = NULL, diff --git a/prov/psm3/psm3/psm2_mq.h b/prov/psm3/psm3/psm2_mq.h index 517b4802d5b..3f7f128e3ad 100644 --- a/prov/psm3/psm3/psm2_mq.h +++ b/prov/psm3/psm3/psm2_mq.h @@ -1639,7 +1639,7 @@ struct psm2_mq_stats { #else uint64_t dsa_stats[DSA_STATS_SZ*2]; /* same size as dsa_stats[2] */ #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /** maximum handles allowed in cache **/ uint64_t gpu_ipc_cache_limit; /** current handles in cache **/ @@ -1656,9 +1656,9 @@ struct psm2_mq_stats { uint64_t gpu_ipc_cache_remove; /** cache cleared due to error opening new Ipc Handle **/ uint64_t gpu_ipc_cache_clear; -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ uint64_t _reserved_gpu[8]; -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ /** sysbufs are used for unexpected eager receive (and RTS payload) */ /** Number of messages using system buffers (not used for 0 byte msg) */ @@ -1669,7 +1669,7 @@ struct psm2_mq_stats { /** rank in MPI_COMM_WORLD, while unchanging, easiest to put here */ uint64_t comm_world_rank; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /** Messages transmitted eagerly from CPU buffer */ uint64_t tx_eager_cpu_num; /** Bytes transmitted eagerly from CPU buffer */ diff --git a/prov/psm3/psm3/psm_config.h b/prov/psm3/psm3/psm_config.h index 9bd59690005..0c391320df7 100644 --- a/prov/psm3/psm3/psm_config.h +++ b/prov/psm3/psm3/psm_config.h @@ -81,6 +81,11 @@ /* #define INTEL_GPU_DIRECT */ #endif +// define here so pxmx3 and psm_user.h can use this define +#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#define PSM_HAVE_GPU +#endif + #ifndef PSM3_BRAKE_DEBUG /* #define PSM3_BRAKE_DEBUG */ #endif @@ -164,32 +169,23 @@ #endif // PSM_CUDA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define GPU_WINDOW_PREFETCH_DEFAULT 2 #define GPU_SMALLHOSTBUF_SZ (256*1024) #define GPU_PAGE_OFFSET_MASK (PSMI_GPU_PAGESIZE -1) #define GPU_PAGE_MASK ~GPU_PAGE_OFFSET_MASK -/* All GPU transfers beyond this threshold use - * RNDV protocol. It is mostly a send side knob. 
- */ -#define PSM3_GPU_THRESH_RNDV 8000 #define GPUDIRECT_THRESH_RV 3 #define GDR_COPY_LIMIT_SEND 128 #define GDR_COPY_LIMIT_RECV 64000 -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #define PSM_MQ_NIC_MAX_TINY 8 /* max TINY payload allowed */ #define PSM3_MQ_RNDV_NIC_THRESH 64000 #define PSM_CPU_NIC_RNDV_WINDOW_STR "131072" -#ifdef PSM_CUDA -#define PSM_GPU_NIC_RNDV_WINDOW_STR "2097152" -#elif defined(PSM_ONEAPI) -#define PSM_GPU_NIC_RNDV_WINDOW_STR "131072:524287,262144:1048575,524288" -#endif #define PSM3_MQ_RNDV_NIC_WINDOW_MAX (4 * 1024 * 1024) /* max rndv window */ /* @@ -197,14 +193,6 @@ */ #define PSM3_MQ_RNDV_SHM_THRESH 16000 -#if defined(PSM_CUDA) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSM3_MQ_RNDV_SHM_GPU_THRESH 63 -#elif defined(PSM_ONEAPI) -/* Threshold for GPU rendezvous (aka scale-up transfer vs via CPU shared mem */ -#define PSM3_MQ_RNDV_SHM_GPU_THRESH 127 -#endif - // LEARN_HASH_SELECTOR has PSM3 dynamically learn the combinations // of src_addr presence and tagsel used by a given middleware. This // allows PSM3 to self-optimize for use with varied middleware uses @@ -245,7 +233,11 @@ #define PSMI_DEVICES_DEFAULT "self,shm,nic" /* Lock */ +#if defined(__x86_64__) || defined(__i386__) #define PSMI_USE_PTHREAD_SPINLOCKS 0 +#else /* non-Intel arch */ +#define PSMI_USE_PTHREAD_SPINLOCKS 1 +#endif /* Utils */ #define PSMI_EPID_TABSIZE_CHUNK 128 diff --git a/prov/psm3/psm3/psm_context.c b/prov/psm3/psm3/psm_context.c index 678b394d71e..e82a701e74a 100644 --- a/prov/psm3/psm3/psm_context.c +++ b/prov/psm3/psm3/psm_context.c @@ -110,18 +110,12 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psm2_uuid_t const job_key, uint16_t network_pkey, int64_t timeout_ns) { - long open_timeout = 0, unit_start, unit_end, unit_id, unit_id_prev; + long open_timeout = 0; psm2_error_t err = PSM2_OK; - int nunits = psmi_hal_get_num_units(), nunitsactive=0; + int nunits = psmi_hal_get_num_units(); union psmi_envvar_val env_rcvthread; static int norcvthread; /* only for first rail */ - /* - * If shared contexts are enabled, try our best to schedule processes - * across one or many devices - */ - - /* if no units, then no joy. */ if (nunits <= 0) { err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, @@ -129,75 +123,41 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde goto ret; } - /* Calculate the number of active units: */ - for (unit_id=0;unit_id < nunits;unit_id++) - { - if (psmi_hal_get_unit_active(unit_id) > 0) - nunitsactive++; - } - /* if no active units, then no joy. */ - if (nunitsactive == 0) - { - err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 no nic units are active"); - goto ret; - } if (timeout_ns > 0) open_timeout = (long)(timeout_ns / MSEC_ULL); - - unit_start = 0; unit_end = nunits - 1; - err = psm3_compute_start_and_end_unit(unit_param, addr_index, - nunitsactive, nunits, job_key, - &unit_start, &unit_end); - if (err != PSM2_OK) - goto ret; - - /* Loop from unit_start to unit_end inclusive and pick 1st active found - * As needed wrap, so it's valid for unit_start >= unit_end - */ - int success = 0; - unit_id_prev = unit_id = unit_start; - do - { - /* if the unit_id is not active, go to next one. 
*/ - if (psmi_hal_get_unit_active(unit_id) <= 0) { - unit_id_prev = unit_id; - unit_id = (unit_id + 1) % nunits; - continue; + if (unit_param == PSM3_NIC_ANY) { + /* user did not set PSM3_NIC and not PSM3_MULTIRAIL */ + unit_param = psm3_autoselect_one(addr_index, nunits, job_key); + if (unit_param < 0) { + err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, + "PSM3 no nic units are active"); + goto ret; } + } else { + _HFI_DBG("Caller selected NIC %ld.\n", unit_param); + psmi_assert(unit_param >= 0); // caller checked valid + } - /* open this unit. */ - if (psmi_hal_context_open(unit_id, port, - psmi_hash_addr_index(unit_id, port, addr_index), - open_timeout, - ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { - // in modes where we refcount NIC use, - // psm3_compute_start_and_end_unit will have returned exactly - // 1 NIC and refcount'ed it, so we dec refcount here - psm3_dec_nic_refcount(unit_id); - /* go to next unit if failed to open. */ - unit_id_prev = unit_id; - unit_id = (unit_id + 1) % nunits; - continue; - } - // HAL context_open has initialized: - // ep->unit_id, ep->portnum, ep->addr_index, - // ep->dev_name, ep->subnet, ep->addr, ep->gid, ep->wiremode, - // ep->epid and - // HAL specific ep fields (context, verbs_ep or sockets_ep) - psmi_assert_always(! psm3_epid_zero_internal(ep->epid)); - success = 1; - break; - - } while (unit_id_prev != unit_end); - - if (!success) - { + /* open this unit. */ + if (psmi_hal_get_unit_active(unit_param) <= 0 + || psmi_hal_context_open(unit_param, port, + psmi_hash_addr_index(unit_param, port, addr_index), + open_timeout, + ep, job_key, HAL_CONTEXT_OPEN_RETRY_MAX)) { + // in modes where we refcount NIC use, + // psm3_autoselect_one refcount'ed it, so we dec refcount here + psm3_dec_nic_refcount(unit_param); err = psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open nic unit: %ld",unit_param); + "PSM3 can't open nic unit: %ld",unit_param); goto bail; } + // HAL context_open has initialized: + // ep->unit_id, ep->portnum, ep->addr_index, + // ep->dev_name, ep->subnet, ep->addr, ep->gid, ep->wiremode, + // ep->epid and + // HAL specific ep fields (context, verbs_ep or sockets_ep) + psmi_assert_always(! 
psm3_epid_zero_internal(ep->epid)); _HFI_VDBG("hal_context_open() passed.\n"); @@ -233,7 +193,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde "with RCVTHREAD on"); #endif } - _HFI_PRDBG("Opened unit %ld port %ld: EPID=%s %s\n", unit_id, port, + _HFI_PRDBG("Opened unit %ld port %ld: EPID=%s %s\n", unit_param, port, psm3_epid_fmt_internal(ep->epid, 0), psm3_epid_fmt_addr(ep->epid, 1)); goto ret; @@ -242,7 +202,7 @@ psm3_context_open(const psm2_ep_t ep, long unit_param, long port, long addr_inde psmi_hal_close_context(ep); psm3_dec_nic_refcount(ep->unit_id); bail: - _HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_id, err, strerror(errno)); + _HFI_PRDBG("open failed: unit_id: %ld, err: %d (%s)\n", unit_param, err, strerror(errno)); ret: _HFI_VDBG("psm3_context_open() return %d\n", err); diff --git a/prov/psm3/psm3/psm_ep.c b/prov/psm3/psm3/psm_ep.c index 86dfa9a88d0..055d8ca11d2 100644 --- a/prov/psm3/psm3/psm_ep.c +++ b/prov/psm3/psm3/psm_ep.c @@ -678,9 +678,9 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, ep->hfi_num_send_rdma = 0; #endif #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ep->rv_gpu_cache_size = 0; -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_HAVE_RNDV_MOD */ /* See how many iterations we want to spin before yielding */ @@ -747,10 +747,7 @@ psm3_ep_open_internal(psm2_uuid_t const unique_job_key, int *devid_enabled, if (! mq->ep) // only call on 1st EP within MQ psm3_mq_initstats(mq, ep->epid); -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - verify_device_support_unified_addr(); -#endif + PSM3_GPU_VERIFY_CAPABILITIES(); _HFI_VDBG("start ptl device init...\n"); if (psm3_ep_device_is_enabled(ep, PTL_DEVID_SELF)) { @@ -827,15 +824,7 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, return PSM2_TOO_MANY_ENDPOINTS; } -#if defined(PSM_ONEAPI) - /* Make sure ze_context and command queue/list are available. - * They could be destroyed when there is no more endpoints. - * If another endpoint is created after that, the code here can - * recreate the context, command queue and list. - */ - if (PSMI_IS_GPU_ENABLED && !cur_ze_dev) - psmi_oneapi_cmd_create_all(); -#endif //PSM_ONEAPI + PSM3_GPU_EP_OPEN(); /* Matched Queue initialization. We do this early because we have to * make sure ep->mq exists and is valid before calling ips_do_work. 
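
The PSM3_GPU_INITIALIZE(), PSM3_GPU_VERIFY_CAPABILITIES(), PSM3_GPU_EP_OPEN() and PSM3_GPU_EP_CLOSE() calls introduced above replace the per-vendor PSM_CUDA/PSM_ONEAPI blocks with a single GPU HAL entry point. The sketch below shows one plausible shape for that dispatch layer; the struct and variable names (psm3_gpu_hal_ops, psm3_gpu_hal) are illustrative assumptions only, not the provider's actual definitions, which live in psm3/gpu/psm_gpu_hal.h and are not part of this excerpt.

    /* sketch only: a vtable selected at psm3_init() time (CUDA, Level Zero,
     * or none), so call sites need no PSM_CUDA/PSM_ONEAPI conditionals */
    #include "psm2.h"    /* psm2_error_t, PSM2_OK */

    struct psm3_gpu_hal_ops {
        psm2_error_t (*initialize)(void);
        void (*finalize)(void);
        void (*ep_open)(void);
        void (*ep_close)(void);
        void (*verify_capabilities)(void);
    };

    /* NULL (or a no-op table) when no GPU stack is enabled */
    extern struct psm3_gpu_hal_ops *psm3_gpu_hal;

    #define PSM3_GPU_INITIALIZE() \
        (psm3_gpu_hal ? psm3_gpu_hal->initialize() : PSM2_OK)
    #define PSM3_GPU_EP_OPEN() \
        do { if (psm3_gpu_hal) psm3_gpu_hal->ep_open(); } while (0)
    #define PSM3_GPU_EP_CLOSE() \
        do { if (psm3_gpu_hal) psm3_gpu_hal->ep_close(); } while (0)
    #define PSM3_GPU_VERIFY_CAPABILITIES() \
        do { if (psm3_gpu_hal) psm3_gpu_hal->verify_capabilities(); } while (0)

With this shape, the ep_open/ep_close paths shown in the hunks above stay identical for CUDA, Level Zero, and non-GPU builds; only the table behind psm3_gpu_hal differs.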
@@ -869,11 +858,12 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, opts.addr_index = multirail_config.addr_indexes[0]; } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // if HAL doesn't support GDR Copy, it may disable Gdr Copy - // by zeroing is_gdr_copy_enabled, gdr_copy_limit_send, and - // gdr_copy_limit_recv during gdr_open - if (PSMI_IS_GDR_COPY_ENABLED) + // by zeroing psm3_gpu_is_gdr_copy_enabled, + // psm3_gpu_gdr_copy_limit_send, and + // psm3_gpu_gdr_copy_limit_recv during gdr_open + if (PSM3_GPU_IS_GDR_COPY_ENABLED) psmi_hal_gdr_open(); #endif @@ -982,12 +972,8 @@ psm3_ep_open(psm2_uuid_t const unique_job_key, fail: fflush(stdout); PSMI_UNLOCK(psm3_creation_lock); -#if defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && psm3_opened_endpoint_count == 0) { - psmi_oneapi_putqueue_free(); - psmi_oneapi_cmd_destroy_all(); - } -#endif //PSM_ONEAPI + if (psm3_opened_endpoint_count == 0) + PSM3_GPU_EP_CLOSE(); PSM2_LOG_MSG("leaving"); return err; } @@ -1005,14 +991,14 @@ psm2_error_t psm3_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* * The close on the gdr fd needs to be called before the * close on the hfi fd as the the gdr device will hold * reference count on the hfi device which will make the close * on the hfi fd return without actually closing the fd. */ - if (PSMI_IS_GDR_COPY_ENABLED) + if (PSM3_GPU_IS_GDR_COPY_ENABLED) psmi_hal_gdr_close(); #endif union psmi_envvar_val timeout_intval; @@ -1202,17 +1188,8 @@ psm2_error_t psm3_ep_close(psm2_ep_t ep, int mode, int64_t timeout_in) (double)cycles_to_nanosecs(get_cycles() - t_start) / SEC_ULL); } -#if defined(PSM_ONEAPI) - /* - * It would be ideal to destroy the global command list, queue, and - * context in psm3_finalize(). Unfortunately, it will cause segfaults - * in Level-zero library. - */ - if (PSMI_IS_GPU_ENABLED && psm3_opened_endpoint_count == 0) { - psmi_oneapi_putqueue_free(); - psmi_oneapi_cmd_destroy_all(); - } -#endif //PSM_ONEAPI + if (psm3_opened_endpoint_count == 0) + PSM3_GPU_EP_CLOSE(); PSM2_LOG_MSG("leaving"); return err; } @@ -1376,7 +1353,7 @@ int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid) } #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // used for GdrCopy // given an ep this returns the "next one". @@ -1517,5 +1494,5 @@ int64_t psm3_gpu_evict_some(psm2_ep_t ep, uint64_t length, int access) } return evicted; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #endif /* PSM_HAVE_RNDV_MOD */ diff --git a/prov/psm3/psm3/psm_ep.h b/prov/psm3/psm3/psm_ep.h index f8376331e32..76b05b6fb80 100644 --- a/prov/psm3/psm3/psm_ep.h +++ b/prov/psm3/psm3/psm_ep.h @@ -96,6 +96,11 @@ node->mctxt_next = node->mctxt_prev = node; \ node->mctxt_master = NULL +#define PSM_EP_FOR_EACH_MCTXT(root, iter) \ + for ( struct psm2_ep *iter = (root)->mctxt_master \ + ; iter \ + ; iter = iter->mctxt_next == iter->mctxt_master ? 
NULL : iter->mctxt_next) + struct psm2_ep { psm2_epid_t epid; /**> This endpoint's Endpoint ID */ psm2_epaddr_t epaddr; /**> This ep's ep address */ @@ -108,6 +113,9 @@ struct psm2_ep { struct psm3_sockets_ep sockets_ep; #endif }; +#ifdef PSM_HAVE_GPU + union psm2_ep_gpu_specific gpu_specific; +#endif /* unit_id and portnum are set to 0 when ptl_ips not enabled */ int unit_id; @@ -136,7 +144,7 @@ struct psm2_ep { #ifdef PSM_HAVE_RNDV_MOD psm3_rv_t rv; // rendezvous module open handle uint32_t rv_mr_cache_size; /** PSM3_RV_MR_CACHE_SIZE */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t rv_gpu_cache_size; /** PSM3_RV_GPU_CACHE_SIZE */ #endif #endif /* PSM_HAVE_RNDV_MOD */ @@ -144,14 +152,6 @@ struct psm2_ep { uint32_t hfi_num_descriptors;/** Number of allocated scb descriptors*/ #ifdef PSM_HAVE_REG_MR uint32_t hfi_num_send_rdma;/** Number of concurrent RDMA*/ -#endif -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - // TBD - move to ptl_am - int ze_ipc_socket; // AF_UNIX listener sock to recv GPU Dev FDs - char *listen_sockname; // /dev/shm filename for ze_ipc_socket - int need_dev_fds_poll; // are there outstanding dev_fds to be polled -#endif #endif uint8_t wiremode; /* EPID protocol specific basic modes * For RoCE/IB reflects @@ -275,7 +275,7 @@ struct psm2_epaddr { int psm3_ep_device_is_enabled(const psm2_ep_t ep, int devid); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern int64_t psm3_gpu_evict_some(psm2_ep_t ep, uint64_t length, int access); #endif #endif diff --git a/prov/psm3/psm3/psm_help.h b/prov/psm3/psm3/psm_help.h index a3908ba2563..5c738b95e2d 100644 --- a/prov/psm3/psm3/psm_help.h +++ b/prov/psm3/psm3/psm_help.h @@ -150,6 +150,12 @@ (((uint32_t)(val)) & (~((uint32_t)(align)-1))) #endif +/* round down 64-bit value to align, align must be a power of 2 */ +#ifndef ROUNDDOWN64P2 +#define ROUNDDOWN64P2(val, align) \ + (((uint64_t)(val)) & (~((uint64_t)(align)-1))) +#endif + /* round down value to align, align can be any value, less efficient than ROUNDDOWNP2 */ #ifndef ROUNDDOWN #define ROUNDDOWN(val, align) \ @@ -165,12 +171,20 @@ /* how many entries are in a statically allocated table */ #define PSMI_HOWMANY(table) (sizeof(table)/sizeof(table[0])) - +// cycles (e.g. 
rdtsc) to time conversions #define SEC_ULL 1000000000ULL #define MSEC_ULL 1000000ULL #define USEC_ULL 1000ULL #define NSEC_ULL 1ULL +// time units conversions +#define NSEC_PER_SEC 1000000000 +#define NSEC_PER_MSEC 1000000 +#define NSEC_PER_USEC 1000 +#define USEC_PER_SEC 1000000 +#define USEC_PER_MSEC 1000 +#define MSEC_PER_SEC 1000 + #define PSMI_TRUE 1 #define PSMI_FALSE 0 diff --git a/prov/psm3/psm3/psm_mpool.c b/prov/psm3/psm3/psm_mpool.c index 6bf33b7d74a..6472752f5b7 100644 --- a/prov/psm3/psm3/psm_mpool.c +++ b/prov/psm3/psm3/psm_mpool.c @@ -99,7 +99,7 @@ struct mpool { non_empty_callback_fn_t mp_non_empty_cb; void *mp_non_empty_cb_context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU alloc_dealloc_callback_fn_t mp_alloc_dealloc_cb; void *mp_alloc_dealloc_cb_context; #endif @@ -232,7 +232,7 @@ MOCKABLE(psm3_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, } MOCK_DEF_EPILOGUE(psm3_mpool_create); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mpool_t psm3_mpool_create_for_gpu(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, @@ -259,7 +259,7 @@ psm3_mpool_create_for_gpu(size_t obj_size, uint32_t num_obj_per_chunk, return mp; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ /** * psm3_mpool_get() @@ -413,7 +413,7 @@ void *psm3_mpool_find_obj_by_index(mpool_t mp, int index) return (void *)((uintptr_t) me + sizeof(struct mpool_element)); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /** * psmi_mpool_chunk_dealloc() * memory pool @@ -430,7 +430,7 @@ void psmi_mpool_chunk_dealloc(mpool_t mp, int idx) j * mp->mp_elm_size + sizeof(struct mpool_element))); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ /** * psm3_mpool_destroy() @@ -447,7 +447,7 @@ void psm3_mpool_destroy(mpool_t mp) for (i = 0; i < mp->mp_elm_vector_size; i++) { if (mp->mp_elm_vector[i]) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (mp->mp_alloc_dealloc_cb) psmi_mpool_chunk_dealloc(mp, i); #endif @@ -494,7 +494,7 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) if (num_to_allocate == 0) return PSM2_NO_MEMORY; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (mp->mp_alloc_dealloc_cb) chunk = psmi_calloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate, mp->mp_elm_size); @@ -504,7 +504,7 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) #else chunk = psmi_malloc(PSMI_EP_NONE, mp->mp_memtype, num_to_allocate * mp->mp_elm_size); -#endif /* PSM_CUDA || PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ if (chunk == NULL) { fprintf(stderr, "Failed to allocate memory for memory pool chunk: %s\n", @@ -513,13 +513,13 @@ static int psmi_mpool_allocate_chunk(mpool_t mp) } for (i = 0; i < num_to_allocate; i++) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (mp->mp_alloc_dealloc_cb) mp->mp_alloc_dealloc_cb(1 /* is alloc */, mp->mp_alloc_dealloc_cb_context, (void *)((uintptr_t)chunk + i * mp->mp_elm_size + sizeof(struct mpool_element))); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ elm = (struct mpool_element *)((uintptr_t) chunk + i * mp->mp_elm_size + mp->mp_elm_offset); diff --git a/prov/psm3/psm3/psm_mpool.h b/prov/psm3/psm3/psm_mpool.h index 81655e81dc1..69038fff930 100644 --- a/prov/psm3/psm3/psm_mpool.h +++ b/prov/psm3/psm3/psm_mpool.h @@ -80,7 +80,7 @@ MOCKABLE(psm3_mpool_create)(size_t obj_size, uint32_t num_obj_per_chunk, non_empty_callback_fn_t cb, void *context); MOCK_DCL_EPILOGUE(psm3_mpool_create); 
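
The psm_mpool.c hunks above route every element of a GPU-enabled pool through mp_alloc_dealloc_cb when a chunk is allocated or torn down. A caller-side sketch of that hook follows; the callback shape (is_alloc flag, opaque context, pointer to the element payload) is inferred from the calls visible in psmi_mpool_allocate_chunk() and psmi_mpool_chunk_dealloc(), while the body and the void return type are illustrative assumptions; the real typedef (alloc_dealloc_callback_fn_t) is in psm_mpool.h, outside this excerpt.

    /* sketch: a per-element hook passed to psm3_mpool_create_for_gpu(),
     * typically used to register/unregister the element's bounce buffer
     * with the GPU runtime so later copies can take the fast path */
    static void example_gpu_elem_cb(int is_alloc, void *context, void *obj)
    {
        (void)context;
        if (is_alloc) {
            /* called once per element right after the chunk is
             * allocated; obj points just past struct mpool_element */
        } else {
            /* called per element from psmi_mpool_chunk_dealloc()
             * before the chunk memory is freed */
        }
        (void)obj;
    }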
-#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mpool_t psm3_mpool_create_for_gpu(size_t obj_size, uint32_t num_obj_per_chunk, uint32_t num_obj_max_total, int flags, psmi_memtype_t statstype, diff --git a/prov/psm3/psm3/psm_mq.c b/prov/psm3/psm3/psm_mq.c index 4248ff7d28d..e617e44ab52 100644 --- a/prov/psm3/psm3/psm_mq.c +++ b/prov/psm3/psm3/psm_mq.c @@ -968,12 +968,12 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) if (req->req_data.buf != NULL) { /* 0-byte messages don't alloc a sysbuf */ msglen = mq_set_msglen(req, len, req->req_data.send_msglen); psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, #endif buf, len, msglen); psm3_mq_sysbuf_free(mq, req->req_data.buf); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } else { mq->stats.rx_sysbuf_cpu_num++; #endif @@ -990,7 +990,7 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) */ req->recv_msgoff = min(req->recv_msgoff, msglen); psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, #endif buf, len, req->recv_msgoff); @@ -1009,7 +1009,7 @@ psm3_mq_irecv_inner(psm2_mq_t mq, psm2_mq_req_t req, void *buf, uint32_t len) req->recv_msgoff = min(req->recv_msgoff, msglen); if (req->send_msgoff) { // only have sysbuf if RTS w/payload psm3_mq_recv_copy(mq, req, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, #endif buf, len, req->recv_msgoff); @@ -1061,12 +1061,12 @@ psm3_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *ta psm2_mq_req_t recv_req; int table; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; void *gpu_user_buffer = NULL; - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) { - PSM3_MARK_BUF_SYNCHRONOUS(buf); + if (len && PSM3_IS_GPU_MEM(buf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf); gpu_mem = 1; gpu_user_buffer = buf; @@ -1094,7 +1094,7 @@ psm3_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *ta recv_req->recv_msgoff = 0; recv_req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU recv_req->is_buf_gpu_mem = gpu_mem; recv_req->user_gpu_buffer = gpu_user_buffer; #endif @@ -1110,7 +1110,7 @@ psm3_mq_fp_msg(psm2_ep_t ep, psm2_mq_t mq, psm2_epaddr_t addr, psm2_mq_tag_t *ta tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], recv_req); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU recv_req->is_buf_gpu_mem = gpu_mem; recv_req->user_gpu_buffer = gpu_user_buffer; #endif @@ -1141,11 +1141,11 @@ psm3_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, psm2_mq_req_t req; int table; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) { - PSM3_MARK_BUF_SYNCHRONOUS(buf); + if (len && PSM3_IS_GPU_MEM(buf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf); gpu_mem = 1; } @@ -1177,7 +1177,7 @@ psm3_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, req->recv_msgoff = 0; req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem = gpu_mem; if (gpu_mem) req->user_gpu_buffer = buf; @@ -1195,7 +1195,7 @@ psm3_mq_irecv2(psm2_mq_t mq, psm2_epaddr_t src, " tagsel=%08x.%08x.%08x req=%p\n", buf, len, tag->tag[0], tag->tag[1], tag->tag[2], tagsel->tag[0], tagsel->tag[1], tagsel->tag[2], req); 
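
In the psm_mq.c hunks above, the removed call sites test PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf) while the replacements test only PSM3_IS_GPU_MEM(buf), so the new macro must fold the "GPU enabled" check into the pointer query. A minimal sketch of that idea follows, reusing the assumed psm3_gpu_hal table from the earlier note (extended here with hypothetical is_gpu_mem and mark_buf_synchronous entries); the real definitions are in psm3/gpu/psm_gpu_hal.h, not shown in this patch excerpt.

    /* sketch: one query per call site, safely false when no GPU is enabled */
    #define PSM3_IS_GPU_MEM(buf) \
        (psm3_gpu_hal && psm3_gpu_hal->is_gpu_mem(buf))
    #define PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf) \
        do { if (psm3_gpu_hal) psm3_gpu_hal->mark_buf_synchronous(buf); } while (0)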
-#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem = gpu_mem; if (gpu_mem) req->user_gpu_buffer = buf; @@ -1262,9 +1262,9 @@ psm3_mq_imrecv(psm2_mq_t mq, uint32_t flags, void *buf, uint32_t len, user's buffer. */ req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) { - PSM3_MARK_BUF_SYNCHRONOUS(buf); +#ifdef PSM_HAVE_GPU + if (len && PSM3_IS_GPU_MEM(buf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(buf); req->is_buf_gpu_mem = 1; req->user_gpu_buffer = buf; } else { @@ -1445,7 +1445,7 @@ psm2_error_t psm3_mqopt_ctl(psm2_mq_t mq, uint32_t key, void *value, int get) _HFI_VDBG("RNDV_SHM_SZ = %d (%s)\n", mq->shm_thresh_rv, get ? "GET" : "SET"); break; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU case PSM2_MQ_GPU_RNDV_SHM_SZ: if (get) *((uint32_t *) value) = mq->shm_gpu_thresh_rv; @@ -1735,7 +1735,7 @@ uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) // must do search since window_rv may not be increasing (but usually is) uint32_t ret = 0; struct psm3_mq_window_rv_entry *e; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu) e = mq->ips_gpu_window_rv; else @@ -1750,16 +1750,16 @@ uint32_t psm3_mq_max_window_rv(psm2_mq_t mq, int gpu) uint32_t psm3_mq_get_window_rv(psm2_mq_req_t req) { if (! req->window_rv) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { req->window_rv = search_window( req->mq->ips_gpu_window_rv, req->req_data.send_msglen); } else -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ req->window_rv = search_window(req->mq->ips_cpu_window_rv, req->req_data.send_msglen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU _HFI_VDBG("Selected Window of %u for %u byte %s msg\n", req->window_rv, req->req_data.send_msglen, @@ -2053,7 +2053,7 @@ static uint64_t shm_dsa_avg_copy_size_recv(void *context) } #endif /* PSM_DSA */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static uint64_t gpu_ipc_hit_rate(void *context) { psm2_mq_t mq = (psm2_mq_t)context; @@ -2071,7 +2071,7 @@ static uint64_t gpu_ipc_miss_rate(void *context) else return 0; } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ static uint64_t self_avg_msg_size_sent(void *context) @@ -2083,7 +2083,7 @@ static uint64_t self_avg_msg_size_sent(void *context) return 0; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static uint64_t eager_cpu_avg_msg_size_sent(void *context) { psm2_mq_t mq = (psm2_mq_t)context; @@ -2133,7 +2133,7 @@ static uint64_t sysbuf_cuCopy_avg_size_recv(void *context) else return 0; } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) { @@ -2393,7 +2393,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) "Total DSA receive copiess which failured for non-page fault error", &mq->stats.dsa_stats[1].dsa_error), #endif /* PSM_DSA */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // ------------------------------------------------------------ PSMI_STATS_DECL_HELP("Intra-node GPU messages may use GPU IPC Handles " "to perform GPU to GPU rendezvous messages directly to and from " @@ -2438,7 +2438,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) PSMI_STATS_DECLU64("gpu_ipc_clear", "Number of times entire cache was cleared and reset due to error", 
&mq->stats.gpu_ipc_cache_clear), -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ // ------------------------------------------------------------ PSMI_STATS_DECL_HELP("The PSM3 self protocol is used in the " @@ -2454,7 +2454,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) "Average message size sent using PSM3 self protocol", self_avg_msg_size_sent), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // ------------------------------------------------------------ PSMI_STATS_DECL_HELP("Eager messages may be sent from GPU or " "CPU application buffers.\n" @@ -2514,7 +2514,7 @@ psm2_error_t psm3_mq_initstats(psm2_mq_t mq, psm2_epid_t epid) PSMI_STATS_DECL_FUNC("sysbuf_cuCopy_avg_size_recv", "Average gpuCopy size from a receive bounce buffer to a GPU buffer", sysbuf_cuCopy_avg_size_recv), -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ }; return psm3_stats_register_type("MPI_Statistics_Summary", @@ -2577,8 +2577,8 @@ psm2_error_t psm3_mq_malloc(psm2_mq_t *mqo) // shm_thresh_rv is N/A to NIC and HAL, so we set this here and let // HAL set the rest of the defaults mq->shm_thresh_rv = PSM3_MQ_RNDV_SHM_THRESH; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - mq->shm_gpu_thresh_rv = PSM3_MQ_RNDV_SHM_GPU_THRESH; +#ifdef PSM_HAVE_GPU + mq->shm_gpu_thresh_rv = psm3_gpu_mq_rndv_shm_gpu_thresh_default; #endif psmi_hal_mq_init_defaults(mq); @@ -2604,7 +2604,7 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) { union psmi_envvar_val env_hfitiny, env_rvwin, env_hfirv, env_shmrv, env_hash, env_stats; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU union psmi_envvar_val env_shmgpurv; #endif @@ -2651,8 +2651,8 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) // already checked, shouldn't get parse errors nor empty strings psmi_assert(0); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && mq->ips_gpu_window_rv_str) { +#ifdef PSM_HAVE_GPU + if (mq->ips_gpu_window_rv_str) { union psmi_envvar_val env_gpurvwin; char *env; @@ -2679,7 +2679,7 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) } #else (void)got_depwin; // keep compiler happy -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ } psm3_getenv("PSM3_MQ_RNDV_SHM_THRESH", @@ -2688,8 +2688,8 @@ psm2_error_t psm3_mq_initialize_params(psm2_mq_t mq) (union psmi_envvar_val)mq->shm_thresh_rv, &env_shmrv); mq->shm_thresh_rv = env_shmrv.e_uint; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { psm3_getenv("PSM3_MQ_RNDV_SHM_GPU_THRESH", "shm eager-to-rendezvous switchover for GPU send", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, @@ -2729,7 +2729,7 @@ psm2_error_t MOCKABLE(psm3_mq_free)(psm2_mq_t mq) psm3_mq_req_fini(mq); psm3_mq_sysbuf_fini(mq); psm3_stats_deregister_type(PSMI_STATSTYPE_MQ, mq); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psmi_free(mq->ips_gpu_window_rv); #endif psmi_free(mq->ips_cpu_window_rv); diff --git a/prov/psm3/psm3/psm_mq_internal.h b/prov/psm3/psm3/psm_mq_internal.h index 824dc1ad60a..64bc05a2288 100644 --- a/prov/psm3/psm3/psm_mq_internal.h +++ b/prov/psm3/psm3/psm_mq_internal.h @@ -180,12 +180,12 @@ struct psm2_mq { uint32_t hfi_thresh_tiny; uint32_t rndv_nic_thresh; uint32_t shm_thresh_rv; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t shm_gpu_thresh_rv; #endif const char *ips_cpu_window_rv_str; // default input to parser struct psm3_mq_window_rv_entry 
*ips_cpu_window_rv; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU const char *ips_gpu_window_rv_str; // default input to parser struct psm3_mq_window_rv_entry *ips_gpu_window_rv; #endif @@ -330,31 +330,10 @@ struct psm2_mq_req { psm3_verbs_mr_t mr; // local registered memory for app buffer #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint8_t* user_gpu_buffer; /* for recv */ STAILQ_HEAD(sendreq_spec_, ips_gpu_hostbuf) sendreq_prefetch; uint32_t prefetch_send_msgoff; -#endif -#ifdef PSM_CUDA - CUipcMemHandle cuda_ipc_handle; - uint8_t cuda_ipc_handle_attached; - uint32_t cuda_ipc_offset; -#endif -#ifdef PSM_ONEAPI - union { - ze_ipc_mem_handle_t ipc_handle; // for sender req - uint32_t ze_handle; // receiver req pidfd or gem_handle - }; - uint8_t ze_handle_attached; - uint8_t ze_alloc_type; - uint32_t ze_ipc_offset; -#ifndef PSM_HAVE_PIDFD - uint32_t ze_device_index; -#endif - uint64_t ze_alloc_id; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - int gpu_hostbuf_used; /* * is_sendbuf_gpu_mem - Used to always select TID path on the receiver * when send is on a device buffer @@ -365,6 +344,10 @@ struct psm2_mq_req { * on a device/host buffer. */ uint8_t is_buf_gpu_mem; + uint16_t pad; // ensure fields below are 64 bit aligned + // GPU specific fields for use in PSM3 shm GPU IPC + union psm2_mq_req_gpu_specific gpu_specific; + int gpu_hostbuf_used; #endif /* PTLs get to store their own per-request data. MQ manages the allocation @@ -547,8 +530,8 @@ PSMI_ALWAYS_INLINE( void mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (len && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM(src))) { +#ifdef PSM_HAVE_GPU + if (len && (PSM3_IS_GPU_MEM(dest) || PSM3_IS_GPU_MEM(src))) { PSM3_GPU_MEMCPY(dest, src, len); return; } @@ -587,7 +570,7 @@ mq_copy_tiny(uint32_t *dest, uint32_t *src, uint8_t len)) typedef void (*psmi_mtucpy_fn_t)(void *dest, const void *src, uint32_t len); typedef void (*psmi_copy_tiny_fn_t)(uint32_t *dest, uint32_t *src, uint8_t len); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_ALWAYS_INLINE( void @@ -781,7 +764,7 @@ MOCK_DCL_EPILOGUE(psm3_mq_free); void psm3_mq_handle_rts_complete(psm2_mq_req_t req); int psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, uint32_t offset, const void *payload, uint32_t paylen -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , int use_gdrcopy, psm2_ep_t ep #endif ); @@ -804,7 +787,7 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t req); // can get future cache hits on other size messages in same buffer // not needed - msglen - negotiated total message size // copysz - actual amount to copy (<= msglen) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psm3_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem, void *buf, uint32_t len, uint32_t copysz); #else diff --git a/prov/psm3/psm3/psm_mq_recv.c b/prov/psm3/psm3/psm_mq_recv.c index 181d4dd5ba7..f8ea86a5fa6 100644 --- a/prov/psm3/psm3/psm_mq_recv.c +++ b/prov/psm3/psm3/psm_mq_recv.c @@ -98,7 +98,7 @@ void psm3_mq_handle_rts_complete(psm2_mq_req_t req) return; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* * Copy a packet from host buffer to a gpu buffer. 
* @@ -170,12 +170,12 @@ psm3_mq_req_gpu_copy(uint64_t gpu_buf_start, uint32_t gpu_buf_len, pkt_len = len; } } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ static void psm3_mq_req_copy(psm2_mq_req_t req, uint32_t offset, const void *buf, uint32_t nbytes -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , int use_gdrcopy, psm2_ep_t ep #endif ) @@ -198,7 +198,7 @@ psm3_mq_req_copy(psm2_mq_req_t req, msglen_this = nbytes; } if (msgptr != buf) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // for loopback HAL, invalid to call psm3_mq_get_window_rv() // however, for loopback HAL, gdr copy is disabled if (use_gdrcopy) @@ -227,7 +227,7 @@ psm3_mq_req_copy(psm2_mq_req_t req, int psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, uint32_t offset, const void *buf, uint32_t nbytes -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , int use_gdrcopy, psm2_ep_t ep #endif ) @@ -245,7 +245,7 @@ psm3_mq_handle_data(psm2_mq_t mq, psm2_mq_req_t req, rc = MQ_RET_UNEXP_OK; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_req_copy(req, offset, buf, nbytes, use_gdrcopy, ep); #else psm3_mq_req_copy(req, offset, buf, nbytes); @@ -416,7 +416,7 @@ psm3_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, if (paylen) { // payload of RTS can contain a single packet synchronous MPI msg psm3_mq_mtucpy(req->req_data.buf, payload, paylen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { stats->rndv_rts_cuCopy_recv++; stats->rndv_rts_cuCopy_recv_bytes += paylen; @@ -474,7 +474,7 @@ psm3_mq_handle_rts(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, if (paylen) { req->req_data.buf = psm3_mq_sysbuf_alloc(mq, paylen); psmi_assert(paylen == 0 || req->req_data.buf != NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_mtucpy_host_mem(req->req_data.buf, payload, paylen); #else psm3_mq_mtucpy(req->req_data.buf, payload, paylen); @@ -521,9 +521,9 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, psm2_mq_req_t req; uint32_t msglen; psmi_mtucpy_fn_t psmi_mtucpy_fn; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int use_gdrcopy = 0; -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ psm2_mq_tag_t *tag = (psm2_mq_tag_t *)_tag; if (msgorder && (req = psm3_mq_req_match(mq, src, tag, 1))) { @@ -543,7 +543,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, switch (opcode) { case MQ_MSG_TINY: /* mq_copy_tiny() can handle zero byte */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { mq_copy_tiny_host_mem((uint32_t *) user_buffer, (uint32_t *) payload, msglen); stats->tiny_cpu_recv++; @@ -561,7 +561,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, user_buffer = req->req_data.buf; #endif mq_copy_tiny((uint32_t *) user_buffer, (uint32_t *) payload, msglen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU stats->tiny_cuCopy_recv++; stats->tiny_cuCopy_recv_bytes += msglen; } @@ -577,7 +577,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, case MQ_MSG_SHORT: /* message fits in 1 payload */ psmi_mtucpy_fn = psm3_mq_mtucpy; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; stats->short_cpu_recv++; @@ -589,15 +589,10 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t 
src, uint32_t *_tag, (unsigned long)req->req_data.buf, msglen, 1, mq->ep))) { psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; -#ifdef PSM_ONEAPI - use_gdrcopy = 1; -#endif stats->short_gdrcopy_recv++; stats->short_gdrcopy_recv_bytes += msglen; } else { user_buffer = req->req_data.buf; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) stats->short_cuCopy_recv++; stats->short_cuCopy_recv_bytes += msglen; } @@ -635,7 +630,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, _HFI_VDBG("exp MSG_EAGER of length %d bytes pay=%d\n", msglen, paylen); // !offset -> only count recv msgs on 1st pkt in msg -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { if (!offset) stats->eager_cpu_recv++; stats->eager_cpu_recv_bytes += paylen; @@ -655,7 +650,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, #endif if (paylen > 0) psm3_mq_handle_data(mq, req, offset, payload, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU paylen, use_gdrcopy, mq->ep); #else paylen); @@ -721,7 +716,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, if (msglen > 0) { req->req_data.buf = psm3_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mq_copy_tiny_host_mem((uint32_t *) req->req_data.buf, (uint32_t *) payload, msglen); #else @@ -741,14 +736,14 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, req->req_data.buf = psm3_mq_sysbuf_alloc(mq, msglen); psmi_assert(msglen == 0 || req->req_data.buf != NULL); if (msglen <= paylen) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_mtucpy_host_mem(req->req_data.buf, payload, msglen); #else psm3_mq_mtucpy(req->req_data.buf, payload, msglen); #endif } else { psmi_assert((msglen & ~0x3) == paylen); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_mtucpy_host_mem(req->req_data.buf, payload, paylen); #else psm3_mq_mtucpy(req->req_data.buf, payload, paylen); @@ -758,7 +753,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, * copy after the DW payload. */ uint32_t off[] = { offset }; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU mq_copy_tiny_host_mem((uint32_t *)(req->req_data.buf+paylen), (uint32_t *)off, msglen & 0x3); #else @@ -781,7 +776,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, _HFI_VDBG("unexp MSG_EAGER of length %d bytes pay=%d\n", msglen, paylen); if (paylen > 0) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_handle_data(mq, req, offset, payload, paylen, 0, NULL); #else psm3_mq_handle_data(mq, req, offset, payload, paylen); @@ -807,7 +802,7 @@ psm3_mq_handle_envelope(psm2_mq_t mq, psm2_epaddr_t src, uint32_t *_tag, return MQ_RET_UNEXP_OK; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) // declared inline in psm_mq_internal.h for non-CUDA +#ifdef PSM_HAVE_GPU // declared inline in psm_mq_internal.h for non-GPU // perform the actual copy for an psmi_mq_irecv_inner. We copy from a sysbuf // (req->req_data.buf) to the actual user buffer (buf) and keep statistics. // is_buf_gpu_mem indicates if buf is a gpu buffer @@ -826,22 +821,22 @@ void psm3_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem, return; } if (!is_buf_gpu_mem) { - psmi_assert(! 
PSMI_IS_GPU_ENABLED || !PSMI_IS_GPU_MEM(buf)); + psmi_assert(!PSM3_IS_GPU_MEM(buf)); mq->stats.rx_sysbuf_cpu_num++; mq->stats.rx_sysbuf_cpu_bytes += copysz; psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; - // len could be huge, so limit ourselves to gdr_copy_limit_recv - // Note to get here copysz <= gdr_copy_limit_recv + // len could be huge, so limit ourselves to psm3_gpu_gdr_copy_limit_recv + // Note to get here copysz <= psm3_gpu_gdr_copy_limit_recv } else if (PSMI_USE_GDR_COPY_RECV(copysz) && NULL != (ubuf = psmi_hal_gdr_convert_gpu_to_host_addr((unsigned long)buf, - min(gdr_copy_limit_recv, len), 1, + min(psm3_gpu_gdr_copy_limit_recv, len), 1, mq->ep))) { - psmi_assert(! PSMI_IS_GPU_ENABLED || PSMI_IS_GPU_MEM(buf)); + psmi_assert(! PSM3_GPU_IS_ENABLED || PSM3_IS_GPU_MEM(buf)); psmi_mtucpy_fn = psm3_mq_mtucpy_host_mem; mq->stats.rx_sysbuf_gdrcopy_num++; mq->stats.rx_sysbuf_gdrcopy_bytes += copysz; } else { - psmi_assert(! PSMI_IS_GPU_ENABLED || PSMI_IS_GPU_MEM(buf)); + psmi_assert(! PSM3_GPU_IS_ENABLED || PSM3_IS_GPU_MEM(buf)); ubuf = buf; mq->stats.rx_sysbuf_cuCopy_num++; mq->stats.rx_sysbuf_cuCopy_bytes += copysz; @@ -849,7 +844,7 @@ void psm3_mq_recv_copy(psm2_mq_t mq, psm2_mq_req_t req, uint8_t is_buf_gpu_mem, if (copysz) psmi_mtucpy_fn(ubuf, (const void *)req->req_data.buf, copysz); } -#endif // defined(PSM_CUDA) || defined(PSM_ONEAPI) +#endif // PSM_HAVE_GPU // we landed an out of order message in a sysbuf and can now process it // ureq is where we landed it. If found, ereq is the user posted receive. @@ -873,13 +868,13 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) case MQ_STATE_COMPLETE: if (ureq->req_data.buf != NULL) { /* 0-byte don't alloc a sysreq_data.buf */ psm3_mq_recv_copy(mq, ureq, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, ereq->req_data.buf_len, msglen); psm3_mq_sysbuf_free(mq, ureq->req_data.buf); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } else { mq->stats.rx_sysbuf_cpu_num++; // zero length #endif @@ -895,7 +890,7 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->send_msgoff = ureq->send_msgoff; ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); psm3_mq_recv_copy(mq, ureq, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, @@ -913,7 +908,7 @@ int psm3_mq_handle_outoforder(psm2_mq_t mq, psm2_mq_req_t ureq) ereq->recv_msgoff = min(ureq->recv_msgoff, msglen); if (ereq->send_msgoff) { // only have sysbuf if RTS w/payload psm3_mq_recv_copy(mq, ureq, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ereq->is_buf_gpu_mem, #endif ereq->req_data.buf, diff --git a/prov/psm3/psm3/psm_nic_select.c b/prov/psm3/psm3/psm_nic_select.c index 58d3ab72b15..cbd8943d457 100644 --- a/prov/psm3/psm3/psm_nic_select.c +++ b/prov/psm3/psm3/psm_nic_select.c @@ -72,6 +72,22 @@ #endif #endif +// PSM3_NIC_SELECTION_ALG choices. +// ALG_NUMA is the default. This option spreads the NIC selection within the +// local CPU socket's NICs (NUMA). +// If it is preferred to spread the job over the entire set of NICs within the +// system, use ALG_ANY. +// For systems with PCIe switches for GPU Direct, GPU_CENTRIC is typically best. +// For GPU systems w/o switches, CPU_CENTRIC may be best.
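As a quick illustration of the comment above -- not part of this patch -- a launcher or test harness could pick one of these algorithms through the environment before the endpoint is opened. The accepted strings ("rr", "p", "rra", "crr" and, when built with PSM_HAVE_GPU_CENTRIC_AFFINITY, "grr") are the ones recognized by parse_selection_alg() introduced later in this file; the helper name and its argument are hypothetical.

#include <stdlib.h>

/* Hypothetical helper: choose the NIC selection algorithm for this job.
 * "grr" maps to PSMI_NIC_SEL_ALG_GPU_CENTRIC (PCIe-switch GPU systems),
 * "crr" to PSMI_NIC_SEL_ALG_CPU_CENTRIC, and the default "rr" to
 * PSMI_NIC_SEL_ALG_NUMA. */
static void example_set_nic_selection(int have_pcie_switches)
{
	setenv("PSM3_NIC_SELECTION_ALG",
	       have_pcie_switches ? "grr" : "crr", 1);
}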
+#define PSMI_NIC_SEL_ALG_NUMA 0 /* Round Robin within NUMA */ +#define PSMI_NIC_SEL_ALG_FIRST 1 /* First Active NIC */ +#define PSMI_NIC_SEL_ALG_ANY 2 /* Round Robin All */ +#define PSMI_NIC_SEL_ALG_CPU_CENTRIC 3 /* Round Robin, prefer CPU distance */ +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY +#define PSMI_NIC_SEL_ALG_GPU_CENTRIC 4 /* Round Robin, prefer GPU distance */ +#endif + + // subnuma is risky right now, so disable and explore in future //#ifdef PSM_USE_HWLOC //#define PSM3_HAVE_CPU_SUBNUMA @@ -152,7 +168,7 @@ psm3_get_uuid_hash(psm2_uuid_t const uuid) int psm3_get_current_proc_location() { - int core_id, node_id; + int core_id, node_id; core_id = sched_getcpu(); if (core_id < 0) @@ -250,66 +266,6 @@ int psm3_get_max_cpu_numa() return max_cpu_numa; } -/* search the list of all units for those which are active - * and optionally match the given NUMA node_id (when node_id >= 0) - * returns the number of active units found. - * Note get_unit_active tests for active ports, valid addresses and - * performs filtering as done in get_port_subnets - */ -static int -hfi_find_active_hfis(int nunits, int node_id, int *saved_hfis) -{ - int found = 0, unit_id; - - for (unit_id = 0; unit_id < nunits; unit_id++) { - int node_id_i; - - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - - if (node_id < 0) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobinAll Found NIC unit= %d, local rank=%d.\n", - unit_id, psm3_get_mylocalrank()); - } else if (!psmi_hal_get_node_id(unit_id, &node_id_i) - && node_id_i == node_id) { - saved_hfis[found++] = unit_id; - _HFI_DBG("RoundRobin Found NIC unit= %d, node = %d, local rank=%d.\n", - unit_id, node_id, psm3_get_mylocalrank()); - } - } - return found; -} - -// select NIC across all NICs, use a hash of job_id and local rank to -// distribute local ranks across NICs and to attempt to distribute -// jobs across NICs. -// TBD - if know never have >1 job per node, could ignore job_id, perhaps -// have an env to exclude job_id from hash so NIC selection is deterministic -static void -psmi_spread_nic_selection(psm2_uuid_t const job_key, long *unit_start, - long *unit_end, int nunits) -{ - int found, saved_hfis[nunits]; - - /* we are going to look at: - (a hash of the job key plus the local rank id) mod nunits. */ - found = hfi_find_active_hfis(nunits, -1, saved_hfis); - if (found) - *unit_start = saved_hfis[((psm3_get_mylocalrank()+1) + - psm3_get_uuid_hash(job_key)) % found]; - else - // none found, caller will fail, start is a don't care - *unit_start = 0; - /* just in case, caller will check all other units, with wrap */ - if (*unit_start > 0) - *unit_end = *unit_start - 1; - else - *unit_end = nunits-1; - _HFI_DBG("RoundRobinAll Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); -} - static int psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) { @@ -411,57 +367,6 @@ psm3_create_and_open_affinity_shm(psm2_uuid_t const job_key) return -1; } -/* - * Spread HFI selection between units if we find more than one within a socket. - */ -static void -psmi_spread_hfi_within_socket(long *unit_start, long *unit_end, int node_id, - int *saved_hfis, int found, psm2_uuid_t const job_key) -{ - int ret, shm_location; - - /* - * Take affinity lock and open shared memory region to be able to - * accurately determine which HFI to pick for this process. If any - * issues, bail by picking first known HFI. 
- */ - if (!psm3_affinity_semaphore_open) - goto spread_hfi_fallback; - - ret = psm3_create_and_open_affinity_shm(job_key); - if (ret < 0) - goto spread_hfi_fallback; - - // one shm entry per CPU NUMA domain - // The entry contains the next round robin NIC to use - // in the form of a index into saved_hfis - // saved_hfis has a list of all the NUMA local active NICs - shm_location = AFFINITY_SHM_HFI_INDEX_LOCATION + node_id; - if (shm_location > PSMI_PAGESIZE) - goto spread_hfi_fallback; - - /* Start critical section to read/write shm object */ - if (psmi_sem_timedwait(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name)) { - _HFI_VDBG("Could not enter critical section to update NIC index\n"); - goto spread_hfi_fallback; - } - - *unit_start = *unit_end = saved_hfis[psm3_shared_affinity_ptr[shm_location]]; - psm3_shared_affinity_ptr[shm_location] = - (psm3_shared_affinity_ptr[shm_location] + 1) % found; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, Next NIC=%ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, psm3_shared_affinity_ptr[shm_location], node_id, - psm3_get_mylocalrank(), found); - - /* End Critical Section */ - psmi_sem_post(psm3_sem_affinity_shm_rw, psm3_sem_affinity_shm_rw_name); - - return; - -spread_hfi_fallback: - *unit_start = *unit_end = saved_hfis[0]; -} - static void psm3_create_affinity_semaphores(psm2_uuid_t const job_key) { @@ -730,9 +635,9 @@ void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) unsigned i; int min_distance = INT_MAX; // smallest distance found unsigned found = 0; - struct pci_addr gpu_pci_addr; + struct pci_addr gpu_pci_addr = { 0 }; - if (! PSMI_IS_GPU_ENABLED) + if (! PSM3_GPU_IS_ENABLED) return; psm3_deferred_hwloc_topology_init(); @@ -740,66 +645,9 @@ void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) return; // hwloc incorrect version psmi_assert(psm3_hwloc_topology_initialized); - // Get current GPU PCIe address to gpu_pci_addr; -#ifdef PSM_CUDA - { - int domain, bus, dev; - int num_devices; - CUdevice device; - - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - _HFI_DBG("%d Cuda GPUs found\n", num_devices); - if (! num_devices) - return; - - if (num_devices == 1) { - PSMI_CUDA_CALL(cuDeviceGet, &device, 0); - } else { - // all GPUs will be visible to process, see if app chose one first - CUcontext ctxt = {0}; - if (! psmi_cuCtxGetCurrent || psmi_cuCtxGetCurrent(&ctxt) || ! ctxt) { - _HFI_DBG("Unable to get Cuda ctxt\n"); - //PSMI_CUDA_CALL(cuDeviceGet, &device, 0); - return; - } else { - PSMI_CUDA_CALL(cuCtxGetDevice, &device); - } - } - _HFI_DBG("Using Cuda GPU %d\n", device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &domain, - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &bus, - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &dev, - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device); - gpu_pci_addr.domain = domain; - gpu_pci_addr.bus = bus; - gpu_pci_addr.dev = dev; - gpu_pci_addr.func = 0; - } -#elif defined(PSM_ONEAPI) - { - ze_pci_ext_properties_t PciProperties; - - _HFI_DBG("%d Level Zero GPUs found\n", num_ze_devices); - if (! 
num_ze_devices) - return; - - // caling middleware will have limited GPUs visible to process - PSMI_ONEAPI_ZE_CALL(zeDevicePciGetPropertiesExt, - ze_devices[0].dev, &PciProperties); - gpu_pci_addr.domain = PciProperties.address.domain; - gpu_pci_addr.bus = PciProperties.address.bus; - gpu_pci_addr.dev = PciProperties.address.device; - gpu_pci_addr.func = PciProperties.address.function; - } -#endif + // Get current GPU's PCIe address to gpu_pci_addr; + PSM3_GPU_GET_PCI_ADDR( &gpu_pci_addr.domain, &gpu_pci_addr.bus, + &gpu_pci_addr.dev, &gpu_pci_addr.func); _HFI_DBG("GPU PCIe address is %04x:%02x:%02x.%x\n", gpu_pci_addr.domain, gpu_pci_addr.bus, gpu_pci_addr.dev, gpu_pci_addr.func); @@ -847,6 +695,14 @@ void nic_info_filter_gpu_distance(struct nic_info *nic_info, unsigned ninfo) } #endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ +// filter down the list of NICs solely based on CPU NUMA locality +static void nic_info_filter_cpu_numa(struct nic_info *nic_info, + unsigned ninfo) +{ + _HFI_DBG("Filtering NICs with CPU NUMA Strategy\n"); + nic_info_filter_numa(nic_info, ninfo); +} + // filter down the list of NICs with a CPU locality focus as priority // if present, the GPU is considered last. If the GPU is NUMA local // to the CPU, the GPU filter can further limit NICs to those close to the @@ -1008,7 +864,7 @@ psm3_open_shm_scoreboard_and_select_nic( goto fallback; } - // balance among procceses within current node + // balance among processes within current node nic_info_filter_refcount(nic_info, ninfo, psm3_shared_affinity_nic_refcount_ptr, nunits, "local node"); @@ -1057,198 +913,58 @@ void psm3_dec_nic_refcount(int unit_id) } } -psm2_error_t -psm3_compute_start_and_end_unit_cpu_centric( - psm2_uuid_t const job_key, - long *unit_start,long *unit_end, int nunits) +static int parse_selection_alg(const char *str) { - unsigned index; - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; - - // caller will enumerate addr_index, just just get all active ports - ninfo = nic_info_init(nic_info, nunits, 0); - if (! ninfo) { - // should not happen, caller already confirmed there is >1 active unit - // mimic what caller of psm3_compute_start_and_end_unit would do - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 no nic units are active"); - } - - nic_info_filter_cpu_centric(nic_info, ninfo); - - index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, - job_key, nunits); - psmi_assert(index >= 0 && index < ninfo); - - // caller will select 1st active port and an addr_index within unit - *unit_start = *unit_end = nic_info[index].unit; - return PSM2_OK; -} - + if (!strcasecmp(str, "Round Robin") + || !strcasecmp(str, "RoundRobin") + || !strcasecmp(str, "rr")) + return PSMI_NIC_SEL_ALG_NUMA; + else if (!strcasecmp(str, "Packed") + || !strcasecmp(str, "p")) + return PSMI_NIC_SEL_ALG_FIRST; + else if (!strcasecmp(str, "Round Robin All") + || !strcasecmp(str, "RoundRobinAll") + || !strcasecmp(str, "rra")) + return PSMI_NIC_SEL_ALG_ANY; + else if (!strcasecmp(str, "CPU Centric Round Robin") + || !strcasecmp(str, "CpuRoundRobin") + || !strcasecmp(str, "crr")) + return PSMI_NIC_SEL_ALG_CPU_CENTRIC; #ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -psm2_error_t -psm3_compute_start_and_end_unit_gpu_centric( - psm2_uuid_t const job_key, - long *unit_start,long *unit_end, int nunits) -{ - unsigned index; - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; - - // caller will enumerate addr_index, just just get all active ports - ninfo = nic_info_init(nic_info, nunits, 0); - if (! 
ninfo) { - // should not happen, caller already confirmed there is >1 active unit - // mimic what caller of psm3_compute_start_and_end_unit would do - return psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 no nic units are active"); - } - - nic_info_filter_gpu_centric(nic_info, ninfo); - - index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, - job_key, nunits); - psmi_assert(index >= 0 && index < ninfo); - - // caller will select 1st active port and an addr_index within unit - *unit_start = *unit_end = nic_info[index].unit; - return PSM2_OK; + else if (!strcasecmp(str, "GPU Centric Round Robin") + || !strcasecmp(str, "GpuRoundRobin") + || !strcasecmp(str, "grr")) + return PSMI_NIC_SEL_ALG_GPU_CENTRIC; +#endif + else + return -1; } -#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start >= end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search -psm2_error_t -psm3_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive, int nunits, - psm2_uuid_t const job_key, - long *unit_start, long *unit_end) +/* check for valid PSM3_SELECTION_ALG + * returns: + * 0 - valid + * -1 - empty string + * -2 - invalid syntax + */ +static int parse_check_selection_alg(int type, const union psmi_envvar_val val, + void *ptr, size_t errstr_size, char errstr[]) { - unsigned short nic_sel_alg = PSMI_UNIT_SEL_ALG_ACROSS; - int node_id, found = 0; - int saved_hfis[nunits]; - - /* if the user did not set PSM3_NIC then ... */ - if (unit_param == PSM3_NIC_ANY) - { - if (nunitsactive > 1) { - // if NICs are on different planes (non-routed subnets) - // we need to have all ranks default to the same plane - // so force 1st active NIC in that case - int have_subnet = 0, unit_id; - psmi_subnet128_t got_subnet = { }; - for (unit_id = 0; unit_id < nunits; unit_id++) { - psmi_subnet128_t subnet; - if (psmi_hal_get_unit_active(unit_id) <= 0) - continue; - if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, - addr_index>0?addr_index:0, - &subnet, NULL, NULL, NULL)) - continue; // can't access NIC - if (! have_subnet) { - have_subnet = 1; - got_subnet = subnet; - } else if (! psm3_subnets_match(got_subnet, - subnet)) { - // active units have different tech - // (IB/OPA vs Eth) or different subnets - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("Multi-Plane config: Will select 1st viable NIC unit= %ld to %ld.\n", - *unit_start, *unit_end); - return PSM2_OK; - } - } - } - - /* Get the actual selection algorithm from the environment: */ - nic_sel_alg = psmi_parse_nic_selection_algorithm(); - /* If round-robin is selection algorithm and ... */ - if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS) && - /* there are more than 1 active units then ... */ - (nunitsactive > 1)) - { - /* - * Pick an HFI on same root complex as current task. - * linux IPC ensures balanced NIC usage within job. - * If none found, fall back to - * RoundRobinAll load-balancing algorithm. 
- */ - node_id = psm3_get_current_proc_location(); - if (node_id >= 0) { - found = hfi_find_active_hfis(nunits, node_id, - saved_hfis); - if (found > 1) { - psm3_create_affinity_semaphores(job_key); - psmi_spread_hfi_within_socket(unit_start, unit_end, - node_id, saved_hfis, - found, job_key); - } else if (found == 1) { - *unit_start = *unit_end = saved_hfis[0]; - _HFI_DBG("RoundRobin Selected NIC unit= %ld, node = %d, local rank=%d, found=%d.\n", - *unit_start, node_id, - psm3_get_mylocalrank(), found); - } - } - - if (node_id < 0 || !found) { - _HFI_DBG("RoundRobin No local NIC found, using RoundRobinAll, node = %d, local rank=%d, found=%d.\n", - node_id, - psm3_get_mylocalrank(), found); - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_ACROSS_ALL) && - (nunitsactive > 1)) { - psmi_spread_nic_selection(job_key, unit_start, - unit_end, nunits); - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_CPU_CENTRIC) && - (nunitsactive > 1)) { - return psm3_compute_start_and_end_unit_cpu_centric(job_key, - unit_start, unit_end, nunits); -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY - } else if ((nic_sel_alg == PSMI_UNIT_SEL_ALG_GPU_CENTRIC) && - (nunitsactive > 1)) { - return psm3_compute_start_and_end_unit_gpu_centric(job_key, - unit_start, unit_end, nunits); -#endif - } else { // PSMI_UNIT_SEL_ALG_WITHIN or only 1 active unit - // caller will pick 1st active unit - *unit_start = 0; - *unit_end = nunits - 1; - _HFI_DBG("%s: Will select 1st viable NIC unit= %ld to %ld.\n", - (nic_sel_alg == PSMI_UNIT_SEL_ALG_WITHIN) - ?"Packed":"Only 1 viable NIC", - *unit_start, *unit_end); - } - } else if (unit_param >= 0) { - /* the user specified PSM3_NIC, we use it. */ - *unit_start = *unit_end = unit_param; - _HFI_DBG("Caller selected NIC %ld.\n", *unit_start); - } else { - psm3_handle_error(NULL, PSM2_EP_DEVICE_FAILURE, - "PSM3 can't open unit: %ld for reading and writing", - unit_param); - return PSM2_EP_DEVICE_FAILURE; - } - - return PSM2_OK; + psmi_assert(type == PSMI_ENVVAR_TYPE_STR); + if (! val.e_str || ! *val.e_str) + return -1; + if (parse_selection_alg(val.e_str) < 0) + return -2; + return 0; } static int psmi_parse_nic_selection_algorithm(void) { union psmi_envvar_val env_nic_alg; - int nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; + int nic_alg; const char* PSM3_NIC_SELECTION_ALG_HELP = - "NIC Device Selection Algorithm to use. Round Robin[RoundRobin or rr] (Default) " + "Round Robin[RoundRobin or rr] (Default)" ", Packed[p], Round Robin All[RoundRobinAll or rra]," #ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY " CPU Centric Round Robin [CpuRoundRobin or crr]" @@ -1257,43 +973,106 @@ int psmi_parse_nic_selection_algorithm(void) " or CPU Centric Round Robin [CpuRoundRobin or crr]"; #endif + psm3_getenv_range("PSM3_NIC_SELECTION_ALG", + "NIC Device Selection Algorithm", + PSM3_NIC_SELECTION_ALG_HELP, + PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"rr", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + parse_check_selection_alg, NULL, &env_nic_alg); - /* If a specific unit is set in the environment, use that one. 
*/ - psm3_getenv("PSM3_NIC_SELECTION_ALG", PSM3_NIC_SELECTION_ALG_HELP, - PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"rr", &env_nic_alg); - - if (!strcasecmp(env_nic_alg.e_str, "Round Robin") - || !strcasecmp(env_nic_alg.e_str, "RoundRobin") - || !strcasecmp(env_nic_alg.e_str, "rr")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; - else if (!strcasecmp(env_nic_alg.e_str, "Packed") - || !strcasecmp(env_nic_alg.e_str, "p")) - nic_alg = PSMI_UNIT_SEL_ALG_WITHIN; - else if (!strcasecmp(env_nic_alg.e_str, "Round Robin All") - || !strcasecmp(env_nic_alg.e_str, "RoundRobinAll") - || !strcasecmp(env_nic_alg.e_str, "rra")) - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS_ALL; - else if (!strcasecmp(env_nic_alg.e_str, "CPU Centric Round Robin") - || !strcasecmp(env_nic_alg.e_str, "CpuRoundRobin") - || !strcasecmp(env_nic_alg.e_str, "crr")) - nic_alg = PSMI_UNIT_SEL_ALG_CPU_CENTRIC; + nic_alg = parse_selection_alg(env_nic_alg.e_str); + psmi_assert(nic_alg >= 0); + return nic_alg; +} + +// Autoselect one unit for non-multirail operation. +// caller will select 1st active port and an addr_index within unit +// returns the unit number or -1 if unable to find an active unit +int +psm3_autoselect_one(long addr_index, int nunits, psm2_uuid_t const job_key) +{ + unsigned short nic_sel_alg; + unsigned first_active = nunits; // invalid value. for error check + int have_subnet = 0, unit_id; + psmi_subnet128_t got_subnet = { }; + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; + unsigned index; + int nunitsactive = 0; + + // find first_active, also if NICs are on different planes + // (non-routed subnets) we need to have all ranks default to the + // same plane so force 1st active NIC in that case + for (unit_id = 0; unit_id < nunits; unit_id++) { + psmi_subnet128_t subnet; + if (psmi_hal_get_unit_active(unit_id) <= 0) + continue; + if (0 != psmi_hal_get_port_subnet(unit_id, 1 /* VERBS_PORT*/, + addr_index>0?addr_index:0, + &subnet, NULL, NULL, NULL)) + continue; // can't access NIC + // found an active viable NIC + nunitsactive++; + if (! have_subnet) { + have_subnet = 1; + got_subnet = subnet; + first_active = unit_id; + } else if (! psm3_subnets_match(got_subnet, subnet)) { + // Active units have different tech (IB/OPA vs Eth) + // or different subnets. + // Use 1st active unit so all ranks in job can communicate + _HFI_DBG("Multi-Plane config: Using 1st viable NIC unit= %u.\n", + first_active); + return first_active; + } + } + if (nunitsactive == 0) + return -1; + + nic_sel_alg = psmi_parse_nic_selection_algorithm(); + + if (nunitsactive <= 1 || nic_sel_alg == PSMI_NIC_SEL_ALG_FIRST) { + // pick 1st active unit + _HFI_DBG("%s: Selected 1st viable NIC unit= %u.\n", + (nic_sel_alg == PSMI_NIC_SEL_ALG_FIRST) + ?"Packed":"Only 1 viable NIC", + first_active); + return first_active; + } + + ninfo = nic_info_init(nic_info, nunits, 0); + if (! 
ninfo) { + // should not happen, already confirmed there is >1 active unit + return -1; + } + switch (nic_sel_alg) { + default: + case PSMI_NIC_SEL_ALG_NUMA: /* round-robin is selection algorithm */ + nic_info_filter_cpu_numa(nic_info, ninfo); + break; + case PSMI_NIC_SEL_ALG_ANY: + // we will use any active unit + _HFI_DBG("No further NIC filtering\n"); + break; + case PSMI_NIC_SEL_ALG_CPU_CENTRIC: + nic_info_filter_cpu_centric(nic_info, ninfo); + break; #ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY - else if (!strcasecmp(env_nic_alg.e_str, "GPU Centric Round Robin") - || !strcasecmp(env_nic_alg.e_str, "GpuRoundRobin") - || !strcasecmp(env_nic_alg.e_str, "grr")) - nic_alg = PSMI_UNIT_SEL_ALG_GPU_CENTRIC; + case PSMI_NIC_SEL_ALG_GPU_CENTRIC: + nic_info_filter_gpu_centric(nic_info, ninfo); + break; #endif - else { - _HFI_INFO( - "Invalid value for PSM3_NIC_SELECTION_ALG ('%s') %-40s Using: %s\n", - env_nic_alg.e_str, PSM3_NIC_SELECTION_ALG_HELP, "RoundRobin"); - nic_alg = PSMI_UNIT_SEL_ALG_ACROSS; } - return nic_alg; + index = psm3_open_shm_scoreboard_and_select_nic(nic_info, ninfo, + job_key, nunits); + psmi_assert(index >= 0 && index < ninfo); + + return nic_info[index].unit; } + /* parse a list of NIC rails for PSM3_MULTIRAIL_MAP * map is in format: unit:port-addr_index,unit:port-addr_index,...;unit.... * where :port is optional (default of 1) and unit can be name or number @@ -1590,109 +1369,95 @@ psm3_copy_nic_info_to_multitrail_config( } } -// select a list of NICs to use, optimizing for CPU locality first +// Multirail enabled, see if PSM3_MULTIRAIL_MAP is selecting NICs +// for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. +// returns: +// PSM2_OK - PSM3_MULTIRAIL_MAP specified and valid, multirail_config updated +// PSM2_EP_NO_DEVICE - PSM3_MULTIRAIL_MAP not specified or invalid static psm2_error_t -psm3_ep_multirail_autoselect_cpu_centric(uint32_t nunits, +psm3_ep_multirail_map(int multirail_mode, struct multirail_config *multirail_config) { - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; + int ret; + union psmi_envvar_val env_multirail_map; + int map_index; - // enumerate addr_index too - ninfo = nic_info_init(nic_info, nunits, 1); - if (! ninfo) { - // caller will try single NIC selection next - multirail_config->num_rails = 0; - return PSM2_OK; + // PSM3_MUTLIRAIL_MAP only allowed for PSM3_MULTIRAIL=1 or 2 + // We treat invalid input, such as bad syntax or selection of an unusable + // port (down/missing/etc), as a fatal error instead of attempting to run + // on the default PSM3_MULTIRAIL_MAP config. This helps avoid + // inconsistent NIC selections, especially for down ports, which may + // cause confusing behaviors or errors. + // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then + // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select + // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) 
to select + if (multirail_mode == 1) { + map_index = psm3_get_mylocalrank(); + } else if (multirail_mode == 2) { + map_index = psm3_get_current_proc_location(); + if (map_index < 0) { + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Unable to get NUMA location of current process\n"); + } + } else { + return PSM2_EP_NO_DEVICE; // caller will ignore MULTIRAIL_MAP } - - nic_info_filter_cpu_centric(nic_info, ninfo); - - // we will use all unfiltered units - - // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU - // selection, it will be called per rail and if rails are in - // different CPU NUMA could have an undesired impact - setenv("PSM3_NO_AFFINITY", "1", 1); - - psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); - return PSM2_OK; -} - -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -// select a list of NICs to use, optimizing for GPU locality first -static psm2_error_t -psm3_ep_multirail_autoselect_gpu_centric(uint32_t nunits, - struct multirail_config *multirail_config) -{ - unsigned ninfo; - struct nic_info nic_info[PSMI_MAX_RAILS]; - - // enumerate addr_index too - ninfo = nic_info_init(nic_info, nunits, 1); - if (! ninfo) { - // caller will try single NIC selection next + ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", + "Explicit NIC selections for each rail", + "Specified as:\n" + " rail,rail,...;rail,rail,...\n" +#if 0 + "Where rail can be: unit:port-addr_index or unit\n" +#else + "Where rail can be: unit-addr_index or unit\n" +#endif + "unit can be device name or unit number\n" +#if 0 + "where :port is optional (default of 1)\n" +#endif + "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" + "When addr_index is omitted, it defaults to 'all'\n" + "When more than 1 set of rails is present (each set is separated by ;),\n" + "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" + " 1 - use local rank number to select\n" + " 2 - use local CPU NUMA to select\n" + "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", + PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, + (union psmi_envvar_val)"", + (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, + psm3_parse_check_multirail_map, &map_index, &env_multirail_map); + if (ret < 0) { // syntax error in input, ret error instead of using default + psmi_assert(0); // should not get here since specified FLAG_FATAL multirail_config->num_rails = 0; - return PSM2_OK; + return psm3_handle_error(PSMI_EP_NORETURN, + PSM2_EP_DEVICE_FAILURE, + "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", + env_multirail_map.e_str); } - - nic_info_filter_gpu_centric(nic_info, ninfo); - - // we will use all unfiltered units - - // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU - // selection, it will be called per rail and if rails are in - // different CPU NUMA could have an undesired impact - setenv("PSM3_NO_AFFINITY", "1", 1); - - psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); - return PSM2_OK; -} -#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ - -// for use in psm3_ep_multirail_autoselect so can sort rails -// by subnet and addr_index -struct rail_info { - psmi_subnet128_t subnet; - unsigned unit; - unsigned port; - unsigned addr_index; -}; - -static int cmpfunc(const void *p1, const void *p2) -{ - struct rail_info *a = ((struct rail_info *) p1); - struct rail_info *b = ((struct rail_info *) p2); - int ret; - - ret = psmi_subnet128_cmp(a->subnet, b->subnet); - if (ret == 
0) { - if (a->addr_index < b->addr_index) - return -1; - else if (a->addr_index > b->addr_index) - return 1; + if (! ret) { + // valid input + if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, + multirail_config) < 0) { + // already checked, shouldn't get parse errors nor empty strings + psmi_assert(0); + } + return PSM2_OK; } - return ret; + return PSM2_EP_NO_DEVICE; } // Multirail enabled, autoselect one or more NICs for this process -// multirail_mode is PSM3_MULTIRAIL selection (1=all NICs, 2=NUMA local NICs) +// multirail_mode is PSM3_MULTIRAIL selection +// (1=all NICs, 2=NUMA local NICs, 3=cpu centric, 4=gpu centric) static psm2_error_t psm3_ep_multirail_autoselect(int multirail_mode, struct multirail_config *multirail_config) { uint32_t num_units = 0; - psmi_subnet128_t subnet; - unsigned i, j, k, count = 0; - int ret; psm2_error_t err = PSM2_OK; - struct rail_info rail_info[PSMI_MAX_RAILS]; - int multirail_within_socket_used = 0; - int node_id = -1, found = 0; - - if (multirail_mode == 2) - multirail_within_socket_used = 1; - + unsigned ninfo; + struct nic_info nic_info[PSMI_MAX_RAILS]; if ((err = psm3_ep_num_devunits(&num_units))) { return err; @@ -1705,87 +1470,41 @@ psm3_ep_multirail_autoselect(int multirail_mode, num_units = PSMI_MAX_RAILS; } - if (multirail_mode == 3) - return psm3_ep_multirail_autoselect_cpu_centric(num_units, multirail_config); -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY - if (multirail_mode == 4) - return psm3_ep_multirail_autoselect_gpu_centric(num_units, multirail_config); -#endif - - /* - * PSM3_MULTIRAIL=2 functionality- - * - Try to find at least find one NIC in the same root - * complex. If none found, continue to run and - * use remaining NIC in the system. - * - If we do find at least one NIC in same root complex, we - * go ahead and add to list. - */ - if (multirail_within_socket_used) { - node_id = psm3_get_current_proc_location(); - for (i = 0; i < num_units; i++) { - if (psmi_hal_get_unit_active(i) <= 0) - continue; - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) { - if (node_id_i == node_id) { - found = 1; - break; - } - } - } + // enumerate addr_index too + ninfo = nic_info_init(nic_info, num_units, 1); + if (! ninfo) { + // caller will try single NIC selection next + multirail_config->num_rails = 0; + return PSM2_OK; } -/* - * Get all the ports and addr_index with a valid lid and gid, one port per unit. - * but up to PSM3_ADDR_PER_NIC addresses. 
If we are using the NUMA selection - * algorithm and found at list 1 NUMA local NIC above, limit the list to NUMA - * local NICs, otherwise list all NICs - */ - for (i = 0; i < num_units; i++) { - int node_id_i; - - if (!psmi_hal_get_node_id(i, &node_id_i)) - { - if (multirail_within_socket_used && - found && (node_id_i != node_id)) - continue; - } - - for (j = PSM3_NIC_MIN_PORT; j <= PSM3_NIC_MAX_PORT; j++) { - int got_port = 0; - for (k = 0; k < psm3_addr_per_nic; k++) { - ret = psmi_hal_get_port_lid(i, j, k); - if (ret <= 0) - continue; - ret = psmi_hal_get_port_subnet(i, j, k, &subnet, NULL, NULL, NULL); - if (ret == -1) - continue; - rail_info[count].subnet = subnet; - rail_info[count].unit = i; - rail_info[count].port = j; - rail_info[count].addr_index = k; - got_port = 1; - count++; - } - if (got_port) // one port per unit - break; - } + switch (multirail_mode) { + default: + case 1: + // we will use all active units + _HFI_DBG("No further NIC filtering\n"); + break; + case 2: + nic_info_filter_cpu_numa(nic_info, ninfo); + break; + case 3: + nic_info_filter_cpu_centric(nic_info, ninfo); + break; +#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY + case 4: + nic_info_filter_gpu_centric(nic_info, ninfo); + break; +#endif /* PSM_HAVE_GPU_CENTRIC_AFFINITY */ } -/* - * Sort all the ports within rail_info from small to big. - * This is for multiple fabrics, and we use fabric with the - * smallest subnet to make the master connection. - */ - qsort(rail_info, count, sizeof(rail_info[0]), cmpfunc); + // we will use all unfiltered units - for (i = 0; i < count; i++) { - multirail_config->units[i] = rail_info[i].unit; - multirail_config->ports[i] = rail_info[i].port; - multirail_config->addr_indexes[i] = rail_info[i].addr_index; - } - multirail_config->num_rails = count; + // ensure psm3_context_set_affinity doesn't unnecessarily narrow CPU + // selection, it will be called per rail and if rails are in + // different CPU NUMA could have an undesired impact + setenv("PSM3_NO_AFFINITY", "1", 1); + + psm3_copy_nic_info_to_multitrail_config(nic_info, ninfo, multirail_config); return PSM2_OK; } @@ -1793,7 +1512,8 @@ psm3_ep_multirail_autoselect(int multirail_mode, // list of unit/port/addr_index in multirail_config. // When multirail_config->num_rails is returned as 0, multirail is not enabled // and other mechanisms (PSM3_NIC, PSM3_NIC_SELECTION_ALG) must be -// used by the caller to select a single NIC for the process. +// used by the caller to select a single NIC for the process +// via psm3_autoselect_one(). // This can return num_rails==1 if exactly 1 NIC is to be used by this process // or num_rails>1 if this process is to stripe data across multiple NICs // in which case the 1st NIC in multirail_config should be used as the @@ -1801,10 +1521,7 @@ psm3_ep_multirail_autoselect(int multirail_mode, psm2_error_t psm3_ep_multirail(struct multirail_config *multirail_config) { - int ret; union psmi_envvar_val env_multirail; - union psmi_envvar_val env_multirail_map; - int map_index; psm3_getenv_range("PSM3_MULTIRAIL", "Control use of multiple NICs", @@ -1863,71 +1580,9 @@ psm3_ep_multirail(struct multirail_config *multirail_config) return PSM2_OK; } - if (env_multirail.e_int == 1 || env_multirail.e_int == 2) { - // TBD - move this code to a separate function - // for PSM3_MULTIRAIL=1 or 2, PSM3_MULTIRAIL_MAP can explicitly select NICs. 
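To tie the two new entry points together, the fragment below sketches the caller-side flow described in the comment above: try psm3_ep_multirail() first and, when it reports num_rails==0, fall back to single-NIC selection via psm3_autoselect_one(). This is illustrative only; the real call sites live elsewhere (e.g. the endpoint open path), and the helper name and its arguments are assumed. Likewise, a map such as PSM3_MULTIRAIL_MAP="irdma0-0,irdma1-0;irdma2-0,irdma3-0" (hypothetical device names) would bypass autoselection entirely per the help text above.

/* Sketch of the selection flow under the contract documented above. */
static psm2_error_t example_pick_nics(long addr_index, int nunits,
			psm2_uuid_t const job_key, struct multirail_config *mr)
{
	psm2_error_t err = psm3_ep_multirail(mr);
	if (err != PSM2_OK)
		return err;
	if (mr->num_rails == 0) {
		/* multirail not enabled: select a single NIC for this process */
		int unit = psm3_autoselect_one(addr_index, nunits, job_key);
		if (unit < 0)
			return PSM2_EP_DEVICE_FAILURE;
		/* caller then opens 'unit' on its first active port */
	} else {
		/* caller opens mr->units[0..num_rails-1] and stripes across them */
	}
	return PSM2_OK;
}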
- // We treat invalid input, such as bad syntax or selection of an unusable - // port (down/missing/etc), as a fatal error instead of attempting to run - // on the default PSM3_MULTIRAIL_MAP config. This helps avoid - // inconsistent NIC selections, especially for down ports, which may - // cause confusing behaviors or errors. - // If PSM3_MULTIRAIL_MAP contains multiple lists of NICs, then - // if PSM3_MULTIRAIL=1 - use local rank index (0, ...) to select - // if PSM3_MULTIRAIL=2 - use process NUMA (0, ...) to select - if (env_multirail.e_int == 1) { - map_index = psm3_get_mylocalrank(); - } else if (env_multirail.e_int == 2) { - map_index = psm3_get_current_proc_location(); - if (map_index < 0) { - return psm3_handle_error(PSMI_EP_NORETURN, - PSM2_EP_DEVICE_FAILURE, - "Unable to get NUMA location of current process\n"); - } - } else { - psmi_assert(0); - } - ret = psm3_getenv_range("PSM3_MULTIRAIL_MAP", - "Explicit NIC selections for each rail", - "Specified as:\n" - " rail,rail,...;rail,rail,...\n" -#if 0 - "Where rail can be: unit:port-addr_index or unit\n" -#else - "Where rail can be: unit-addr_index or unit\n" -#endif - "unit can be device name or unit number\n" -#if 0 - "where :port is optional (default of 1)\n" -#endif - "addr_index can be 0 to PSM3_ADDR_PER_NIC-1, or 'any' or 'all'\n" - "When addr_index is omitted, it defaults to 'all'\n" - "When more than 1 set of rails is present (each set is separated by ;),\n" - "the set to use for a given process is selected based on PSM3_MULTIRAIL.\n" - " 1 - use local rank number to select\n" - " 2 - use local CPU NUMA to select\n" - "When empty, PSM3 will autoselect NICs as controlled by PSM3_MULTIRAIL.", - PSMI_ENVVAR_LEVEL_USER|PSMI_ENVVAR_FLAG_FATAL, PSMI_ENVVAR_TYPE_STR, - (union psmi_envvar_val)"", - (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, - psm3_parse_check_multirail_map, &map_index, &env_multirail_map); - if (ret < 0) { // syntax error in input, ret error instead of using default - psmi_assert(0); // should not get here since specified FLAG_FATAL - multirail_config->num_rails = 0; - return psm3_handle_error(PSMI_EP_NORETURN, - PSM2_EP_DEVICE_FAILURE, - "Invalid value for PSM3_MULTIRAIL_MAP: '%s', can't proceed\n", - env_multirail_map.e_str); - } - if (! ret) { - // valid input - if (psm3_parse_multirail_map(env_multirail_map.e_str, map_index, 0, NULL, - multirail_config) < 0) { - // already checked, shouldn't get parse errors nor empty strings - psmi_assert(0); - } - return PSM2_OK; - } - } + // see if PSM3_MULTIRAIL_MAP is manually selecting NICs + if (psm3_ep_multirail_map(env_multirail.e_int, multirail_config) == PSM2_OK) + return PSM2_OK; // multirail enabled, automatically select 1 or more NICs return psm3_ep_multirail_autoselect(env_multirail.e_int, multirail_config); diff --git a/prov/psm3/psm3/psm_nic_select.h b/prov/psm3/psm3/psm_nic_select.h index cfd23ea1081..c69b52b0e83 100644 --- a/prov/psm3/psm3/psm_nic_select.h +++ b/prov/psm3/psm3/psm_nic_select.h @@ -60,29 +60,6 @@ #ifndef _PSM_NIC_SELECT_H #define _PSM_NIC_SELECT_H -// PSM3_NIC_SELECTION_ALG choices -/* - * round robin contexts across HFIs, then - * ports; this is the default. - * This option spreads the HFI selection within the local socket. - * If it is preferred to spread job over over entire set of - * HFIs within the system, see ALG_ACROSS_ALL below. 
- */ -#define PSMI_UNIT_SEL_ALG_ACROSS PSM_HAL_ALG_ACROSS - -#define PSMI_UNIT_SEL_ALG_ACROSS_ALL PSM_HAL_ALG_ACROSS_ALL - -/* - * use all contexts on an HFI (round robin - * active ports within), then next HFI - */ -#define PSMI_UNIT_SEL_ALG_WITHIN PSM_HAL_ALG_WITHIN - -#define PSMI_UNIT_SEL_ALG_CPU_CENTRIC PSM_HAL_ALG_CPU_CENTRIC -#ifdef PSM_HAVE_GPU_CENTRIC_AFFINITY -#define PSMI_UNIT_SEL_ALG_GPU_CENTRIC PSM_HAL_ALG_GPU_CENTRIC -#endif - struct multirail_config { int num_rails; uint32_t units[PSMI_MAX_RAILS]; @@ -90,18 +67,15 @@ struct multirail_config { int addr_indexes[PSMI_MAX_RAILS]; }; -// return set of units to consider and which to start at. -// caller will use 1st active unit which can be opened. -// caller will wrap around so it's valid for start >= end -// Note: When using multiple rails per PSM process, higher level code will -// walk through desired units and unit_param will specify a specific unit -// if unit_param is PSM3_NIC_ANY, this will pick starting point for nic search -psm2_error_t -psm3_compute_start_and_end_unit(long unit_param, long addr_index, - int nunitsactive,int nunits, - psm2_uuid_t const job_key, - long *unit_start,long *unit_end); +// Autoselect one unit for non-multirail operation. +// caller will select 1st active port and an addr_index within unit +// returns the unit number or -1 if unable to find an active unit +int +psm3_autoselect_one(long addr_index, int nunits, psm2_uuid_t const job_key); +// determine if PSM3_MULTIRAIL is enabled, and if so select the rails +// and place the list in multirail_config. If multirail is not enabled +// multirail_config.num_rails will be set to 0 psm2_error_t psm3_ep_multirail(struct multirail_config *multirail_config); diff --git a/prov/psm3/psm3/psm_oneapi_ze.c b/prov/psm3/psm3/psm_oneapi_ze.c deleted file mode 100644 index 2090fb68326..00000000000 --- a/prov/psm3/psm3/psm_oneapi_ze.c +++ /dev/null @@ -1,1040 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2021 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2021 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. 
- - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ -#ifdef PSM_ONEAPI -#include -#include -#include -#include -#include -#include -#include -#include -#include "psm_user.h" -#include "psm_mq_internal.h" -#include "ptl_am/psm_am_internal.h" -#include "psmi_wrappers.h" - -#ifndef PSM_HAVE_PIDFD -static int psm3_ze_dev_fds[MAX_ZE_DEVICES]; -int psm3_num_ze_dev_fds; -#endif -int psm3_oneapi_immed_sync_copy; -int psm3_oneapi_immed_async_copy; -unsigned psm3_oneapi_parallel_dtod_copy_thresh; - -const char* psmi_oneapi_ze_result_to_string(const ze_result_t result) { -#define ZE_RESULT_CASE(RES) case ZE_RESULT_##RES: return STRINGIFY(RES) - - switch (result) { - ZE_RESULT_CASE(SUCCESS); - ZE_RESULT_CASE(NOT_READY); - ZE_RESULT_CASE(ERROR_UNINITIALIZED); - ZE_RESULT_CASE(ERROR_DEVICE_LOST); - ZE_RESULT_CASE(ERROR_INVALID_ARGUMENT); - ZE_RESULT_CASE(ERROR_OUT_OF_HOST_MEMORY); - ZE_RESULT_CASE(ERROR_OUT_OF_DEVICE_MEMORY); - ZE_RESULT_CASE(ERROR_MODULE_BUILD_FAILURE); - ZE_RESULT_CASE(ERROR_INSUFFICIENT_PERMISSIONS); - ZE_RESULT_CASE(ERROR_NOT_AVAILABLE); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_VERSION); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_FEATURE); - ZE_RESULT_CASE(ERROR_INVALID_NULL_HANDLE); - ZE_RESULT_CASE(ERROR_HANDLE_OBJECT_IN_USE); - ZE_RESULT_CASE(ERROR_INVALID_NULL_POINTER); - ZE_RESULT_CASE(ERROR_INVALID_SIZE); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_SIZE); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_ALIGNMENT); - ZE_RESULT_CASE(ERROR_INVALID_SYNCHRONIZATION_OBJECT); - ZE_RESULT_CASE(ERROR_INVALID_ENUMERATION); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_ENUMERATION); - ZE_RESULT_CASE(ERROR_UNSUPPORTED_IMAGE_FORMAT); - ZE_RESULT_CASE(ERROR_INVALID_NATIVE_BINARY); - ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_NAME); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_NAME); - ZE_RESULT_CASE(ERROR_INVALID_FUNCTION_NAME); - ZE_RESULT_CASE(ERROR_INVALID_GROUP_SIZE_DIMENSION); - ZE_RESULT_CASE(ERROR_INVALID_GLOBAL_WIDTH_DIMENSION); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_INDEX); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ARGUMENT_SIZE); - ZE_RESULT_CASE(ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE); - ZE_RESULT_CASE(ERROR_INVALID_COMMAND_LIST_TYPE); - ZE_RESULT_CASE(ERROR_OVERLAPPING_REGIONS); - ZE_RESULT_CASE(ERROR_UNKNOWN); - default: - return "Unknown error"; - } - -#undef ZE_RESULT_CASE -} - -// when allocating bounce buffers either malloc w/Import or -// zeMemAllocHost can be used. zeMemAllocHost tends to perform -// better in the subsequent GPU copy's AppendMemoryCopy. However -// zeMemAllocHost results in a GPU-like address which requires dmabuf -// so we can't use zeMemAllocHost for DMA to/from the bounce buffer -// unless rv is available to handle GPU addresses (eg. 
PSM3_GPUDIRECT=1) - -void *psm3_oneapi_ze_host_alloc_malloc(unsigned size) -{ - void *ret_ptr = psmi_malloc(PSMI_EP_NONE, UNDEFINED, size); -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, ret_ptr, size); -#endif - return ret_ptr; -} - -void psm3_oneapi_ze_host_free_malloc(void *ptr) -{ -#ifndef PSM3_NO_ONEAPI_IMPORT - PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, ptr); -#endif - psmi_free(ptr); -} - -#ifndef PSM3_USE_ONEAPI_MALLOC -void *psm3_oneapi_ze_host_alloc_zemem(unsigned size) -{ - void *ret_ptr; - ze_host_mem_alloc_desc_t host_desc = { - .stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC, - .flags = ZE_MEMORY_ACCESS_CAP_FLAG_RW - }; - PSMI_ONEAPI_ZE_CALL(zeMemAllocHost, ze_context, - &host_desc, size, 8, &ret_ptr); - return ret_ptr; -} - -void psm3_oneapi_ze_host_free_zemem(void *ptr) -{ - PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, ptr); -} - -void *(*psm3_oneapi_ze_host_alloc)(unsigned size) = psm3_oneapi_ze_host_alloc_malloc; -void (*psm3_oneapi_ze_host_free)(void *ptr) = psm3_oneapi_ze_host_free_malloc; -int psm3_oneapi_ze_using_zemem_alloc = 0; -#endif /* PSM3_USE_ONEAPI_MALLOC */ - -// this is only called if GPU Direct is enabled in rv such that -// GDR Copy and/or RDMA MRs can provide GPU-like addresses to rv -void psm3_oneapi_ze_can_use_zemem() -{ -#ifndef PSM3_USE_ONEAPI_MALLOC - psm3_oneapi_ze_host_alloc = psm3_oneapi_ze_host_alloc_zemem; - psm3_oneapi_ze_host_free = psm3_oneapi_ze_host_free_zemem; - psm3_oneapi_ze_using_zemem_alloc = 1; -#endif -} - -// synchronous GPU memcpy -void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size) -{ - struct ze_dev_ctxt *ctxt; - - psmi_assert(size > 0); - ctxt = psmi_oneapi_dev_ctxt_get(dstptr); - if (!ctxt) { - ctxt = psmi_oneapi_dev_ctxt_get(srcptr); - if (!ctxt) { - _HFI_ERROR("dst %p src %p not GPU buf for copying\n", - dstptr, srcptr); - return; - } - } - if (psm3_oneapi_immed_sync_copy) { - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - } else { - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, - 1, &ctxt->cl, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); - } -} - -// synchronous GPU memcpy DTOD (xeLink) -void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, size_t size) -{ - struct ze_dev_ctxt *ctxt; - - psmi_assert(size > 0); - ctxt = psmi_oneapi_dev_ctxt_get(dstptr); - if (!ctxt) { - _HFI_ERROR("dst %p src %p not GPU buf for copying\n", - dstptr, srcptr); - return; - } - if (size <= psm3_oneapi_parallel_dtod_copy_thresh) { - if (psm3_oneapi_immed_sync_copy) { - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - } else { - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->cl, - dstptr, srcptr, size, NULL, 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->cl); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->cq, - 1, &ctxt->cl, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueSynchronize, ctxt->cq, UINT32_MAX); - } - } else { - // for large DTOD copies, start 2 parallel commands - // then wait for both - size_t size0 = ROUNDUP64P2(size/2, 64*1024); - size_t size1 = size - size0; - - if 
(psm3_oneapi_immed_sync_copy) { - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, - dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); - - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, - (void*)((uintptr_t)dstptr+size0), - (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, - 0, NULL); - } else { - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl0); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl0, - dstptr, srcptr, size0, ctxt->copy_status0, 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl0); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq0, - 1, &ctxt->async_cl0, NULL); - - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, ctxt->async_cl1); - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, ctxt->async_cl1, - (void*)((uintptr_t)dstptr+size0), - (void*)((uintptr_t)srcptr+size0), size1, ctxt->copy_status1, - 0, NULL); - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, ctxt->async_cl1); - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists, ctxt->async_cq1, - 1, &ctxt->async_cl1, NULL); - } - // 2nd copy may be slightly smaller so waity for it first so - // can potentially hide its Reset latency while 1st copy completes - PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status1, UINT32_MAX); - PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status1); - - PSMI_ONEAPI_ZE_CALL(zeEventHostSynchronize, ctxt->copy_status0, UINT32_MAX); - PSMI_ONEAPI_ZE_CALL(zeEventHostReset, ctxt->copy_status0); - } -} - -// for pipelined async GPU memcpy -// *p_cq is left as NULL when psm3_oneapi_immed_async_copy enabled -void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, - ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl) -{ - psmi_assert(! *p_cl); - if (psm3_oneapi_immed_async_copy) { - ze_command_queue_desc_t cq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - cq_desc.ordinal = ctxt->ordinal; - cq_desc.index = ctxt->index++; - ctxt->index %= ctxt->num_queues; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreateImmediate, - ze_context, ctxt->dev, &cq_desc, p_cl); - } else { - if (! *p_cq) { - ze_command_queue_desc_t cq_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - .flags = 0, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL - }; - cq_desc.ordinal = ctxt->ordinal; - cq_desc.index = ctxt->index++; - ctxt->index %= ctxt->num_queues; - PSMI_ONEAPI_ZE_CALL(zeCommandQueueCreate, - ze_context, ctxt->dev, &cq_desc, p_cq); - } - ze_command_list_desc_t cl_desc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_LIST_DESC, - .flags = 0 - }; - cl_desc.commandQueueGroupOrdinal = ctxt->ordinal; - PSMI_ONEAPI_ZE_CALL(zeCommandListCreate, - ze_context, ctxt->dev, &cl_desc, p_cl); - } -} - -#ifndef PSM_HAVE_PIDFD -/* - * psmi_ze_init_fds - initialize the file descriptors (ze_dev_fds) - * - * Open the file descriptors for our GPUs (psm3_ze_dev_fds[]) - * - * The file descriptors are used in intra-node communication to pass to peers - * via socket with sendmsg/recvmsg SCM_RIGHTS message type. 
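For reference, the SCM_RIGHTS handoff described above reduces to the minimal, self-contained sketch below (illustration only, not code from this change; example_send_fd is an invented name). The kernel duplicates the descriptor into the receiving process, which is how each peer ends up with its own usable copy of the GPU dev FDs:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Send one fd over a connected AF_UNIX socket using an SCM_RIGHTS
 * control message; psmi_sendmsg_fds() does the equivalent for an
 * array of GPU dev FDs with the local epid as the regular payload. */
static int example_send_fd(int sock, int fd)
{
	char payload = 'F';			/* any non-empty payload */
	char ctrl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	memset(ctrl, 0, sizeof(ctrl));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl;
	msg.msg_controllen = sizeof(ctrl);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;		/* kernel dups the fd into the peer */
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));

	return sendmsg(sock, &msg, 0);		/* <0 on error, else bytes sent */
}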
- * - */ - -psm2_error_t psm3_ze_init_fds(void) -{ - const char *dev_dir = "/dev/dri/by-path/"; - const char *suffix = "-render"; - DIR *dir; - struct dirent *ent = NULL; - char dev_name[NAME_MAX]; - int i = 0, ret; - - if (psm3_num_ze_dev_fds) - return PSM2_OK; - - dir = opendir(dev_dir); - if (dir == NULL) - return PSM2_INTERNAL_ERR; - - while ((ent = readdir(dir)) != NULL) { - if (ent->d_name[0] == '.' || - strstr(ent->d_name, suffix) == NULL) - continue; - - memset(dev_name, 0, sizeof(dev_name)); - ret = snprintf(dev_name, NAME_MAX, "%s%s", dev_dir, ent->d_name); - if (ret < 0 || ret >= NAME_MAX) { - _HFI_INFO("GPU dev name too long: %s%s\n", dev_dir, ent->d_name); - goto err; - } - - psm3_ze_dev_fds[i] = open(dev_name, O_RDWR); - if (psm3_ze_dev_fds[i] == -1) { - _HFI_INFO("Failed to open %s GPU dev FD: %s\n", dev_name, - strerror(errno)); - goto err; - } - _HFI_DBG("Opened %s GPU dev FD: %d\n", dev_name, - psm3_ze_dev_fds[i]); - i++; - psm3_num_ze_dev_fds++; - } - (void) closedir(dir); - _HFI_DBG("Opened %d GPU dev FDs\n", psm3_num_ze_dev_fds); - return PSM2_OK; - -err: - (void) closedir(dir); - return PSM2_INTERNAL_ERR; -} - -/* - * psmi_ze_get_dev_fds - fetch device file descriptors - * - * Returns a pointer to ze_dev_fds while putting the number - * of fds into the in/out nfds parameter - * - */ - -int *psm3_ze_get_dev_fds(int *nfds) -{ - *nfds = psm3_num_ze_dev_fds; - return psm3_ze_dev_fds; -} - -/* - * psmi_sendmsg_fds - send device file descriptors over socket w/ sendmsg - * - * Prepares message of type SCM_RIGHTS, copies file descriptors as payload, - * and sends over socket via sendmsg while creating appropriate fd numbers - * for dest (effectively a dup(2) of our file descriptor) - * - * returns -errno on error or number of bytes sent (>0) on success - */ - -static int psmi_sendmsg_fds(int sock, int *fds, int nfds, psm2_epid_t epid) -{ - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int64_t peer_id = *(int64_t *)&epid; - char *ctrl_buf; - size_t ctrl_size; - int ret; - - ctrl_size = sizeof(*fds) * nfds; - ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); - if (!ctrl_buf) - return -ENOMEM; - - iov.iov_base = &peer_id; - iov.iov_len = sizeof(peer_id); - - memset(&msg, 0, sizeof(msg)); - msg.msg_control = ctrl_buf; - msg.msg_controllen = CMSG_SPACE(ctrl_size); - - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - cmsg = CMSG_FIRSTHDR(&msg); - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - cmsg->cmsg_len = CMSG_LEN(ctrl_size); - memcpy(CMSG_DATA(cmsg), fds, ctrl_size); - - ret = sendmsg(sock, &msg, 0); - if (ret < 0) - ret = -errno; - else if (! ret) - ret = -EAGAIN; - - psmi_free(ctrl_buf); - return ret; -} - -/* - * psmi_recvmsg_fds - receive device file descriptors from socket w/ recvmsg - * - * Prepares message buffer of type SCM_RIGHTS, receives message from socket - * via recvmsg, and copies device file descriptors to in/out parameter. 
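The matching receive side, again only as a standalone sketch with an invented name (psmi_recvmsg_fd handles a whole array of FDs and cross-checks the sender's epid). The descriptors delivered through CMSG_DATA are new fds in the receiving process, which is why the surrounding code notes they must be closed when no longer needed:

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one fd sent with SCM_RIGHTS; *fd_out becomes a new descriptor
 * in this process and must eventually be closed by the caller. */
static int example_recv_fd(int sock, int *fd_out)
{
	char payload;
	char ctrl[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctrl;
	msg.msg_controllen = sizeof(ctrl);

	if (recvmsg(sock, &msg, 0) <= 0)
		return -1;
	cmsg = CMSG_FIRSTHDR(&msg);
	if (!cmsg || cmsg->cmsg_level != SOL_SOCKET ||
	    cmsg->cmsg_type != SCM_RIGHTS ||
	    cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
		return -1;			/* truncated or unexpected message */
	memcpy(fd_out, CMSG_DATA(cmsg), sizeof(int));
	return 0;
}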
- * The received file descriptors are usable in our process and need to - * be closed when done being used - * - * returns -errno on error or number of bytes received (>0) on success - */ - -static int psmi_recvmsg_fd(int sock, int *fds, int nfds, psm2_epid_t epid) -{ - struct msghdr msg; - struct cmsghdr *cmsg; - struct iovec iov; - int64_t peer_id = *(int64_t *)&epid; - char *ctrl_buf; - size_t ctrl_size; - int ret; - - ctrl_size = sizeof(*fds) * nfds; - ctrl_buf = (char *)psmi_calloc(NULL, UNDEFINED, 1, CMSG_SPACE(ctrl_size)); - if (!ctrl_buf) - return -ENOMEM; - - iov.iov_base = &peer_id; - iov.iov_len = sizeof(peer_id); - - memset(&msg, 0, sizeof(msg)); - msg.msg_control = ctrl_buf; - msg.msg_controllen = CMSG_SPACE(ctrl_size); - msg.msg_iov = &iov; - msg.msg_iovlen = 1; - - ret = recvmsg(sock, &msg, 0); - if (ret < 0) { - ret = -errno; - } else if (ret != sizeof(peer_id)) { - _HFI_CONNDBG("recvmsg from: %s returns %d expect %u\n", - psm3_epid_fmt_addr(epid, 0), ret, - (unsigned)sizeof(peer_id) ); - ret = -EAGAIN; - goto out; - } - - psmi_assert(!(msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))); - cmsg = CMSG_FIRSTHDR(&msg); - psmi_assert(cmsg && cmsg->cmsg_len == CMSG_LEN(ctrl_size) && - cmsg->cmsg_level == SOL_SOCKET && - cmsg->cmsg_type == SCM_RIGHTS && CMSG_DATA(cmsg)); - memcpy(fds, CMSG_DATA(cmsg), ctrl_size); -out: - psmi_free(ctrl_buf); - return ret; -} - -/* - * psm3_ze_init_ipc_socket - initialize ipc socket in ep - * - * Set up the AF_UNIX ipc socket in the ep for listen mode. Name it - * using our epid, and bind it. - * - */ - -psm2_error_t psm3_ze_init_ipc_socket(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - psm2_error_t err = PSM2_OK; - int ret; - struct sockaddr_un sockaddr = {0}; - socklen_t len = sizeof(sockaddr); - - if ((ptl->ep->ze_ipc_socket = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - - sockaddr.sun_family = AF_UNIX; - snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", - (long int) getuid(), psm3_epid_fmt_internal(ptl->epid, 0)); - ptl->ep->listen_sockname = psmi_strdup(NULL, sockaddr.sun_path); - if (ptl->ep->listen_sockname == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - if ((ret = bind(ptl->ep->ze_ipc_socket, (struct sockaddr *) &sockaddr, len)) < 0) { - _HFI_ERROR("error binding GPU dev FDs AF_UNIX sock to %s: %s\n", - sockaddr.sun_path, strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - - if ((ret = listen(ptl->ep->ze_ipc_socket, 256)) < 0) { - _HFI_ERROR("error listening on GPU dev FDs AF_UNIX sock %s: %s\n", - sockaddr.sun_path, strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - return PSM2_OK; - -fail: - if (ptl->ep->ze_ipc_socket >= 0) - close(ptl->ep->ze_ipc_socket); - ptl->ep->ze_ipc_socket = -1; - if (ptl->ep->listen_sockname) - psmi_free(ptl->ep->listen_sockname); - ptl->ep->listen_sockname = NULL; - return err; -} - -/* - * psm3_receive_ze_dev_fds - receive the dev fds on the listen socket - * - * Set up the listen socket to be polled for POLLIN. When the event is - * received, accept for the new socket and then read the peer epid, - * and locate the epaddr for it. Then receive the dev fds to be stored - * in the am_epaddr. 
- * - * returns: - * PSM_OK - GPU dev FDs received from a peer - * PSM2_OK_NO_PROGRESS - nothing received - * other - error - */ - -static psm2_error_t psm3_receive_ze_dev_fds(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - psm2_error_t err = PSM2_OK; - struct pollfd fdset; - int newsock = -1; - - fdset.fd = ptl->ep->ze_ipc_socket; - fdset.events = POLLIN; - - if (poll(&fdset, 1, 0) <= 0) - return PSM2_OK_NO_PROGRESS; - - { - struct sockaddr_un sockaddr = {0}; - socklen_t len = sizeof(sockaddr); - int nfds = psm3_num_ze_dev_fds; - int nread; - psm2_epid_t epid; - psm2_epaddr_t epaddr; - am_epaddr_t *am_epaddr; - - newsock = accept(ptl->ep->ze_ipc_socket, (struct sockaddr *)&sockaddr, &len); - if (newsock < 0) { - _HFI_ERROR("GPU dev FDs AF_UNIX accept failed: %s\n", - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } else { - int ret; - // technically we could get less than we asked for and need to - // call recv again in future but our transfers are small enough - // we should get it all - if ((nread = recv(newsock, &epid, sizeof(epid), 0)) < 0) { - _HFI_ERROR("GPU dev FDs AF_UNIX recv failed: %s\n", - strerror(errno)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - if (nread != sizeof(epid)) { - _HFI_ERROR("GPU dev FDs AF_UNIX recv incomplete: %d\n", nread); - err = PSM2_INTERNAL_ERR; - goto fail; - } - // we only poll for recv FDs after processing a am_shm connect - // so the epid should always be known - if ((epaddr = psm3_epid_lookup(ptl->ep, epid)) == NULL) { - _HFI_ERROR("Peer Unknown, unable to receive GPU dev FDs from: %s\n", - psm3_epid_fmt_addr(epid, 0)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - am_epaddr = (am_epaddr_t *)epaddr; - am_epaddr->num_peer_fds = nfds; - ret = psmi_recvmsg_fd(newsock, am_epaddr->peer_fds, nfds, ptl->epid); - if (ret <= 0) { - _HFI_ERROR("Unable to recvmsg %d GPU dev FDs from: %s: %s\n", - nfds, psm3_epid_fmt_addr(epid, 0), - strerror(-ret)); - err = PSM2_INTERNAL_ERR; - goto fail; - } - _HFI_CONNDBG("%d GPU dev FDs Received from: %s\n", - nfds, psm3_epid_fmt_addr(epid, 0)); - } - } - -fail: - if (newsock >= 0) - close(newsock); - return err; -} - -/* - * psm3_send_dev_fds - do next step to send the dev fds to the peer's - * listen socket - * - * Check the connected state and proceed accordingly: - * - ZE_SOCK_NOT_CONNECTED - * We have not done anything yet, so connect and send our epid, - * followed by the dev fds. Set state to ZE_SOCK_DEV_FDS_SENT - * - ZE_SOCK_DEV_FDS_SENT - * The dev fds have been sent. Issue ioctl to see if the output - * queue has been emptied indicating that the peer has read the data. - * If so, set state to ZE_SOCK_DEV_FDS_SENT_AND_RECD. - * - ZE_SOCK_DEV_FDS_SENT_AND_RECD - * We are done, just return. 
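The "has the peer read our FDs yet" test in the ZE_SOCK_DEV_FDS_SENT state above is a SIOCOUTQ query, which reports how many sent bytes are still sitting unread in the socket's output queue. A minimal sketch of that check, for illustration only (example_peer_drained is an invented name):

#include <sys/ioctl.h>
#include <linux/sockios.h>

/* Returns 1 once the peer has drained everything we sent on the socket,
 * 0 if bytes are still pending, -1 on an ioctl error. */
static int example_peer_drained(int sock)
{
	int pending = 0;

	if (ioctl(sock, SIOCOUTQ, &pending) != 0)
		return -1;
	return pending == 0;
}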
- * - * returns: - * PSM_OK - next step completed - * PSM2_OK_NO_PROGRESS - nothing to do - * other - error - */ - -psm2_error_t psm3_send_dev_fds(ptl_t *ptl_gen, psm2_epaddr_t epaddr) -{ - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - - switch (am_epaddr->sock_connected_state) { - case ZE_SOCK_DEV_FDS_SENT_AND_RECD: - return PSM2_OK_NO_PROGRESS; - break; - - case ZE_SOCK_DEV_FDS_SENT: - { - int pending; - - psmi_assert(am_epaddr->sock >= 0); - if_pf (ioctl(am_epaddr->sock, SIOCOUTQ, &pending) != 0) { - return psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "error sending dev FDs: %s\n", strerror(errno)); - } - if (pending == 0) { - am_epaddr->sock_connected_state = ZE_SOCK_DEV_FDS_SENT_AND_RECD; - _HFI_CONNDBG("GPU dev FDs Send Completed to: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0)); - close(am_epaddr->sock); - am_epaddr->sock = -1; - return PSM2_OK; - } - // be paranoid just in case 1st call to send_dev_fds for given - // epaddr gets here - if (! ((struct ptl_am *)ptl_gen)->ep->need_dev_fds_poll) - _HFI_CONNDBG("restart GPU dev FDs poll\n"); - ((struct ptl_am *)ptl_gen)->ep->need_dev_fds_poll = 1; - return PSM2_OK_NO_PROGRESS; - break; - } - - case ZE_SOCK_NOT_CONNECTED: - { - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - struct sockaddr_un sockaddr = {0}; - socklen_t len = sizeof(sockaddr); - psm2_epid_t peer_epid = epaddr->epid; - int *fds, nfds; - - if (!ptl->ep->need_dev_fds_poll) - _HFI_CONNDBG("restart GPU dev FDs poll\n"); - ptl->ep->need_dev_fds_poll = 1; - - fds = psm3_ze_get_dev_fds(&nfds); - - if ((am_epaddr->sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) { - _HFI_ERROR("error creating GPU dev FDs AF_UNIX sock: %s\n", - strerror(errno)); - goto fail; - } - - sockaddr.sun_family = AF_UNIX; - snprintf(sockaddr.sun_path, 108, "/dev/shm/psm3_shm.ze_sock2.%ld.%s", - (long int) getuid(), psm3_epid_fmt_internal(peer_epid, 0)); - - if (connect(am_epaddr->sock, (struct sockaddr *) &sockaddr, len) < 0) { - _HFI_ERROR("GPU dev FDs connect to %s (via %s) failed: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path, strerror(errno)); - goto fail; - } else { - int ret; - ret = send(am_epaddr->sock, &ptl->epid, sizeof(ptl->epid), 0); - if (ret < 0) { - _HFI_ERROR("GPU dev FDs send to %s (via %s) failed: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path, strerror(errno)); - goto fail; - } - - ret = psmi_sendmsg_fds(am_epaddr->sock, fds, nfds, peer_epid); - if (ret <= 0) { - /* ret is -errno */ - _HFI_ERROR("GPU dev FDs sendmsg to %s (via %s) failed: %s\n", - psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path, strerror(-ret)); - goto fail; - } - am_epaddr->sock_connected_state = ZE_SOCK_DEV_FDS_SENT; - _HFI_CONNDBG("%d GPU dev FDs Posted Send to: %s (via %s)\n", - nfds, psm3_epid_fmt_addr(epaddr->epid, 0), - sockaddr.sun_path); - return PSM2_OK; - } - /* NOTREACHED */ - break; - } - - default: - return PSM2_INTERNAL_ERR; - break; - } - /* NOTREACHED */ - return PSM2_INTERNAL_ERR; - -fail: - if (am_epaddr->sock >= 0) - close(am_epaddr->sock); - am_epaddr->sock = -1; - return PSM2_INTERNAL_ERR; -} - -// simple test if dev_fds bi-dir exchange completed for given epaddr -// 1 = yes, 0 = no -static -int psm3_dev_fds_exchanged(psm2_epaddr_t epaddr) -{ - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - return (am_epaddr->sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD - && am_epaddr->num_peer_fds) ; -} - -/* - * psm3_check_dev_fds_exchanged - check that dev fds have been bi-dir exchanged - * with given peer. 
Poll to try and move forward as needed. - * - * connect state ZE_SOCK_DEV_FDS_SENT_AND_RECD indicates peer has received - * our send of dev_fds - * - * num_peer_fds indicates if we received peer's fds. - * - * if both are satisfied, exchange is complete, return PSM2_OK - * - *Returns: - * PSM2_OK - both are done - * PSM2_OK_NO_PROGRESS - more work needed - * other - error - */ -psm2_error_t psm3_check_dev_fds_exchanged(ptl_t *ptl_gen, psm2_epaddr_t epaddr) -{ - psm2_error_t err; - psm2_error_t ret; - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - - psmi_assert(epaddr); - psmi_assert(! psm3_epid_zero_internal(epaddr->epid)); - - if (psm3_dev_fds_exchanged(epaddr)) - return PSM2_OK; - - if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED - && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) - return PSM2_OK_NO_PROGRESS; - - // try to move forward 1 step - err = psm3_send_dev_fds(ptl_gen, epaddr); - if (am_epaddr->sock_connected_state == ZE_SOCK_DEV_FDS_SENT_AND_RECD) - err = PSM2_OK; - else /* err will be NO_PROGRESS or worse */ - err = psm3_error_cmp(err, PSM2_OK_NO_PROGRESS); - - // only poll recv if we need to - ret = PSM2_OK_NO_PROGRESS; // keep KW happy - if (am_epaddr->num_peer_fds == 0) - ret = psm3_receive_ze_dev_fds(ptl_gen); - if (am_epaddr->num_peer_fds) - ret = PSM2_OK; - - /* worst err, NO_PROGRESS is worse than PSM2_OK */ - return psm3_error_cmp(ret, err); -} - -/* - * psm3_poll_dev_fds_exchanged - poll to make forward progress on - * GPU dev FDs exchange - * - * Loop through the epaddrs in am_ep and check_dev_fds_exchanged - * - * Returns: - * PSM2_OK - we found some work to do and made progress - * PSM2_OK_NO_PROGRESS - didn't find anything to do - * other - error - */ - -psm2_error_t psm3_poll_dev_fds_exchange(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - psm2_error_t err = PSM2_OK_NO_PROGRESS; - psm2_error_t ret; - int i; - int num_left = 0; - - err = psm3_receive_ze_dev_fds(ptl_gen); - - for (i = 0; i <= ptl->max_ep_idx; i++) { - am_epaddr_t *am_epaddr = (am_epaddr_t *)ptl->am_ep[i].epaddr; - - if (!am_epaddr || psm3_epid_zero_internal(ptl->am_ep[i].epid)) - continue; - - if (psm3_dev_fds_exchanged(&am_epaddr->epaddr)) - continue; - - num_left++; // causes one extra poll if complete now below, but no harm - - // don't try if uni-dir REQ/REP is incomplete - if (am_epaddr->cstate_outgoing != AMSH_CSTATE_OUTGOING_ESTABLISHED - && am_epaddr->cstate_incoming != AMSH_CSTATE_INCOMING_ESTABLISHED) - continue; - - // try to move forward 1 step - ret = psm3_send_dev_fds(ptl_gen, &am_epaddr->epaddr); - if (ret > PSM2_OK_NO_PROGRESS) - err = psm3_error_cmp(ret, err); - else if (ret == PSM2_OK && err == PSM2_OK_NO_PROGRESS) - err = ret; - } - if (num_left == 0 && ptl->ep->need_dev_fds_poll) - _HFI_CONNDBG("stop GPU dev FDs poll\n"); - ptl->ep->need_dev_fds_poll = (num_left != 0); - - return err; -} - -psm2_error_t psm3_sock_detach(ptl_t *ptl_gen) -{ - struct ptl_am *ptl = (struct ptl_am *)ptl_gen; - - if (ptl->ep->ze_ipc_socket >= 0) - close(ptl->ep->ze_ipc_socket); - ptl->ep->ze_ipc_socket = -1; - if (ptl->ep->listen_sockname) { - unlink(ptl->ep->listen_sockname); - psmi_free(ptl->ep->listen_sockname); - } - ptl->ep->listen_sockname = NULL; - return PSM2_OK; -} -#endif /* not PSM_HAVE_PIDFD */ - -#ifndef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -static int psm3_ipc_handle_cached(const void *buf, - ze_ipc_mem_handle_t ipc_handle) -{ - static int first = 1; - static int cached = 0; - ze_ipc_mem_handle_t tmp_ipc_handle; - int tmp_fd; - - 
/* Only detect the first time */ - if (!first) - return cached; - - PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, ze_context, - buf, &tmp_ipc_handle); - tmp_fd = *(uint32_t *)tmp_ipc_handle.data; - if (tmp_fd == *(uint32_t *)ipc_handle.data) - cached = 1; - else - close(tmp_fd); - - first = 0; - _HFI_VDBG("fd %u tmp_fd %d cached %d\n", *(uint32_t *)ipc_handle.data, - tmp_fd, cached); - - return cached; -} -#endif - -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -// queue for delayed Put to get better GetIpcHandle performance -// while having an upper bound on number of active Ipc Handles -// sized based on PSM3_ONEAPI_PUTQUEUE_SIZE -struct { - psmi_lock_t lock; - struct oneapi_handle_array { - uint8_t valid; - ze_ipc_mem_handle_t ipc_handle; - } *array; - unsigned index; // where to add next entry and remove oldest - int size; // number of slots in queue, -1 disables put -} psm3_oneapi_putqueue; -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ - -psm2_error_t psmi_oneapi_putqueue_alloc(void) -{ -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - union psmi_envvar_val env; - psm3_getenv("PSM3_ONEAPI_PUTQUEUE_SIZE", - "How many Ipc Handle Puts to queue for shm send and nic Direct GPU Access [-1 disables Put, 0 disables queue]", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)ONEAPI_PUTQUEUE_SIZE, &env); - _HFI_DBG("OneApi PutQueue Size=%d\n", env.e_int); - psm3_oneapi_putqueue.size = env.e_int; - if (env.e_int > 0) { - psm3_oneapi_putqueue.array = (struct oneapi_handle_array *)psmi_calloc( - PSMI_EP_NONE, UNDEFINED, env.e_int, - sizeof(*psm3_oneapi_putqueue.array)); - if (! psm3_oneapi_putqueue.array) - return PSM2_NO_MEMORY; - psm3_oneapi_putqueue.index = 0; - psmi_init_lock(&psm3_oneapi_putqueue.lock); - } -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ - return PSM2_OK; -} - -void psm3_put_ipc_handle(const void *buf, ze_ipc_mem_handle_t ipc_handle) -{ -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - if (! 
psm3_oneapi_putqueue.array) { // queue disabled - if (psm3_oneapi_putqueue.size >= 0) // negative size disables Put - PSMI_ONEAPI_ZE_CALL(zeMemPutIpcHandle, ze_context, ipc_handle); - return; - } - PSMI_LOCK(psm3_oneapi_putqueue.lock); - if (psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index].valid) { - // Put the oldest one to make room for new entry - ze_ipc_mem_handle_t tmp_ipc_handle = - psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index].ipc_handle; - PSMI_ONEAPI_ZE_CALL(zeMemPutIpcHandle, ze_context, tmp_ipc_handle); - } - // queue the new one - psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index].valid = 1; - psm3_oneapi_putqueue.array[psm3_oneapi_putqueue.index++].ipc_handle = ipc_handle; - psm3_oneapi_putqueue.index %= psm3_oneapi_putqueue.size; - PSMI_UNLOCK(psm3_oneapi_putqueue.lock); -#else /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ - // for older Agama with handle "cache" but no reference counting - // no way to put handle without affecting all IOs using that buffer - // on ATS w/o Agama handle cache, no benefit to holding onto fd so close - if (!psm3_ipc_handle_cached(buf, ipc_handle)) - close(*(uint32_t *)ipc_handle.data); -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ -} - -void psmi_oneapi_putqueue_free(void) -{ -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -#if 0 // we are shutting down, so don't worry about Putting the queued handles - int i; - - // no need for lock, destroying object, no more callers - for (i=0; i < psm3_oneapi_putqueue.size; i++) { - if (psm3_oneapi_putqueue.array[i].valid) { - ze_ipc_mem_handle_t ipc_handle = psm3_oneapi_putqueue.array[i].ipc_handle; - PSMI_ONEAPI_ZE_CALL(zeMemPutIpcHandle, ze_context, ipc_handle); - } - } -#endif /* 0 */ - if (psm3_oneapi_putqueue.array) { - psmi_free(psm3_oneapi_putqueue.array); - psm3_oneapi_putqueue.array = NULL; - psmi_destroy_lock(&psm3_oneapi_putqueue.lock); - } -#endif /* PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE */ -} - -/* - * get OneAPI alloc_id for a GPU address - * - * The address should be part of a buffer allocated from an OneAPI - * library call (zeMemAllocDevice() or zeMemAllocHost()). - * The alloc_id changes on each OneAPI allocation call. PSM3/rv uses the - * alloc_id to determine if a cache hit is a potentially stale entry which - * should be invalidated. - */ -uint64_t psm3_oneapi_ze_get_alloc_id(void *addr, uint8_t *type) -{ - ze_memory_allocation_properties_t mem_props = { - .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES - }; - ze_device_handle_t device; - - PSMI_ONEAPI_ZE_CALL(zeMemGetAllocProperties, ze_context, - addr, &mem_props, &device); - if (type) - *type = (uint8_t)mem_props.type; - /* - * id is unique across all allocates on all devices within a given - * process - */ - return mem_props.id; -} - -#endif // PSM_ONEAPI diff --git a/prov/psm3/psm3/psm_rndv_mod.c b/prov/psm3/psm3/psm_rndv_mod.c index 1daa81f5c2c..b754320f7d2 100644 --- a/prov/psm3/psm3/psm_rndv_mod.c +++ b/prov/psm3/psm3/psm_rndv_mod.c @@ -102,154 +102,6 @@ struct irdma_mem_reg_req { //#define my_calloc(nmemb, size) (psmi_calloc(PSMI_EP_NONE, NETWORK_BUFFERS, (nmemb), (size))) #define my_free(p) (psmi_free(p)) -#ifdef PSM_CUDA -static int gpu_pin_check; // PSM3_GPU_PIN_CHECK -static uint64_t *gpu_bars; -static int num_gpu_bars = 0; -static uint64_t min_gpu_bar_size; - -// The second BAR address is where the GPU will map GPUDirect memory. -// The beginning of this BAR is reserved for non-GPUDirect uses. 
-// However, it has been observed that in some multi-process -// pinning failures, HED-2035, the nvidia_p2p_get_pages can foul up -// it's IOMMU after which the next successful pin will incorrectly -// return the 1st physical address of the BAR for the pinned pages. -// In this case it will report this same physical address for other GPU virtual -// addresses and cause RDMA to use the wrong memory. -// As a workaround, we gather the Region 1 BAR address start for each -// GPU and if we see this address returned as the phys_addr of a mmapped -// GPUDirect Copy or the iova of a GPU MR we fail the job before it can -// corrupt any more application data. -static uint64_t get_nvidia_bar_addr(int domain, int bus, int slot) -{ - char sysfs[100]; - int ret; - FILE *f; - unsigned long long start_addr, end_addr, bar_size; - - ret = snprintf(sysfs, sizeof(sysfs), - "/sys/class/pci_bus/%04x:%02x/device/%04x:%02x:%02x.0/resource", - domain, bus, domain, bus, slot); - psmi_assert_always(ret < sizeof(sysfs)); - f = fopen(sysfs, "r"); - if (! f) { - if (gpu_pin_check) { - _HFI_ERROR("Unable to open %s for GPU BAR Address: %s\n", - sysfs, strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to get GPU BAR address\n"); - } - return 0; - } - // for each BAR region, start, end and flags are listed in hex - // nVidia uses the 2nd BAR region (aka Region #1) to map peer to peer - // accesses into it's potentially larger GPU local memory space - ret = fscanf(f, "%*x %*x %*x %llx %llx", &start_addr, &end_addr); - if (ret != 2) { - if (gpu_pin_check) { - _HFI_ERROR("Unable to get GPU BAR Address from %s: %s\n", - sysfs, strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Unable to get GPU BAR address\n"); - } - fclose(f); - return 0; - } - fclose(f); - - bar_size = (end_addr - start_addr) + 1; - _HFI_DBG("GPU BAR Addr from %s is 0x%llx - 0x%llx (size 0x%llx)\n", sysfs, start_addr, end_addr, bar_size); - if (! min_gpu_bar_size || bar_size < min_gpu_bar_size) - min_gpu_bar_size = bar_size; - return start_addr; -} - -void psm2_get_gpu_bars(void) -{ - int num_devices, dev; - union psmi_envvar_val env; - - psm3_getenv("PSM3_GPU_PIN_CHECK", - "Enable sanity check of physical addresses mapped into GPU BAR space (Enabled by default)", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)1, &env); - gpu_pin_check = env.e_int; - - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - gpu_bars = psmi_calloc(PSMI_EP_NONE, UNDEFINED, num_devices, sizeof(gpu_bars[0])); - if (! gpu_bars) - return; // psmi_calloc will have exited for Out of Memory - - if (gpu_pin_check) - num_gpu_bars = num_devices; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - int domain, bus, slot; - - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &domain, - CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &bus, - CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, - device); - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &slot, - CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, - device); - gpu_bars[dev] = get_nvidia_bar_addr(domain, bus, slot); - } -} - -static psm2_error_t psm2_check_phys_addr(uint64_t phys_addr) -{ - int i; - for (i=0; i < num_gpu_bars; i++) { - if (phys_addr == gpu_bars[i]) { - _HFI_ERROR("Incorrect Physical Address (0x%"PRIx64") returned by nVidia driver. PSM3 exiting to avoid data corruption. 
Job may be rerun with PSM3_GPUDIRECT=0 to avoid this issue.\n", - phys_addr); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Incorrect Physical Address returned by nVidia driver\n"); - psmi_assert_always(0); - return PSM2_INTERNAL_ERR; - } - } - return PSM2_OK; -} -#endif - -#ifdef PSM_ONEAPI -// PSM3_RV_GPU_IGNORE_ALLOC_ID allows internal testing -// =0 -> default, alloc_id used to identify new buffers which have same -// virt addr as an existing cache entry. In which case a cache miss -// and invalidation of the old cache entry occurs. -// =1 -> an alloc_id of 0 is always used. This has been demonstrated to -// cause false cache hits which can lead to landing data in safe but -// incorrect pages. Useful only for development experiments and tests. -// =2 -> for cache miss performance testing. This will use a different alloc_id -// per IO which will force cache invalidation on every IO. So no -// MR/mmap cache hits will occur, but all the normal MR handling will -// occur just as if there was a miss when running in normal mode -static int ignore_alloc_id; // PSM3_RV_GPU_IGNORE_ALLOC_ID -static uint64_t fake_alloc_id; // for when PSM3_RV_GPU_IGNORE_ALLOC_ID==2 -#endif - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -uint64_t psm3_min_gpu_bar_size(void) -{ -#ifdef PSM_ONEAPI - // not yet implemented - // psmi_assert_always(0); - return 0; -#else - return min_gpu_bar_size; -#endif -} -#endif - static int rv_map_event_ring(psm3_rv_t rv, struct rv_event_ring* ring, int entries, int offset) { @@ -309,15 +161,6 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) int ret; int save_errno; -#ifdef PSM_ONEAPI - union psmi_envvar_val env; - - psm3_getenv("PSM3_RV_GPU_IGNORE_ALLOC_ID", - "Disable use of alloc_id to identify GPU MRs to invalidate in RV GPU cache. 1=ignore, 2=use fake id to get 100% miss", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_INT, - (union psmi_envvar_val)0, &env); - ignore_alloc_id = env.e_int; -#endif loc_info->capability = 0; rv = (psm3_rv_t)my_calloc(1, sizeof(struct psm2_rv)); if (! 
rv) { @@ -340,20 +183,15 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_USER) qparams.capability |= RV_CAP_USER_MR; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU qparams.gpu_major_rev = RV_GPU_ABI_VER_MAJOR; qparams.gpu_minor_rev = RV_GPU_ABI_VER_MINOR; if ((loc_info->rdma_mode & RV_RDMA_MODE_GPU) || (loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) { - qparams.capability |= RV_CAP_GPU_DIRECT | RV_CAP_EVICT; -#ifdef PSM_CUDA - qparams.capability |= RV_CAP_NVIDIA_GPU; -#endif -#ifdef PSM_ONEAPI - qparams.capability |= RV_CAP_INTEL_GPU; -#endif + qparams.capability |= RV_CAP_GPU_DIRECT | RV_CAP_EVICT + | PSM3_GPU_RV_CAPABILITY_EXPECTED; } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ if ((ret = ioctl(rv->fd, RV_IOCTL_CAPABILITY, &qparams)) != 0) { int save_cap_errno = errno; @@ -379,74 +217,57 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) loc_info->minor_rev = qparams.minor_rev; loc_info->capability = qparams.capability; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU loc_info->gpu_major_rev = qparams.gpu_major_rev; loc_info->gpu_minor_rev = qparams.gpu_minor_rev; rv->ioctl_gpu_pin_mmap = RV_IOCTL_GPU_PIN_MMAP; if ((loc_info->rdma_mode & RV_RDMA_MODE_GPU) || (loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) { - if (!(qparams.capability & RV_CAP_GPU_DIRECT)) { - // caller will warn and avoid GPUDirect use -#ifdef PSM_CUDA - _HFI_INFO("WARNING: Mismatch: PSM3(cuda) vs RV(non-GPU).\n"); -#else - _HFI_INFO("WARNING: Mismatch: PSM3(oneapi-ze) vs RV(non-GPU).\n"); -#endif - loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); - if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) - goto fail_sockets; - } -#ifdef PSM_CUDA - if ((qparams.capability & (RV_CAP_INTEL_GPU | RV_CAP_NVIDIA_GPU)) == - RV_CAP_INTEL_GPU) { - // caller will warn and avoid GPUDirect use - _HFI_INFO("WARNING: Mismatch: PSM3(cuda) vs RV(oneapi-ze).\n"); - loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); - loc_info->capability &= ~RV_CAP_GPU_DIRECT; - if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) - goto fail_sockets; - } #ifdef RV_GPU_ABI_VER_MINOR_0 /* not defined if compile against older RV header */ // RV GPU API <= 1.0 is ok, ioctl different but arg subset if (loc_info->gpu_major_rev <= RV_GPU_ABI_VER_MAJOR_1 && loc_info->gpu_minor_rev <= RV_GPU_ABI_VER_MINOR_0) rv->ioctl_gpu_pin_mmap = RV_IOCTL_GPU_PIN_MMAP_R0; #endif -#endif /* CUDA */ -#ifdef PSM_ONEAPI - if ((qparams.capability & (RV_CAP_INTEL_GPU | RV_CAP_NVIDIA_GPU)) == - RV_CAP_NVIDIA_GPU) { + if (!(qparams.capability & RV_CAP_GPU_DIRECT)) { + // caller will warn and avoid GPUDirect use + _HFI_INFO("WARNING: Mismatch: PSM3" PSM3_GPU_TYPES " vs RV non-GPU.\n"); + loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); + if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) + goto fail_sockets; + } + if (!(qparams.capability & PSM3_GPU_RV_CAPABILITY_EXPECTED)) { // caller will warn and avoid GPUDirect use - _HFI_INFO("WARNING: Mismatch: PSM3(oneapi-ze) vs RV(cuda).\n"); + char buf1[100]; + char buf2[100]; + PSM3_GPU_RV_CAP_STRING(buf1, sizeof(buf1), PSM3_GPU_RV_CAPABILITY_EXPECTED); + PSM3_GPU_RV_CAP_STRING(buf2, sizeof(buf2), loc_info->capability); + _HFI_INFO("WARNING: Mismatch: PSM3 %s vs RV %s\n", buf1, buf2); loc_info->rdma_mode 
&= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); loc_info->capability &= ~RV_CAP_GPU_DIRECT; if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) goto fail_sockets; } -#ifdef RV_GPU_ABI_VER_MINOR_0 - // RV GPU API <= 1.0 does not have track GPU alloc_id - // RV GPU API <= 1.1 requires munmap_unpin - // so if RV GPU API <= 1.1, do not allow GPUDirect - if (loc_info->gpu_major_rev <= RV_GPU_ABI_VER_MAJOR_1 - && loc_info->gpu_minor_rev <= RV_GPU_ABI_VER_MINOR_1) { - _HFI_INFO("WARNING: Mismatch: Unsupported RV(oneapi-ze) revision.\n"); + if ((PSM3_GPU_RV_MAJOR_REV_FAIL && PSM3_GPU_RV_MINOR_REV_FAIL) + && loc_info->gpu_major_rev <= PSM3_GPU_RV_MAJOR_REV_FAIL + && loc_info->gpu_minor_rev <= PSM3_GPU_RV_MINOR_REV_FAIL) { + char buf2[100]; + PSM3_GPU_RV_CAP_STRING(buf2, sizeof(buf2), loc_info->capability); + _HFI_INFO("WARNING: Mismatch: Unsupported RV %s revision (v%u.%u) ne > v%u.%u.\n", + buf2, loc_info->gpu_major_rev, loc_info->gpu_minor_rev, + PSM3_GPU_RV_MAJOR_REV_FAIL, PSM3_GPU_RV_MINOR_REV_FAIL); loc_info->rdma_mode &= ~(RV_RDMA_MODE_GPU|RV_RDMA_MODE_UPSIZE_GPU); loc_info->capability &= ~RV_CAP_GPU_DIRECT; if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_GPU_ONLY) goto fail_sockets; } -#else - /* not defined if compile against older RV header */ -#error "Intel GPU Support requires version 1.1 or newer rv_user_ioctls.h header" -#endif -#endif /* PSM_ONEAPI */ if (!(qparams.capability & RV_CAP_EVICT)) { save_errno = ENOTSUP; _HFI_ERROR("Error: rv lacks EVICT ioctl, needed for GPU Support\n"); goto fail; } } -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ if ((loc_info->rdma_mode & RV_RDMA_MODE_MASK) == RV_RDMA_MODE_USER && !(qparams.capability & RV_CAP_USER_MR)) { save_errno = ENOTSUP; @@ -460,11 +281,17 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) && loc_info->minor_rev <= RV_ABI_VER_MINOR_1) rv->ioctl_reg_mem = RV_IOCTL_REG_MEM_R1; #endif +#ifdef RV_ABI_VER_MINOR_4 /* not defined if compile against older RV header */ + // RV API <= 1.4 is ok, ioctl different but arg subset + if (loc_info->major_rev <= RV_ABI_VER_MAJOR_1 && + loc_info->minor_rev <= RV_ABI_VER_MINOR_4) + rv->ioctl_reg_mem = RV_IOCTL_REG_MEM_R4; +#endif memset(&aparams, 0, sizeof(aparams)); snprintf(aparams.in.dev_name, RV_MAX_DEV_NAME_LEN, "%s", devname); aparams.in.mr_cache_size = loc_info->mr_cache_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU aparams.in.gpu_cache_size = loc_info->gpu_cache_size; #endif aparams.in.rdma_mode = loc_info->rdma_mode; @@ -500,7 +327,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) goto fail; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (loc_info->rdma_mode & RV_RDMA_MODE_GPU) { loc_info->rv_index = aparams.out_gpu.rv_index; loc_info->mr_cache_size = aparams.out_gpu.mr_cache_size; @@ -513,7 +340,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) loc_info->max_fmr_size = aparams.out_gpu.max_fmr_size; #endif } else { -#endif +#endif /* PSM_HAVE_GPU */ loc_info->rv_index = aparams.out.rv_index; loc_info->mr_cache_size = aparams.out.mr_cache_size; loc_info->q_depth = aparams.out.q_depth; @@ -523,7 +350,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) loc_info->minor_rev > RV_ABI_VER_MINOR_3) loc_info->max_fmr_size = aparams.out.max_fmr_size; #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU loc_info->gpu_cache_size = 0; } #endif @@ -538,43 
+365,37 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) } #ifndef RV_CAP_GPU_DIRECT -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #error "Inconsistent build. RV_CAP_GPU_DIRECT must be defined for GPU builds. Must use GPU enabled rv headers" #else -// lifted from rv_user_ioctls.h +// lifted from rv_user_ioctls.h so code builds below and can report if runtime +// RV supports an unknown GPU type #define RV_CAP_GPU_DIRECT (1UL << 63) #endif - #endif - if (loc_info->capability & RV_CAP_GPU_DIRECT) -#ifdef PSM_CUDA - psm3_print_identify("%s %s run-time rv interface v%d.%d%s gpu v%d.%d cuda\n", - psm3_get_mylabel(), psm3_ident_tag, - loc_info->major_rev, - loc_info->minor_rev, - (loc_info->capability & RV_CAP_USER_MR)?" user_mr":"", - loc_info->gpu_major_rev, - loc_info->gpu_minor_rev); -#elif defined(PSM_ONEAPI) - psm3_print_identify("%s %s run-time rv interface v%d.%d%s gpu v%d.%d oneapi-ze\n", +#endif /* ! RV_CAP_GPUDIRECT */ + if (loc_info->capability & RV_CAP_GPU_DIRECT) { + // RV has GPU capability +#ifdef PSM_HAVE_GPU + char buf[100]; + PSM3_GPU_RV_CAP_STRING(buf, sizeof(buf), loc_info->capability); + psm3_print_identify("%s %s run-time rv interface v%u.%u%s gpu v%u.%u%s\n", psm3_get_mylabel(), psm3_ident_tag, - loc_info->major_rev, - loc_info->minor_rev, + loc_info->major_rev, loc_info->minor_rev, (loc_info->capability & RV_CAP_USER_MR)?" user_mr":"", - loc_info->gpu_major_rev, - loc_info->gpu_minor_rev); -#else - psm3_print_identify("%s %s run-time rv interface v%d.%d%s cuda\n", + loc_info->gpu_major_rev, loc_info->gpu_minor_rev, buf); +#else /* PSM_HAVE_GPU */ + psm3_print_identify("%s %s run-time rv interface v%u.%u%s gpu unknown\n", psm3_get_mylabel(), psm3_ident_tag, - loc_info->major_rev, - loc_info->minor_rev, + loc_info->major_rev, loc_info->minor_rev, (loc_info->capability & RV_CAP_USER_MR)?" user_mr":""); -#endif /* PSM_CUDA */ - else - psm3_print_identify("%s %s run-time rv interface v%d.%d%s\n", +#endif /* PSM_HAVE_GPU */ + } else { + psm3_print_identify("%s %s run-time rv interface v%u.%u%s\n", psm3_get_mylabel(), psm3_ident_tag, loc_info->major_rev, loc_info->minor_rev, (loc_info->capability & RV_CAP_USER_MR)?" 
user_mr":""); + } return rv; fail: if (rv) { @@ -583,7 +404,7 @@ psm3_rv_t psm3_rv_open(const char *devname, struct local_info *loc_info) errno = save_errno; return NULL; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU fail_sockets: // unacceptable RV module for sockets use case, just fail open loc_info->rdma_mode = 0; @@ -650,7 +471,7 @@ int psm3_rv_get_cache_stats(psm3_rv_t rv, struct psm3_rv_cache_stats *stats) return -1; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int psm3_rv_gpu_get_cache_stats(psm3_rv_t rv, struct psm3_rv_gpu_cache_stats *stats) { struct rv_gpu_cache_stats_params_out sparams; @@ -1049,10 +870,11 @@ void psm3_rv_destroy_conn(psm3_rv_conn_t conn) my_free(conn); } +#ifdef PSM_HAVE_REG_MR psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, void *addr, uint64_t length, int access -#ifdef PSM_ONEAPI - , uint64_t alloc_id +#ifdef PSM_HAVE_GPU + , union psm3_verbs_mr_gpu_specific *gpu_specific #endif ) { @@ -1060,9 +882,8 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, struct rv_mem_params mparams; struct irdma_mem_reg_req req; int save_errno; -#ifdef PSM_ONEAPI - ze_ipc_mem_handle_t ipc_handle; - uint64_t handle_fd = 0; +#ifdef PSM_HAVE_GPU + union psm3_gpu_rv_reg_mmap_mem_scratchpad gpu_scratchpad = { }; #endif if (!rv || (!pd && !(access & IBV_ACCESS_KERNEL))) { @@ -1070,7 +891,7 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, goto fail; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_FI if_pf((access & IBV_ACCESS_IS_GPU_ADDR) && PSM3_FAULTINJ_ENABLED()) { PSM3_FAULTINJ_STATIC_DECL(fi_gpu_reg_mr, "gpu_reg_mr", @@ -1096,27 +917,13 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, mparams.in.ibv_pd_handle = pd->handle; mparams.in.cmd_fd_int = cmd_fd_int; mparams.in.access = access; -#ifdef PSM_ONEAPI +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) { - PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, ze_context, - (const void *)addr, &ipc_handle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_CALL(zeMemGetFileDescriptorFromIpcHandleExp, ze_context, ipc_handle, &handle_fd); -#else - handle_fd = *(uint32_t *)ipc_handle.data; -#endif - mparams.in.ipc_handle = (uint32_t)handle_fd; - if (!mparams.in.ipc_handle) { - _HFI_ERROR("zeMemGetIpcHandle for %p returned empty handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", - addr, ipc_handle.data[0], ipc_handle.data[1], - ipc_handle.data[2], ipc_handle.data[3], - ipc_handle.data[4], ipc_handle.data[5], - ipc_handle.data[6], ipc_handle.data[7]); - // tends to mean out of fd's - save_errno = ENOSPC; + if (0 != (save_errno = PSM3_GPU_INIT_RV_REG_MR_PARAMS(addr, + length, access, &mparams, + gpu_specific, &gpu_scratchpad))) { goto fail; } - mparams.in.alloc_id = ignore_alloc_id?(ignore_alloc_id==1?0:fake_alloc_id++):alloc_id; } #endif mparams.in.addr = (uint64_t)addr; @@ -1130,9 +937,9 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, save_errno = errno; goto fail; } -#ifdef PSM_CUDA +#ifdef PSM_HAVE_GPU if ((access & IBV_ACCESS_IS_GPU_ADDR) - && PSM2_OK != psm2_check_phys_addr(mparams.out.iova)) { + && PSM2_OK != PSM3_GPU_CHECK_PHYS_ADDR(mparams.out.iova)) { (void)psm3_rv_dereg_mem(rv, mr); errno = EFAULT; return NULL; @@ -1157,12 +964,8 @@ psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd_int, struct ibv_pd *pd, } errno = save_errno; exit: -#ifdef PSM_ONEAPI - if (handle_fd) { - save_errno = errno; - 
psm3_put_ipc_handle((const void *)addr, ipc_handle); - errno = save_errno; - } +#ifdef PSM_HAVE_GPU + PSM3_GPU_RV_REG_MMAP_CLEANUP(addr, length, access, &gpu_scratchpad); #endif return mr; } @@ -1186,18 +989,16 @@ int psm3_rv_dereg_mem(psm3_rv_t rv, psm3_rv_mr_t mr) my_free(mr); return 0; } +#endif /* PSM_HAVE_REG_MR */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void * psm3_rv_pin_and_mmap(psm3_rv_t rv, uintptr_t pageaddr, uint64_t pagelen, int access) { struct rv_gpu_mem_params params; int ret; void *ret_ptr = NULL; -#ifdef PSM_ONEAPI - ze_ipc_mem_handle_t ipc_handle; - uint64_t handle_fd = 0; -#endif + union psm3_gpu_rv_reg_mmap_mem_scratchpad gpu_scratchpad = { }; #ifdef PSM_FI if_pf(PSM3_FAULTINJ_ENABLED()) { @@ -1215,63 +1016,30 @@ void * psm3_rv_pin_and_mmap(psm3_rv_t rv, uintptr_t pageaddr, params.in.gpu_buf_addr = pageaddr; params.in.gpu_buf_size = pagelen; params.in.access = access; -#ifdef PSM_ONEAPI if (access & IBV_ACCESS_IS_GPU_ADDR) { - ze_memory_allocation_properties_t mem_props = { - .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES - }; - ze_device_handle_t device; - - PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, ze_context, - (const void *)pageaddr, &ipc_handle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_CALL(zeMemGetFileDescriptorFromIpcHandleExp, ze_context, ipc_handle, &handle_fd); -#else - handle_fd = *(uint32_t *)ipc_handle.data; -#endif - params.in.ipc_handle = (uint32_t)handle_fd; - if (!params.in.ipc_handle) { - _HFI_ERROR("No ipc_handle: 0x%02x%02x%02x%02x %02x%02x%02x%02x\n", - ipc_handle.data[0], ipc_handle.data[1], - ipc_handle.data[2], ipc_handle.data[3], - ipc_handle.data[4], ipc_handle.data[5], - ipc_handle.data[6], ipc_handle.data[7]); - errno = EFAULT; + if (0 != (errno = PSM3_GPU_INIT_RV_PIN_MMAP_PARAMS( + (void*)pageaddr, pagelen, access, + ¶ms, &gpu_scratchpad))) { goto exit; } - PSMI_ONEAPI_ZE_CALL(zeMemGetAllocProperties, ze_context, - (const void *)pageaddr, &mem_props, &device); - // id is unique across all allocs on all devices in a process - params.in.alloc_id = ignore_alloc_id?(ignore_alloc_id==1?0:fake_alloc_id++):mem_props.id; - _HFI_VDBG("pageaddr 0x%"PRIx64" pagelen %"PRIu64" id %"PRIu64" access 0x%x\n", - (uint64_t)pageaddr, pagelen, mem_props.id, access); } -#endif if ((ret = ioctl(rv->fd, rv->ioctl_gpu_pin_mmap, ¶ms)) != 0) goto exit; -#ifdef PSM_CUDA - if (PSM2_OK != psm2_check_phys_addr(params.out.phys_addr)) { + if (PSM2_OK != PSM3_GPU_CHECK_PHYS_ADDR(params.out.phys_addr)) { (void)psm3_rv_evict_exact(rv, (void*)pageaddr, pagelen, access); errno = EFAULT; goto exit; } -#endif // return mapped host address or NULL with errno set ret_ptr = (void *)(uintptr_t)params.out.host_buf_addr; exit: -#ifdef PSM_ONEAPI - if (handle_fd) { - int save_errno = errno; - psm3_put_ipc_handle((const void *)pageaddr, ipc_handle); - errno = save_errno; - } -#endif + PSM3_GPU_RV_REG_MMAP_CLEANUP((void*)pageaddr, pagelen, access, &gpu_scratchpad); return ret_ptr; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ // addr, length, access are what was used in a previous call to // __psm_rv_reg_mem or psm3_rv_pin_and_mmap @@ -1343,7 +1111,7 @@ int64_t psm3_rv_evict_range(psm3_rv_t rv, void *addr, uint64_t length) #endif } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // this will remove from the GPU cache all entries which include // addresses between addr and addr+length-1 inclusive if it's // refcount is 0. 
In the case of reg_mem, a matching call @@ -1377,7 +1145,7 @@ int64_t psm3_rv_evict_gpu_range(psm3_rv_t rv, uintptr_t addr, uint64_t length) return -1; #endif } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ // this will remove from the cache up to the amount specified // Only entries with a refcount of 0 are removed. @@ -1413,7 +1181,7 @@ int64_t psm3_rv_evict_amount(psm3_rv_t rv, uint64_t bytes, uint32_t count) #endif } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // this will remove from the GPU cache up to the amount specified // Only entries with a refcount of 0 are removed. // In the case of reg_mem, a matching call @@ -1447,7 +1215,7 @@ int64_t psm3_rv_evict_gpu_amount(psm3_rv_t rv, uint64_t bytes, uint32_t count) return -1; #endif } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ int psm3_rv_post_rdma_write_immed(psm3_rv_t rv, psm3_rv_conn_t conn, void *loc_buf, psm3_rv_mr_t loc_mr, diff --git a/prov/psm3/psm3/psm_rndv_mod.h b/prov/psm3/psm3/psm_rndv_mod.h index d6f0001a37c..59bd49a87ea 100644 --- a/prov/psm3/psm3/psm_rndv_mod.h +++ b/prov/psm3/psm3/psm_rndv_mod.h @@ -63,17 +63,9 @@ #include #include -#if defined(PSM_ONEAPI) -#ifndef RV_IOCTL_CAPABILITY -// TBD we could have configure test this and disable PSM3_HAVE_RNDV_MOD -// or perhaps even disable/fail oneapi in configure -#error "PSM_ONEAPI requires rv_user_ioctls.h 1.3 (w/GPU 1.2) or later" -#endif -#endif - struct local_info { uint32_t mr_cache_size; // in MBs -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t gpu_cache_size; // in MBs #endif uint8_t rdma_mode; // RV_RDMA_MODE_* @@ -101,7 +93,7 @@ struct local_info { // output from RNDV driver uint16_t major_rev; // driver ABI rev uint16_t minor_rev; // driver ABI rev -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint16_t gpu_major_rev; // driver GPU ABI rev uint16_t gpu_minor_rev; // driver GPU ABI rev #endif @@ -118,7 +110,7 @@ struct rv_event_ring { struct psm2_rv { int fd; /* file handle used to issue ioctls to rv driver */ -#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) +#ifdef RV_CAP_GPU_DIRECT unsigned int ioctl_gpu_pin_mmap; #endif unsigned int ioctl_reg_mem; @@ -153,7 +145,7 @@ typedef struct psm3_rv_mr *psm3_rv_mr_t; #define psm3_rv_cache_stats rv_cache_stats_params_out -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define psm3_rv_gpu_cache_stats rv_gpu_cache_stats_params_out #endif @@ -171,7 +163,7 @@ static inline uint16_t psm3_rv_get_user_minor_bldtime_version(void) return RV_ABI_VER_MINOR; } -#if defined(NVIDIA_GPU_DIRECT) || defined(INTEL_GPU_DIRECT) +#ifdef RV_CAP_GPU_DIRECT static inline uint16_t psm3_rv_get_gpu_user_major_bldtime_version(void) { return RV_GPU_ABI_VER_MAJOR; @@ -181,8 +173,6 @@ static inline uint16_t psm3_rv_get_gpu_user_minor_bldtime_version(void) { return RV_GPU_ABI_VER_MINOR; } - -extern uint64_t psm3_min_gpu_bar_size(void); #endif extern int psm3_rv_available(); @@ -194,7 +184,7 @@ extern int psm3_rv_close(psm3_rv_t rv); extern int psm3_rv_get_cache_stats(psm3_rv_t rv, struct psm3_rv_cache_stats *stats); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern int psm3_rv_gpu_get_cache_stats(psm3_rv_t rv, struct psm3_rv_gpu_cache_stats *stats); #endif @@ -221,14 +211,16 @@ extern int psm3_rv_disconnect(psm3_rv_conn_t conn); extern void psm3_rv_destroy_conn(psm3_rv_conn_t conn); +#ifdef PSM_HAVE_REG_MR extern psm3_rv_mr_t psm3_rv_reg_mem(psm3_rv_t rv, int cmd_fd, struct ibv_pd *pd, void 
*addr, uint64_t length, int access -#ifdef PSM_ONEAPI - , uint64_t alloc_id +#ifdef PSM_HAVE_GPU + , union psm3_verbs_mr_gpu_specific *gpu_specific #endif ); extern int psm3_rv_dereg_mem(psm3_rv_t rv, psm3_rv_mr_t mr); +#endif extern void * psm3_rv_pin_and_mmap(psm3_rv_t rv, uintptr_t pageaddr, uint64_t pagelen, int access); @@ -240,7 +232,7 @@ extern int64_t psm3_rv_evict_range(psm3_rv_t rv, void *addr, uint64_t length); extern int64_t psm3_rv_evict_amount(psm3_rv_t rv, uint64_t bytes, uint32_t count); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU extern int64_t psm3_rv_evict_gpu_range(psm3_rv_t rv, uintptr_t addr, uint64_t length); diff --git a/prov/psm3/psm3/psm_sysbuf.c b/prov/psm3/psm3/psm_sysbuf.c index 698507e8528..cc87e160273 100644 --- a/prov/psm3/psm3/psm_sysbuf.c +++ b/prov/psm3/psm3/psm_sysbuf.c @@ -99,7 +99,7 @@ void psm3_mq_sysbuf_init(psm2_mq_t mq) // eager message size (aka PSM3_MTU). // replenishing_rate is how many we add to pool at a time, there is // no upper bound to the pool. -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t gpu_block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, 65536, 262144, (uint32_t)-1}; uint32_t gpu_replenishing_rate[] = {128, 64, 32, 16, 8, 4, 2, 2, 0}; uint32_t block_sizes[] = {256, 512, 1024, 2048, 4096, 8192, (uint32_t)-1, (uint32_t)-1, (uint32_t)-1}; @@ -111,8 +111,8 @@ void psm3_mq_sysbuf_init(psm2_mq_t mq) if (mq->mem_ctrl_is_init) return; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { memcpy(block_sizes, gpu_block_sizes, sizeof(block_sizes)); memcpy(replenishing_rate, gpu_replenishing_rate, sizeof(replenishing_rate)); } @@ -160,36 +160,7 @@ void psm3_mq_sysbuf_fini(psm2_mq_t mq) // free all buffers that is currently no for (i=0; i < MM_NUM_OF_POOLS; i++) { while ((block = mq->handler_index[i].free_list) != NULL) { mq->handler_index[i].free_list = block->next; -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, block); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister(block); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - block); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } -#endif + PSM3_GPU_UNREGISTER_HOSTMEM(block); psmi_free(block); } } @@ -229,20 +200,9 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) - // for transient buffers, no use Importing, adds cost for - // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED 
&& check_have_cuda_ctxt()) - // PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz, - // CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, - // new_block, newsz); -#endif + //PSM3_GPU_REGISTER_HOSTMEM(new_block, newsz); new_block->mem_handler = mm_handler; new_block++; mm_handler->total_alloc++; @@ -257,22 +217,9 @@ void *psm3_mq_sysbuf_alloc(psm2_mq_t mq, uint32_t alloc_size) new_block = psmi_malloc(mq->ep, UNEXPECTED_BUFFERS, newsz); if (new_block) { -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) - // By registering memory with Cuds, we make - // cuMemcpy* run faster for copies between - // GPU and this sysbuf - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, new_block, newsz, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // By registering memory with Level Zero, we make - // zeCommandListAppendMemoryCopy run faster for copies between - // GPU and this sysbuf - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, - new_block, newsz); -#endif + // By registering memory with GPU, we make GPU memcpy + // run faster for copies between GPU and this sysbuf + PSM3_GPU_REGISTER_HOSTMEM(new_block, newsz); mm_handler->current_available++; mm_handler->total_alloc++; mq->mem_ctrl_total_bytes += newsz; @@ -309,22 +256,9 @@ void psm3_mq_sysbuf_free(psm2_mq_t mq, void * mem_to_free) mm_handler = block_to_free->mem_handler; if (mm_handler->flags & MM_FLAG_TRANSIENT) { -#if defined(PSM_CUDA) && !defined(PSM_NO_CUDA_REGISTER) // for transient buffers, no use Importing, adds cost for // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - // /* ignore NOT_REGISTERED in case cuda initialized late */ - // CUresult cudaerr; - // PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, block_to_free); - //} -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - // for transient buffers, no use Importing, adds cost for - // CPU copy, just pay GPU cost on the copy, we use once & free - //if (PSMI_IS_GPU_ENABLED) - // PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, block_to_free); -#endif + // PSM3_GPU_UNREGISTER_HOSTMEM(block_to_free); psmi_free(block_to_free); } else { block_to_free->next = mm_handler->free_list; diff --git a/prov/psm3/psm3/psm_sysbuf.h b/prov/psm3/psm3/psm_sysbuf.h index 31ff116d088..5ab4604b014 100644 --- a/prov/psm3/psm3/psm_sysbuf.h +++ b/prov/psm3/psm3/psm_sysbuf.h @@ -58,7 +58,7 @@ #include "psm_user.h" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define MM_NUM_OF_POOLS 9 #else #define MM_NUM_OF_POOLS 7 diff --git a/prov/psm3/psm3/psm_user.h b/prov/psm3/psm3/psm_user.h index 28a6e9de4dd..fa2e739b004 100644 --- a/prov/psm3/psm3/psm_user.h +++ b/prov/psm3/psm3/psm_user.h @@ -60,24 +60,6 @@ extern "C" { #endif -#if defined(PSM_CUDA) -// if defined, do not use cuMemHostRegister for malloced pipeline -// copy bounce buffers -// otherwise, use cuMemHostRegister when malloc buffer -//#define PSM3_NO_CUDA_REGISTER -#endif - -#if defined(PSM_ONEAPI) -// if defined, use malloc for pipeline copy bounce buffers -// otherwise, use zeMemAllocHost -//#define 
PSM3_USE_ONEAPI_MALLOC - -// if defined, do not use zexDriverImportExternalPointer for malloced pipeline -// copy bounce buffers -// otherwise, use zexDriverImportExternalPointer when malloc buffer -//#define PSM3_NO_ONEAPI_IMPORT -#endif - /* Instead of testing a HAL cap mask bit at runtime (in addition to thresholds), * we only test thresholds, especially in the ips_proto_mq.c datapath. * To allow for slightly more optimized builds, a few build time capability @@ -107,6 +89,10 @@ extern "C" { #ifdef PSM_VERBS #define PSM_HAVE_RDMA #endif + +// psm_config.h will define PSM_HAVE_GPU as needed +#include "psm_config.h" + #ifdef RNDV_MOD /* This is used to guard all RNDV_MOD code in the main parts of PSM * so that RNDV_MOD code is only really enabled when a HAL present is able @@ -117,17 +103,16 @@ extern "C" { * HALs instead of testing specific HAL flags like PSM_VERBS or PSM_SOCKETS. * Thus, when adding a new HAL, the generic code need not be revisited. */ -#if defined(PSM_VERBS) || (defined(PSM_SOCKETS) && (defined(PSM_CUDA) || defined(PSM_ONEAPI))) +#if defined(PSM_VERBS) || (defined(PSM_SOCKETS) && defined(PSM_HAVE_GPU)) #define PSM_HAVE_RNDV_MOD -#endif /* VERBS || (SOCKETS && (CUDA||ONEAPI)) */ +#endif /* VERBS || (SOCKETS && GPU) */ #endif /* RNDV_MOD */ -#if (defined(PSM_CUDA) || defined(PSM_ONEAPI)) && defined(PSM_USE_HWLOC) +#if defined(PSM_HAVE_GPU) && defined(PSM_USE_HWLOC) #define PSM_HAVE_GPU_CENTRIC_AFFINITY #endif -#include "psm_config.h" #include #include @@ -148,25 +133,6 @@ extern "C" { #include "psm_log.h" #include "psm_perf.h" -#ifdef PSM_CUDA -#ifndef PSM_CUDA_MOCK -#include -#include -#include - -#if CUDA_VERSION < 7000 -#error Please update CUDA driver, required minimum version is 7.0 -#endif -#else -// included in stand-alone unit test that does not use real CUDA functions -#include "psmi_cuda_mock.h" -#endif /* PSM_CUDA_MOCK */ -#elif defined(PSM_ONEAPI) -#include -#include -#endif - - #define PSMI_LOCK_NO_OWNER ((pthread_t)(-1)) #define _PSMI_IN_USER_H @@ -182,9 +148,10 @@ typedef void *psmi_hal_hw_context; #include "psm_utils.h" #include "psm_timer.h" #include "psm_mpool.h" +#include "gpu/psm_gpu_hal.h" #ifdef PSM_HAVE_REG_MR #include "psm_verbs_mr.h" -#ifdef RNDV_MOD +#ifdef PSM_HAVE_RNDV_MOD #include "psm_rndv_mod.h" #endif #endif @@ -427,28 +394,11 @@ void psmi_profile_reblock(int did_no_progress) __attribute__ ((weak)); #define PSMI_PROFILE_REBLOCK(noprog) #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -extern int is_gdr_copy_enabled; -/* This limit dictates when the sender turns off - * GDR Copy and uses SDMA. The limit needs to be less than equal - * GPU RNDV threshold (psm3_gpu_thresh_rndv) - * set to 0 if GDR Copy disabled - */ -extern uint32_t gdr_copy_limit_send; -/* This limit dictates when the reciever turns off - * GDR Copy. The limit needs to be less than equal - * GPU RNDV threshold (psm3_gpu_thresh_rndv) - * set to 0 if GDR Copy disabled - */ -extern uint32_t gdr_copy_limit_recv; -extern int is_gpudirect_enabled; // only for use during parsing of other params -extern int _device_support_gpudirect; -extern uint32_t gpudirect_rdma_send_limit; -extern uint32_t gpudirect_rdma_recv_limit; -extern uint32_t psm3_gpu_thresh_rndv; - -#define MAX_ZE_DEVICES 8 +#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND] +#ifdef PSM_HAVE_GPU +// Host bounce buffers. Used during pipelined GPU copies for +// large rendezvous IOs. 
struct ips_gpu_hostbuf { STAILQ_ENTRY(ips_gpu_hostbuf) req_next; STAILQ_ENTRY(ips_gpu_hostbuf) next; @@ -457,785 +407,14 @@ struct ips_gpu_hostbuf { * pulled from a mpool or dynamically * allocated using calloc. */ uint8_t is_tempbuf; -#ifdef PSM_CUDA - CUevent copy_status; -#elif defined(PSM_ONEAPI) - ze_event_pool_handle_t event_pool; - ze_command_list_handle_t command_lists[MAX_ZE_DEVICES]; - ze_event_handle_t copy_status; - int cur_dev_inx; -#endif + uint8_t pad1; + uint16_t pad2; + // aligned to 64 bit boundary + union gpu_hostbuf_gpu_specific gpu_specific; psm2_mq_req_t req; void* host_buf; void* gpu_buf; }; -#endif - -#ifdef PSM_CUDA - -extern int is_cuda_enabled; -extern int _device_support_unified_addr; -extern int _gpu_p2p_supported; -extern int my_gpu_device; -extern int cuda_lib_version; -extern int cuda_runtime_ver; -extern CUcontext cu_ctxt; -extern void *psmi_cuda_lib; -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -int psmi_oneapi_ze_initialize(void); -psm2_error_t psm3_ze_init_fds(void); -int *psm3_ze_get_dev_fds(int *nfds); - -extern int is_oneapi_ze_enabled; -extern int _gpu_p2p_supported; -extern int my_gpu_device; -#ifndef PSM_HAVE_PIDFD -extern int psm3_num_ze_dev_fds; -#endif - -struct ze_dev_ctxt { - ze_device_handle_t dev; - int dev_index; /* Index in ze_devices[] */ - uint32_t ordinal; /* CmdQGrp ordinal for the 1st copy_only engine */ - uint32_t index; /* Cmdqueue index within the CmdQGrp */ - uint32_t num_queues; /* Number of queues in the CmdQGrp */ - // for most sync copies - ze_command_queue_handle_t cq; // NULL if psm3_oneapi_immed_sync_copy - ze_command_list_handle_t cl; - // fields below are only used for large DTOD sync copy so can do 2 - // parallel async copies then wait for both - ze_event_handle_t copy_status0; - ze_event_handle_t copy_status1; - ze_command_list_handle_t async_cl0; - ze_command_list_handle_t async_cl1; - ze_command_queue_handle_t async_cq0;// NULL if psm3_oneapi_immed_sync_copy - ze_command_queue_handle_t async_cq1;// NULL if psm3_oneapi_immed_sync_copy - ze_event_pool_handle_t event_pool; -}; - -extern ze_api_version_t zel_api_version; -extern zel_version_t zel_lib_version; -extern ze_context_handle_t ze_context; -extern ze_driver_handle_t ze_driver; -extern struct ze_dev_ctxt ze_devices[MAX_ZE_DEVICES]; -extern int num_ze_devices; -extern struct ze_dev_ctxt *cur_ze_dev; -extern int psm3_oneapi_immed_sync_copy; -extern int psm3_oneapi_immed_async_copy; -extern unsigned psm3_oneapi_parallel_dtod_copy_thresh; - -const char* psmi_oneapi_ze_result_to_string(const ze_result_t result); -void psmi_oneapi_async_cmd_create(struct ze_dev_ctxt *ctxt, - ze_command_queue_handle_t *p_cq, ze_command_list_handle_t *p_cl); -#ifndef PSM_HAVE_PIDFD -psm2_error_t psm3_sock_detach(ptl_t *ptl_gen); -psm2_error_t psm3_ze_init_ipc_socket(ptl_t *ptl_gen); -psm2_error_t psm3_send_dev_fds(ptl_t *ptl_gen, psm2_epaddr_t epaddr); -psm2_error_t psm3_check_dev_fds_exchanged(ptl_t *ptl_gen, psm2_epaddr_t epaddr); -psm2_error_t psm3_poll_dev_fds_exchange(ptl_t *ptl_gen); -#endif - -#ifdef PSM3_USE_ONEAPI_MALLOC -void *psm3_oneapi_ze_host_alloc_malloc(unsigned size); -void psm3_oneapi_ze_host_free_malloc(void *ptr); -#else -extern void *(*psm3_oneapi_ze_host_alloc)(unsigned size); -extern void (*psm3_oneapi_ze_host_free)(void *ptr); -extern int psm3_oneapi_ze_using_zemem_alloc; -#endif -extern void psm3_oneapi_ze_can_use_zemem(); - -void psmi_oneapi_ze_memcpy(void *dstptr, const void *srcptr, size_t size); -void psmi_oneapi_ze_memcpy_DTOD(void *dstptr, const void *srcptr, 
size_t size); - -static inline -int device_support_gpudirect() -{ - if (likely(_device_support_gpudirect > -1)) return _device_support_gpudirect; - - /* Is there any device property that can indicate this? */ - _device_support_gpudirect = 1; - return _device_support_gpudirect; -} -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -extern CUresult (*psmi_cuInit)(unsigned int Flags ); -extern CUresult (*psmi_cuCtxDetach)(CUcontext c); -extern CUresult (*psmi_cuCtxGetCurrent)(CUcontext *c); -extern CUresult (*psmi_cuCtxSetCurrent)(CUcontext c); -extern CUresult (*psmi_cuPointerGetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -extern CUresult (*psmi_cuPointerSetAttribute)(void *data, CUpointer_attribute pa, CUdeviceptr p); -extern CUresult (*psmi_cuDeviceCanAccessPeer)(int *canAccessPeer, CUdevice dev, CUdevice peerDev); -extern CUresult (*psmi_cuDeviceGet)(CUdevice* device, int ordinal); -extern CUresult (*psmi_cuDeviceGetAttribute)(int* pi, CUdevice_attribute attrib, CUdevice dev); -extern CUresult (*psmi_cuDriverGetVersion)(int* driverVersion); -extern CUresult (*psmi_cuDeviceGetCount)(int* count); -extern CUresult (*psmi_cuStreamCreate)(CUstream* phStream, unsigned int Flags); -extern CUresult (*psmi_cuStreamDestroy)(CUstream phStream); -extern CUresult (*psmi_cuStreamSynchronize)(CUstream phStream); -extern CUresult (*psmi_cuEventCreate)(CUevent* phEvent, unsigned int Flags); -extern CUresult (*psmi_cuEventDestroy)(CUevent hEvent); -extern CUresult (*psmi_cuEventQuery)(CUevent hEvent); -extern CUresult (*psmi_cuEventRecord)(CUevent hEvent, CUstream hStream); -extern CUresult (*psmi_cuEventSynchronize)(CUevent hEvent); -extern CUresult (*psmi_cuMemHostAlloc)(void** pp, size_t bytesize, unsigned int Flags); -extern CUresult (*psmi_cuMemFreeHost)(void* p); -extern CUresult (*psmi_cuMemHostRegister)(void* p, size_t bytesize, unsigned int Flags); -extern CUresult (*psmi_cuMemHostUnregister)(void* p); -extern CUresult (*psmi_cuMemcpy)(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyDtoD)(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyDtoH)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyHtoD)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); -extern CUresult (*psmi_cuMemcpyDtoHAsync)(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); -extern CUresult (*psmi_cuMemcpyHtoDAsync)(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount, CUstream hStream); -extern CUresult (*psmi_cuIpcGetMemHandle)(CUipcMemHandle* pHandle, CUdeviceptr dptr); -extern CUresult (*psmi_cuIpcOpenMemHandle)(CUdeviceptr* pdptr, CUipcMemHandle handle, unsigned int Flags); -extern CUresult (*psmi_cuIpcCloseMemHandle)(CUdeviceptr dptr); -extern CUresult (*psmi_cuMemGetAddressRange)(CUdeviceptr* pbase, size_t* psize, CUdeviceptr dptr); -extern CUresult (*psmi_cuDevicePrimaryCtxGetState)(CUdevice dev, unsigned int* flags, int* active); -extern CUresult (*psmi_cuDevicePrimaryCtxRetain)(CUcontext* pctx, CUdevice dev); -extern CUresult (*psmi_cuCtxGetDevice)(CUdevice* device); -extern CUresult (*psmi_cuDevicePrimaryCtxRelease)(CUdevice device); -extern CUresult (*psmi_cuGetErrorString)(CUresult error, const char **pStr); -extern cudaError_t (*psmi_cudaRuntimeGetVersion)(int* runtimeVersion); -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI -extern ze_result_t (*psmi_zeInit)(ze_init_flags_t flags); -extern ze_result_t (*psmi_zeDriverGet)(uint32_t *pCount, 
ze_driver_handle_t *phDrivers); -#ifndef PSM3_NO_ONEAPI_IMPORT -extern ze_result_t (*psmi_zexDriverImportExternalPointer)(ze_driver_handle_t hDriver, void *ptr, size_t size); -extern ze_result_t (*psmi_zexDriverReleaseImportedPointer)(ze_driver_handle_t hDriver, void *ptr); -#endif -extern ze_result_t (*psmi_zeDeviceGet)(ze_driver_handle_t hDriver, uint32_t *pCount, ze_device_handle_t *phDevices); -extern ze_result_t (*psmi_zeDevicePciGetPropertiesExt)(ze_device_handle_t hDevice, ze_pci_ext_properties_t *pPciProperties); -#ifndef PSM3_NO_ONEAPI_IMPORT -extern ze_result_t (*psmi_zeDriverGetExtensionFunctionAddress)(ze_driver_handle_t hDriver, const char *name, void **ppFunctionAddress); -#endif -extern ze_result_t (*psmi_zeContextCreate)(ze_driver_handle_t hDriver, const ze_context_desc_t *desc, ze_context_handle_t *phContext); -extern ze_result_t (*psmi_zeContextDestroy)(ze_context_handle_t hContext); -extern ze_result_t (*psmi_zeCommandQueueCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice,const ze_command_queue_desc_t *desc, ze_command_queue_handle_t *phCommandQueue); -extern ze_result_t (*psmi_zeCommandQueueDestroy)(ze_command_queue_handle_t hCommandQueue); -extern ze_result_t (*psmi_zeCommandQueueExecuteCommandLists)(ze_command_queue_handle_t hCommandQueue, uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, ze_fence_handle_t hFence); -extern ze_result_t (*psmi_zeCommandQueueSynchronize)(ze_command_queue_handle_t hCommandQueue, uint64_t timeout); -extern ze_result_t (*psmi_zeCommandListCreate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_list_desc_t *desc, ze_command_list_handle_t *phCommandList); -extern ze_result_t (*psmi_zeCommandListDestroy)(ze_command_list_handle_t hCommandList); -extern ze_result_t (*psmi_zeCommandListClose)(ze_command_list_handle_t hCommandList); -extern ze_result_t (*psmi_zeCommandListReset)(ze_command_list_handle_t hCommandList); -extern ze_result_t (*psmi_zeCommandListCreateImmediate)(ze_context_handle_t hContext, ze_device_handle_t hDevice, const ze_command_queue_desc_t *desc, ze_command_list_handle_t *phCommandList); -extern ze_result_t (*psmi_zeCommandListAppendMemoryCopy)(ze_command_list_handle_t hCommandList, void *dstptr, const void *srcptr, size_t size, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents); -extern ze_result_t (*psmi_zeCommandListAppendSignalEvent)(ze_command_list_handle_t hCommandList, ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zeDeviceCanAccessPeer)(ze_device_handle_t hDevice, ze_device_handle_t hPeerDevice, ze_bool_t *value); -extern ze_result_t (*psmi_zeDeviceGetCommandQueueGroupProperties)(ze_device_handle_t hDevice, uint32_t *pCount, ze_command_queue_group_properties_t *pCommandQueueGroupProperties); -extern ze_result_t (*psmi_zeMemAllocHost)(ze_context_handle_t hContext, const ze_host_mem_alloc_desc_t *host_desc, size_t size, size_t alignment, void **pptr); -extern ze_result_t (*psmi_zeMemAllocDevice)(ze_context_handle_t hContext, const ze_device_mem_alloc_desc_t *device_desc, size_t size, size_t alignment, ze_device_handle_t hDevice, void **pptr); -extern ze_result_t (*psmi_zeMemFree)(ze_context_handle_t hContext, void *ptr); -extern ze_result_t (*psmi_zeMemGetIpcHandle)(ze_context_handle_t hContext, const void *ptr, ze_ipc_mem_handle_t *pIpcHandle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -extern ze_result_t (*psmi_zeMemGetIpcHandleFromFileDescriptorExp)(ze_context_handle_t hContext, uint64_t handle, ze_ipc_mem_handle_t 
*pIpcHandle); -extern ze_result_t (*psmi_zeMemGetFileDescriptorFromIpcHandleExp)(ze_context_handle_t hContext, ze_ipc_mem_handle_t ipcHandle, uint64_t *pHandle); -extern ze_result_t (*psmi_zeMemPutIpcHandle)(ze_context_handle_t hContext, ze_ipc_mem_handle_t handle); -#endif -extern ze_result_t (*psmi_zeMemOpenIpcHandle)(ze_context_handle_t hContext,ze_device_handle_t hDevice, ze_ipc_mem_handle_t handle, ze_ipc_memory_flags_t flags, void **pptr); -extern ze_result_t (*psmi_zeMemCloseIpcHandle)(ze_context_handle_t hContext, const void *ptr); -extern ze_result_t (*psmi_zeMemGetAddressRange)(ze_context_handle_t hContext, const void *ptr, void **pBase, size_t *pSize); -extern ze_result_t (*psmi_zeMemGetAllocProperties)(ze_context_handle_t hContext, const void *ptr, ze_memory_allocation_properties_t *pMemAllocProperties, ze_device_handle_t *phDevice); -extern ze_result_t (*psmi_zeEventPoolCreate)(ze_context_handle_t hContext, const ze_event_pool_desc_t *desc, uint32_t numDevices, ze_device_handle_t *phDevices, ze_event_pool_handle_t *phEventPool); -extern ze_result_t (*psmi_zeEventPoolDestroy)(ze_event_pool_handle_t hEventPool); -extern ze_result_t (*psmi_zeEventCreate)(ze_event_pool_handle_t hEventPool, const ze_event_desc_t *desc, ze_event_handle_t *phEvent); -extern ze_result_t (*psmi_zeEventDestroy)(ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zeEventQueryStatus)(ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zeEventHostSynchronize)(ze_event_handle_t hEvent, uint64_t timeout); -extern ze_result_t (*psmi_zeEventHostReset)(ze_event_handle_t hEvent); -extern ze_result_t (*psmi_zelLoaderGetVersions)(size_t *num_elems, zel_component_version_t *versions); - -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -extern uint64_t psmi_count_cuInit; -extern uint64_t psmi_count_cuCtxDetach; -extern uint64_t psmi_count_cuCtxGetCurrent; -extern uint64_t psmi_count_cuCtxSetCurrent; -extern uint64_t psmi_count_cuPointerGetAttribute; -extern uint64_t psmi_count_cuPointerSetAttribute; -extern uint64_t psmi_count_cuDeviceCanAccessPeer; -extern uint64_t psmi_count_cuDeviceGet; -extern uint64_t psmi_count_cuDeviceGetAttribute; -extern uint64_t psmi_count_cuDriverGetVersion; -extern uint64_t psmi_count_cuDeviceGetCount; -extern uint64_t psmi_count_cuStreamCreate; -extern uint64_t psmi_count_cuStreamDestroy; -extern uint64_t psmi_count_cuStreamSynchronize; -extern uint64_t psmi_count_cuEventCreate; -extern uint64_t psmi_count_cuEventDestroy; -extern uint64_t psmi_count_cuEventQuery; -extern uint64_t psmi_count_cuEventRecord; -extern uint64_t psmi_count_cuEventSynchronize; -extern uint64_t psmi_count_cuMemHostAlloc; -extern uint64_t psmi_count_cuMemFreeHost; -extern uint64_t psmi_count_cuMemHostRegister; -extern uint64_t psmi_count_cuMemHostUnregister; -extern uint64_t psmi_count_cuMemcpy; -extern uint64_t psmi_count_cuMemcpyDtoD; -extern uint64_t psmi_count_cuMemcpyDtoH; -extern uint64_t psmi_count_cuMemcpyHtoD; -extern uint64_t psmi_count_cuMemcpyDtoHAsync; -extern uint64_t psmi_count_cuMemcpyHtoDAsync; -extern uint64_t psmi_count_cuIpcGetMemHandle; -extern uint64_t psmi_count_cuIpcOpenMemHandle; -extern uint64_t psmi_count_cuIpcCloseMemHandle; -extern uint64_t psmi_count_cuMemGetAddressRange; -extern uint64_t psmi_count_cuDevicePrimaryCtxGetState; -extern uint64_t psmi_count_cuDevicePrimaryCtxRetain; -extern uint64_t psmi_count_cuCtxGetDevice; -extern uint64_t psmi_count_cuDevicePrimaryCtxRelease; -extern uint64_t psmi_count_cuGetErrorString; -extern uint64_t psmi_count_cudaRuntimeGetVersion; -#endif // 
PSM_CUDA - -#ifdef PSM_ONEAPI -extern uint64_t psmi_count_zeInit; -extern uint64_t psmi_count_zeDriverGet; -#ifndef PSM3_NO_ONEAPI_IMPORT -extern uint64_t psmi_count_zexDriverImportExternalPointer; -extern uint64_t psmi_count_zexDriverReleaseImportedPointer; -#endif -extern uint64_t psmi_count_zeDeviceGet; -extern uint64_t psmi_count_zeDevicePciGetPropertiesExt; -#ifndef PSM3_NO_ONEAPI_IMPORT -extern uint64_t psmi_count_zeDriverGetExtensionFunctionAddress; -#endif -extern uint64_t psmi_count_zeContextCreate; -extern uint64_t psmi_count_zeContextDestroy; -extern uint64_t psmi_count_zeCommandQueueCreate; -extern uint64_t psmi_count_zeCommandQueueDestroy; -extern uint64_t psmi_count_zeCommandQueueExecuteCommandLists; -extern uint64_t psmi_count_zeCommandQueueSynchronize; -extern uint64_t psmi_count_zeCommandListCreate; -extern uint64_t psmi_count_zeCommandListDestroy; -extern uint64_t psmi_count_zeCommandListClose; -extern uint64_t psmi_count_zeCommandListReset; -extern uint64_t psmi_count_zeCommandListCreateImmediate; -extern uint64_t psmi_count_zeCommandListAppendMemoryCopy; -extern uint64_t psmi_count_zeCommandListAppendSignalEvent; -extern uint64_t psmi_count_zeDeviceCanAccessPeer; -extern uint64_t psmi_count_zeDeviceGetCommandQueueGroupProperties; -extern uint64_t psmi_count_zeMemAllocHost; -extern uint64_t psmi_count_zeMemAllocDevice; -extern uint64_t psmi_count_zeMemFree; -extern uint64_t psmi_count_zeMemGetIpcHandle; -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -extern uint64_t psmi_count_zeMemGetIpcHandleFromFileDescriptorExp; -extern uint64_t psmi_count_zeMemGetFileDescriptorFromIpcHandleExp; -extern uint64_t psmi_count_zeMemPutIpcHandle; -#endif -extern uint64_t psmi_count_zeMemOpenIpcHandle; -extern uint64_t psmi_count_zeMemCloseIpcHandle; -extern uint64_t psmi_count_zeMemGetAddressRange; -extern uint64_t psmi_count_zeMemGetAllocProperties; -extern uint64_t psmi_count_zeEventPoolCreate; -extern uint64_t psmi_count_zeEventPoolDestroy; -extern uint64_t psmi_count_zeEventCreate; -extern uint64_t psmi_count_zeEventDestroy; -extern uint64_t psmi_count_zeEventQueryStatus; -extern uint64_t psmi_count_zeEventHostSynchronize; -extern uint64_t psmi_count_zeEventHostReset; -extern uint64_t psmi_count_zelLoaderGetVersions; -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -static int check_set_cuda_ctxt(void) -{ - CUresult err; - CUcontext tmpctxt = {0}; - - if (unlikely(!psmi_cuCtxGetCurrent || !psmi_cuCtxSetCurrent)) - return 0; - - err = psmi_cuCtxGetCurrent(&tmpctxt); - if (likely(!err)) { - if (unlikely(!tmpctxt && cu_ctxt)) { - err = psmi_cuCtxSetCurrent(cu_ctxt); - return !!err; - } else if (unlikely(tmpctxt && !cu_ctxt)) { - cu_ctxt = tmpctxt; - } - } - return 0; -} - -/* Make sure have a real GPU job. Set cu_ctxt if available */ -PSMI_ALWAYS_INLINE( -int check_have_cuda_ctxt(void)) -{ - if (! cu_ctxt) { - if (unlikely(check_set_cuda_ctxt())) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, "Failed to set/synchronize" \ - " CUDA context.\n"); \ - } \ - } - return (cu_ctxt != NULL); -} - - -#define PSMI_CUDA_CALL(func, args...) 
do { \ - CUresult cudaerr; \ - if (unlikely(check_set_cuda_ctxt())) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, "Failed to set/synchronize" \ - " CUDA context.\n"); \ - } \ - psmi_count_##func++; \ - cudaerr = (CUresult)psmi_##func(args); \ - if (cudaerr != CUDA_SUCCESS) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - _HFI_ERROR( \ - "CUDA failure: %s() (at %s:%d)" \ - " returned %d: %s\n", \ - #func, __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function %s.\n", #func);\ - } \ - } while (0) -#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -#define PSMI_ONEAPI_ZE_CALL(func, args...) do { \ - ze_result_t result; \ - psmi_count_##func++; \ - result = psmi_##func(args); \ - if(result != ZE_RESULT_SUCCESS) { \ - _HFI_ERROR( "OneAPI Level Zero failure: %s() (at %s:%d)" \ - " returned 0x%x: %s\n", \ - #func, __FILE__, __LINE__, result, \ - psmi_oneapi_ze_result_to_string(result)); \ - psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from OneAPI Level Zero function %s.\n", #func); \ - } \ -} while (0) - -void psmi_oneapi_cmd_create_all(void); -void psmi_oneapi_cmd_destroy_all(void); -uint64_t psm3_oneapi_ze_get_alloc_id(void *addr, uint8_t *type); - -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE -#define ONEAPI_PUTQUEUE_SIZE -1 -#endif -psm2_error_t psmi_oneapi_putqueue_alloc(void); -void psmi_oneapi_putqueue_free(void); - -/* - * Two usages: - * (1) ctxt == NULL: check if the buffer is allocated from Level-zero. - * In this case, change cur_ze_dev if device has changed. - * (2) ctxt != NULL: try to get the device context. - * In this case, don't change cur_ze_dev. - */ -PSMI_ALWAYS_INLINE( -int -_psmi_is_oneapi_ze_mem(const void *ptr, struct ze_dev_ctxt **ctxt)) -{ - ze_memory_allocation_properties_t mem_props = { - ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES - }; - ze_device_handle_t dev; - ze_result_t result; - int ret = 0; - - psmi_count_zeMemGetAllocProperties++; - result = psmi_zeMemGetAllocProperties(ze_context, ptr, &mem_props, - &dev); - if (result == ZE_RESULT_SUCCESS && - (mem_props.type != ZE_MEMORY_TYPE_UNKNOWN)) { - ret = 1; - _HFI_VDBG("ptr %p type %d dev %p cur_ze_dev %p\n", - ptr, mem_props.type, dev, cur_ze_dev->dev); - /* - * Check if the gpu device has changed. - * If we are trying to get the device context (!ctxt), - * don't change cur_ze_dev. - * If the buffer is allocated through zeMemAllocHost, - * there will be no device associated with it (dev == NULL). - * In this case, use the current device context. 
- */ - if (!dev) { - if (ctxt) - *ctxt = cur_ze_dev; - return ret; - } - if (ctxt || (!ctxt && dev != cur_ze_dev->dev)) { - int i; - - for (i = 0; i < num_ze_devices; i++) { - if (ze_devices[i].dev == dev) { - if (ctxt) - *ctxt = &ze_devices[i]; - else - cur_ze_dev = &ze_devices[i]; - break; - } - } - _HFI_VDBG("check ze_device[%d-%d] for dev %p: no match\n", 0, num_ze_devices-1, dev); - } - } - - return ret; -} - - -PSMI_ALWAYS_INLINE( -struct ze_dev_ctxt * -psmi_oneapi_dev_ctxt_get(const void *ptr)) -{ - struct ze_dev_ctxt *ctxt = NULL; - - _psmi_is_oneapi_ze_mem(ptr, &ctxt); - - return ctxt; -} - -#define PSMI_IS_ONEAPI_ZE_ENABLED likely(is_oneapi_ze_enabled) -#define PSMI_IS_ONEAPI_ZE_DISABLED unlikely(!is_oneapi_ze_enabled) -#define PSMI_IS_ONEAPI_ZE_MEM(ptr) _psmi_is_oneapi_ze_mem(ptr, NULL) - -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -PSMI_ALWAYS_INLINE( -void verify_device_support_unified_addr()) -{ - if (likely(_device_support_unified_addr > -1)) return; - - int num_devices, dev; - - /* Check if all devices support Unified Virtual Addressing. */ - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - _device_support_unified_addr = 1; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - int unifiedAddressing; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &unifiedAddressing, - CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, - device); - - if (unifiedAddressing !=1) { - psm3_handle_error(PSMI_EP_NORETURN, PSM2_EP_DEVICE_FAILURE, - "CUDA device %d does not support Unified Virtual Addressing.\n", - dev); - } - } - - return; -} - -PSMI_ALWAYS_INLINE( -int device_support_gpudirect()) -{ - if (likely(_device_support_gpudirect > -1)) return _device_support_gpudirect; - - int num_devices, dev; - - /* Check if all devices support GPU Direct RDMA based on version. */ - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - _device_support_gpudirect = 1; - - for (dev = 0; dev < num_devices; dev++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev); - - int major; - PSMI_CUDA_CALL(cuDeviceGetAttribute, - &major, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, - device); - if (major < 3) { - _device_support_gpudirect = 0; - _HFI_INFO("CUDA device %d does not support GPUDirect RDMA (Non-fatal error)\n", dev); - } - } - - return _device_support_gpudirect; -} - -PSMI_ALWAYS_INLINE( -int gpu_p2p_supported()) -{ - if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; - - _gpu_p2p_supported = 0; - - if (unlikely(!is_cuda_enabled)) { - _HFI_DBG("returning 0 (cuda disabled)\n"); - return 0; - } - - /* Check which devices the current device has p2p access to. 
*/ - CUdevice current_device; - CUcontext current_context; - int num_devices, dev_idx; - PSMI_CUDA_CALL(cuDeviceGetCount, &num_devices); - - if (num_devices > 1) { - PSMI_CUDA_CALL(cuCtxGetCurrent, ¤t_context); - if (current_context == NULL) { - _HFI_INFO("Unable to find active CUDA context, assuming P2P not supported\n"); - return 0; - } - PSMI_CUDA_CALL(cuCtxGetDevice, ¤t_device); - } - - for (dev_idx = 0; dev_idx < num_devices; dev_idx++) { - CUdevice device; - PSMI_CUDA_CALL(cuDeviceGet, &device, dev_idx); - - if (num_devices > 1 && device != current_device) { - int canAccessPeer = 0; - PSMI_CUDA_CALL(cuDeviceCanAccessPeer, &canAccessPeer, - current_device, device); - - if (canAccessPeer != 1) - _HFI_DBG("CUDA device %d does not support P2P from current device (Non-fatal error)\n", dev_idx); - else - _gpu_p2p_supported |= (1 << dev_idx); - } else { - /* Always support p2p on the same GPU */ - my_gpu_device = dev_idx; - _gpu_p2p_supported |= (1 << dev_idx); - } - } - - _HFI_DBG("returning (0x%x), device 0x%x (%d)\n", _gpu_p2p_supported, (1 << my_gpu_device), my_gpu_device); - return _gpu_p2p_supported; -} - -/** - * Similar to PSMI_CUDA_CALL() except does not error out - * if func(args) returns CUDA_SUCCESS or except_err - * - * Invoker must provide 'CUresult cudaerr' in invoked scope - * so invoker can inspect whether cudaerr == CUDA_SUCCESS or - * cudaerr == except_err after expanded code is executed. - * - * As except_err is an allowed value, message is printed at - * DBG level. - */ -#define PSMI_CUDA_CALL_EXCEPT(except_err, func, args...) do { \ - if (unlikely(check_set_cuda_ctxt())) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, "Failed to " \ - "set/synchronize CUDA context.\n"); \ - } \ - psmi_count_##func++; \ - cudaerr = psmi_##func(args); \ - if (cudaerr != CUDA_SUCCESS && cudaerr != except_err) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - if (cu_ctxt == NULL) \ - _HFI_ERROR( \ - "Check if CUDA is initialized" \ - "before psm3_ep_open call \n"); \ - _HFI_ERROR( \ - "CUDA failure: %s() (at %s:%d)" \ - " returned %d: %s\n", \ - #func, __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function %s.\n", #func);\ - } else if (cudaerr == except_err) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - _HFI_DBG( \ - "CUDA non-zero return value: %s() (at %s:%d)" \ - " returned %d: %s\n", \ - #func, __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - } \ - } while (0) - -#define PSMI_CUDA_CHECK_EVENT(event, cudaerr) do { \ - psmi_count_cuEventQuery++; \ - cudaerr = psmi_cuEventQuery(event); \ - if ((cudaerr != CUDA_SUCCESS) && \ - (cudaerr != CUDA_ERROR_NOT_READY)) { \ - const char *pStr = NULL; \ - psmi_count_cuGetErrorString++; \ - psmi_cuGetErrorString(cudaerr, &pStr); \ - _HFI_ERROR( \ - "CUDA failure: %s() (at %s:%d) returned %d: %s\n", \ - "cuEventQuery", __FILE__, __LINE__, cudaerr, \ - pStr?pStr:"Unknown"); \ - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, \ - "Error returned from CUDA function cuEventQuery.\n");\ - } \ - } while (0) - -#define PSMI_CUDA_DLSYM(psmi_cuda_lib,func) do { \ - psmi_##func = dlsym(psmi_cuda_lib, STRINGIFY(func)); \ - if (!psmi_##func) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - " Unable to resolve %s symbol" \ - " in CUDA libraries.\n",STRINGIFY(func));\ - } \ -} while (0) 
-#endif // PSM_CUDA - -#ifdef PSM_ONEAPI - -PSMI_ALWAYS_INLINE( -int gpu_p2p_supported()) -{ - - uint32_t num_devices = 0; - uint32_t dev; - ze_device_handle_t devices[MAX_ZE_DEVICES]; - - if (likely(_gpu_p2p_supported > -1)) return _gpu_p2p_supported; - - if (unlikely(!is_oneapi_ze_enabled)) { - _gpu_p2p_supported=0; - return 0; - } - - _gpu_p2p_supported = 0; - - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &num_devices, NULL); - if (num_devices > MAX_ZE_DEVICES) - num_devices = MAX_ZE_DEVICES; - PSMI_ONEAPI_ZE_CALL(zeDeviceGet, ze_driver, &num_devices, devices); - - for (dev = 0; dev < num_devices; dev++) { - ze_device_handle_t device; - device = devices[dev]; - - if (num_devices > 1 && device != cur_ze_dev->dev) { - ze_bool_t canAccessPeer = 0; - - PSMI_ONEAPI_ZE_CALL(zeDeviceCanAccessPeer, cur_ze_dev->dev, - device, &canAccessPeer); - if (canAccessPeer != 1) - _HFI_DBG("ONEAPI device %d does not support P2P from current device (Non-fatal error)\n", dev); - else - _gpu_p2p_supported |= (1 << dev); - } else { - /* Always support p2p on the same GPU */ - my_gpu_device = dev; - _gpu_p2p_supported |= (1 << dev); - } - } - - return _gpu_p2p_supported; -} - -#define PSMI_ONEAPI_ZE_DLSYM(lib_ptr, func) do { \ - psmi_##func = dlsym(lib_ptr, STRINGIFY(func)); \ - if (!psmi_##func) { \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "Unable to resolve %s symbol " \ - "in OneAPI Level Zero library.\n", STRINGIFY(func)); \ - } \ -} while (0) - -static inline -int _psm3_oneapi_ze_memcpy_done(const struct ips_gpu_hostbuf *ghb) -{ - ze_result_t result; - psmi_count_zeEventQueryStatus++; - - result = psmi_zeEventQueryStatus(ghb->copy_status); - if (result == ZE_RESULT_SUCCESS) { - return 1; - } else if (result == ZE_RESULT_NOT_READY) { - return 0; - } else { - _HFI_ERROR("OneAPI Level Zero failure: %s() (at %s:%d) returned 0x%x: %s\n", - "zeEventQueryStatus", __FILE__, __LINE__, result, - psmi_oneapi_ze_result_to_string(result)); - psm3_handle_error( PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "Error returned from OneAPI Level Zero function %s.\n", - "zeEventQueryStatus"); - } - return 0; -} - -#endif // PSM_ONEAPI - -#ifdef PSM_CUDA -PSMI_ALWAYS_INLINE( -int -_psmi_is_cuda_mem(const void *ptr)) -{ - CUresult cres; - CUmemorytype mt; - unsigned uvm = 0; - psmi_count_cuPointerGetAttribute++; - cres = psmi_cuPointerGetAttribute( - &mt, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr) ptr); - if ((cres == CUDA_SUCCESS) && (mt == CU_MEMORYTYPE_DEVICE)) { - psmi_count_cuPointerGetAttribute++; - cres = psmi_cuPointerGetAttribute( - &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr); - if ((cres == CUDA_SUCCESS) && (uvm == 0)) - return 1; - else - return 0; - } else - return 0; -} - -#define PSMI_IS_CUDA_ENABLED likely(is_cuda_enabled) -#define PSMI_IS_CUDA_DISABLED unlikely(!is_cuda_enabled) -#define PSMI_IS_CUDA_MEM(p) _psmi_is_cuda_mem(p) -extern void psm2_get_gpu_bars(void); - -/* - * CUDA documentation dictates the use of SYNC_MEMOPS attribute - * when the buffer pointer received into PSM has been allocated - * by the application. This guarantees that all memory operations - * to this region of memory (used by multiple layers of the stack) - * always synchronize. 
- */ -static inline -void psmi_cuda_set_attr_sync_memops(const void *ubuf) -{ - int true_flag = 1; - - PSMI_CUDA_CALL(cuPointerSetAttribute, &true_flag, - CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, (CUdeviceptr) ubuf); -} - -static inline -int _psm3_cuda_memcpy_done(const struct ips_gpu_hostbuf *chb) -{ - CUresult status; - PSMI_CUDA_CHECK_EVENT(chb->copy_status, status); - return (status == CUDA_SUCCESS); -} - -#endif /* PSM_CUDA */ - -#define COMPILE_TIME_ASSERT(NAME,COND) extern char NAME[1/ COND] - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - -extern uint64_t psm3_gpu_cache_evict; enum psm2_chb_match_type { /* Complete data found in a single chb */ @@ -1252,7 +431,7 @@ void psmi_gpu_hostbuf_alloc_func(int is_alloc, void *context, void *obj); #define GPU_HOSTBUFFER_LIMITS { \ .env = "PSM3_GPU_BOUNCEBUFFERS_MAX", \ - .descr = "Max CUDA bounce buffers (in MB)", \ + .descr = "Max GPU Pipeline bounce buffers (in MB)", \ .env_level = PSMI_ENVVAR_LEVEL_HIDDEN, \ .minval = 1, \ .maxval = 1<<30, \ @@ -1265,364 +444,8 @@ struct ips_gpu_hostbuf_mpool_cb_context { unsigned bufsz; }; -PSMI_ALWAYS_INLINE( -int -_psmi_is_gdr_copy_enabled()) -{ - return is_gdr_copy_enabled; -} - -// Only valid if called for a GPU buffer -#define PSMI_USE_GDR_COPY_RECV(len) ((len) >=1 && (len) <= gdr_copy_limit_recv) -#define PSMI_IS_GDR_COPY_ENABLED _psmi_is_gdr_copy_enabled() -#define PSM3_IS_BUFFER_GPU_MEM(buf, len) \ - ((len) && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(buf)) #endif -#ifdef PSM_CUDA - -#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ - do { \ - protoexp->cudastream_recv = NULL; \ - } while (0) -#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ - do { \ - proto->cudastream_send = NULL; \ - } while (0) -#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ - do { \ - if (protoexp->cudastream_recv != NULL) { \ - PSMI_CUDA_CALL(cuStreamDestroy, \ - protoexp->cudastream_recv); \ - } \ - } while (0) -#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ - do { \ - if (proto->cudastream_send) { \ - PSMI_CUDA_CALL(cuStreamDestroy, \ - proto->cudastream_send); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_HTOD_START(protoexp, ghb, len) \ - do { \ - if (protoexp->cudastream_recv == NULL) { \ - PSMI_CUDA_CALL(cuStreamCreate, \ - &protoexp->cudastream_recv, \ - CU_STREAM_NON_BLOCKING); \ - } \ - PSMI_CUDA_CALL(cuMemcpyHtoDAsync, \ - (CUdeviceptr)ghb->gpu_buf, ghb->host_buf, \ - len, protoexp->cudastream_recv); \ - if (ghb->copy_status == NULL) { \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } \ - PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ - protoexp->cudastream_recv); \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ - do { \ - if (proto->cudastream_send == NULL) { \ - PSMI_CUDA_CALL(cuStreamCreate, \ - &proto->cudastream_send, \ - CU_STREAM_NON_BLOCKING); \ - } \ - if (ghb->copy_status == NULL) { \ - PSMI_CUDA_CALL(cuEventCreate, \ - &ghb->copy_status, CU_EVENT_DEFAULT); \ - } \ - PSMI_CUDA_CALL(cuMemcpyDtoHAsync, \ - ghb->host_buf, (CUdeviceptr)ghb->gpu_buf, \ - len, proto->cudastream_send); \ - PSMI_CUDA_CALL(cuEventRecord, ghb->copy_status, \ - proto->cudastream_send); \ - } while (0) -#define PSM3_GPU_MEMCPY_DONE(ghb) \ - _psm3_cuda_memcpy_done(ghb) -#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ - do { \ - ghb->copy_status = NULL; \ - ghb->host_buf = NULL; \ - } while (0) -#define PSM3_GPU_HOSTBUF_RESET(ghb) \ - do { \ - } while (0) -#define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ - do { \ - if (ghb->copy_status != NULL) { \ - PSMI_CUDA_CALL(cuEventDestroy, \ - 
ghb->copy_status); \ - } \ - if (ghb->host_buf != NULL) { \ - PSMI_CUDA_CALL(cuMemFreeHost, \ - ghb->host_buf); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpyDtoD, (CUdeviceptr)(dstptr), (CUdeviceptr)(srcptr), (len)); } while (0) -#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpyHtoD, (CUdeviceptr)(dstptr), (srcptr), (len)); } while (0) -#define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ - do {PSMI_CUDA_CALL(cuStreamSynchronize, 0);} while (0) -#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ - do { \ - PSMI_CUDA_CALL(cuMemHostAlloc, (void **)(ret_ptr), \ - (size),CU_MEMHOSTALLOC_PORTABLE); \ - } while (0) -#define PSM3_GPU_HOST_FREE(ptr) \ - do { \ - PSMI_CUDA_CALL(cuMemFreeHost, (void *)ptr); \ - } while (0) -// HOST_ALLOC memory treated as CPU memory for Verbs MRs -#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) -#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed ) -#define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { psmi_cuda_set_attr_sync_memops(buf); } while (0) -#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpyDtoH, dstptr, (CUdeviceptr)(srcptr), len); } while (0) -#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) \ - do { PSMI_CUDA_CALL(cuMemcpy, (CUdeviceptr)(dstptr), (CUdeviceptr)(srcptr), len); } while (0) -#define PSMI_IS_GPU_ENABLED PSMI_IS_CUDA_ENABLED -#define PSMI_IS_GPU_DISABLED PSMI_IS_CUDA_DISABLED -#define PSMI_IS_GPU_MEM(x) PSMI_IS_CUDA_MEM(x) - -#elif defined(PSM_ONEAPI) -#define PSM3_GPU_PREPARE_HTOD_MEMCPYS(protoexp) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) \ - protoexp->cq_recvs[i] = NULL; \ - } while (0) -#define PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) \ - proto->cq_sends[i] = NULL; \ - } while (0) -#define PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) { \ - if (protoexp->cq_recvs[i]) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - protoexp->cq_recvs[i]); \ - protoexp->cq_recvs[i] = NULL; \ - } \ - } \ - } while (0) -#define PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto) \ - do { \ - int i; \ - \ - for (i = 0; i < MAX_ZE_DEVICES; i++) { \ - if (proto->cq_sends[i]) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueDestroy, \ - proto->cq_sends[i]); \ - proto->cq_sends[i] = NULL; \ - } \ - } \ - } while (0) - -#define PSM3_GPU_MEMCPY_HTOD_START(protoexp, ghb, len) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - struct ze_dev_ctxt *ctxt; \ - int inx; \ - \ - ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ - if (!ctxt) \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "%s HTOD: unknown GPU device for addr %p\n", \ - __FUNCTION__, ghb->gpu_buf);\ - if (ghb->event_pool == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - } \ - if (ghb->copy_status == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - } \ - inx = ctxt->dev_index; \ - if (! 
ghb->command_lists[inx]) { \ - psmi_oneapi_async_cmd_create(ctxt, \ - &protoexp->cq_recvs[inx], \ - &ghb->command_lists[inx]); \ - } \ - ghb->cur_dev_inx = inx; \ - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_lists[inx], \ - ghb->gpu_buf, ghb->host_buf, len, \ - ghb->copy_status, 0, NULL); \ - if (! psm3_oneapi_immed_async_copy) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_lists[inx]); \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - protoexp->cq_recvs[inx], 1, \ - &ghb->command_lists[inx], NULL); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOH_START(proto, ghb, len) \ - do { \ - ze_event_pool_desc_t pool_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, \ - .flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE, \ - .count = 1 \ - }; \ - ze_event_desc_t event_desc = { \ - .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, \ - .signal = ZE_EVENT_SCOPE_FLAG_HOST, \ - .wait = ZE_EVENT_SCOPE_FLAG_HOST, \ - .index = 0 \ - }; \ - struct ze_dev_ctxt *ctxt; \ - int inx; \ - \ - ctxt = psmi_oneapi_dev_ctxt_get(ghb->gpu_buf); \ - if (!ctxt) \ - psm3_handle_error(PSMI_EP_NORETURN, \ - PSM2_INTERNAL_ERR, \ - "%s DTOH: unknown GPU device for addr %p\n", \ - __FUNCTION__, ghb->gpu_buf);\ - if (ghb->event_pool == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolCreate, \ - ze_context, &pool_desc, 0, NULL, \ - &ghb->event_pool); \ - } \ - if (ghb->copy_status == NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventCreate, \ - ghb->event_pool, &event_desc, \ - &ghb->copy_status); \ - } \ - inx = ctxt->dev_index; \ - if (! ghb->command_lists[inx]) { \ - psmi_oneapi_async_cmd_create(ctxt, \ - &proto->cq_sends[inx], \ - &ghb->command_lists[inx]); \ - } \ - ghb->cur_dev_inx = inx; \ - PSMI_ONEAPI_ZE_CALL(zeCommandListAppendMemoryCopy, \ - ghb->command_lists[inx], \ - ghb->host_buf, ghb->gpu_buf, len, \ - ghb->copy_status, 0, NULL); \ - if (! psm3_oneapi_immed_async_copy) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListClose, \ - ghb->command_lists[inx]); \ - PSMI_ONEAPI_ZE_CALL(zeCommandQueueExecuteCommandLists,\ - proto->cq_sends[inx], 1, \ - &ghb->command_lists[inx], NULL); \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DONE(ghb) \ - _psm3_oneapi_ze_memcpy_done(ghb) -#define PSM3_GPU_HOSTBUF_LAZY_INIT(ghb) \ - do { \ - int i; \ - \ - ghb->event_pool = NULL; \ - ghb->copy_status = NULL; \ - for (i = 0; i < MAX_ZE_DEVICES; i++) \ - ghb->command_lists[i] = NULL; \ - ghb->host_buf = NULL; \ - } while (0) -#define PSM3_GPU_HOSTBUF_RESET(ghb) \ - do { \ - if (! 
psm3_oneapi_immed_async_copy) { \ - PSMI_ONEAPI_ZE_CALL(zeCommandListReset, \ - ghb->command_lists[ghb->cur_dev_inx]);\ - } \ - PSMI_ONEAPI_ZE_CALL(zeEventHostReset, \ - ghb->copy_status); \ - } while (0) -#define PSM3_GPU_HOSTBUF_DESTROY(ghb) \ - do { \ - int i; \ - \ - if (ghb->copy_status != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventDestroy, \ - ghb->copy_status); \ - } \ - if (ghb->host_buf != NULL) { \ - PSM3_ONEAPI_ZE_HOST_FREE(ghb->host_buf); \ - } \ - if (ghb->event_pool != NULL) { \ - PSMI_ONEAPI_ZE_CALL(zeEventPoolDestroy, \ - ghb->event_pool); \ - } \ - for (i = 0; i < MAX_ZE_DEVICES; i++) { \ - if (ghb->command_lists[i]) { \ - PSMI_ONEAPI_ZE_CALL( \ - zeCommandListDestroy, \ - ghb->command_lists[i]); \ - ghb->command_lists[i] = NULL; \ - } \ - } \ - } while (0) -#define PSM3_GPU_MEMCPY_DTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy_DTOD(dstptr, srcptr, len); } while(0) -#define PSM3_GPU_MEMCPY_HTOD(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while(0) -#define PSM3_GPU_SYNCHRONIZE_MEMCPY() \ - do { /* not needed for OneAPI ZE */ } while (0) -#ifdef PSM3_USE_ONEAPI_MALLOC -#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ - do { \ - *ret_ptr = psm3_oneapi_ze_host_alloc_malloc(size); \ - } while (0) -#define PSM3_ONEAPI_ZE_HOST_FREE(ptr) \ - psm3_oneapi_ze_host_free_malloc(ptr) -// HOST_ALLOC memory treated as CPU memory for Verbs MRs -#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && ! (mqreq)->gpu_hostbuf_used ) -#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed ) -#else /* PSM3_USE_ONEAPI_MALLOC */ -#define PSM3_GPU_HOST_ALLOC(ret_ptr, size) \ - do { \ - *ret_ptr = (*psm3_oneapi_ze_host_alloc)(size); \ - } while (0) -#define PSM3_ONEAPI_ZE_HOST_FREE(ptr) \ - (*psm3_oneapi_ze_host_free)(ptr) -// HOST_ALLOC memory treated as GPU memory for Verbs MRs -// Note: gpu_hostbuf_used" only set if is_buf_gpu_mem -#define PSM3_GPU_ADDR_SEND_MR(mqreq) \ - ( (mqreq)->is_buf_gpu_mem && \ - (! (mqreq)->gpu_hostbuf_used || psm3_oneapi_ze_using_zemem_alloc )) -#define PSM3_GPU_ADDR_RECV_MR(tidrecvc, mqreq) \ - ( (tidrecvc)->is_ptr_gpu_backed \ - || ((mqreq)->gpu_hostbuf_used && psm3_oneapi_ze_using_zemem_alloc)) -#endif /* PSM3_USE_ONEAPI_MALLOC */ -#define PSM3_GPU_HOST_FREE(ptr) PSM3_ONEAPI_ZE_HOST_FREE(ptr) -#define PSM3_MARK_BUF_SYNCHRONOUS(buf) do { /* not needed for OneAPI ZE */ } while (0) -#define PSM3_GPU_MEMCPY_DTOH(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) -#define PSM3_GPU_MEMCPY(dstptr, srcptr, len) \ - do { psmi_oneapi_ze_memcpy(dstptr, srcptr, len); } while (0) -#define PSMI_IS_GPU_ENABLED PSMI_IS_ONEAPI_ZE_ENABLED -#define PSMI_IS_GPU_DISABLED PSMI_IS_ONEAPI_ZE_DISABLED -#define PSMI_IS_GPU_MEM(x) PSMI_IS_ONEAPI_ZE_MEM(x) - -void psm3_put_ipc_handle(const void *buf, ze_ipc_mem_handle_t ipc_handle); - -#endif /* elif PSM_ONEAPI */ - - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/prov/psm3/psm3/psm_utils.c b/prov/psm3/psm3/psm_utils.c index 0f1a3fe1d5d..d3f272a2041 100644 --- a/prov/psm3/psm3/psm_utils.c +++ b/prov/psm3/psm3/psm_utils.c @@ -2495,7 +2495,7 @@ int psm3_parse_memmode(void) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // we need PSM3_GPUDIRECT config early to influence rdmamode defaults, // MR Cache mode and whether we need to open RV. 
// As such we don't check PSMI_HAL_CAP_GPUDIRECT flag here, but @@ -2552,7 +2552,7 @@ unsigned psmi_parse_gpudirect_rdma_send_limit(int force) psm3_getenv_range("PSM3_GPUDIRECT_RDMA_SEND_LIMIT", "GPUDirect RDMA feature on send side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val)UINT_MAX, + (union psmi_envvar_val)psm3_gpu_gpudirect_rdma_send_limit_default, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, &envval); @@ -2584,11 +2584,7 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) psm3_getenv_range("PSM3_GPUDIRECT_RDMA_RECV_LIMIT", "GPUDirect RDMA feature on receive side will be switched off for messages larger than limit.", NULL, PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_UINT, -#ifdef PSM_CUDA - (union psmi_envvar_val)UINT_MAX, -#elif defined(PSM_ONEAPI) - (union psmi_envvar_val)1, -#endif + (union psmi_envvar_val)psm3_gpu_gpudirect_rdma_recv_limit_default, (union psmi_envvar_val)0, (union psmi_envvar_val)UINT_MAX, NULL, NULL, &envval); @@ -2597,9 +2593,9 @@ unsigned psmi_parse_gpudirect_rdma_recv_limit(int force) have_value = 1; return saved; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif // PSM_HAVE_GPU -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Size of RV GPU Cache - only used for PSM3_GPUDIRECT=1 * otherwise returns 0 */ @@ -2619,7 +2615,7 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) // min size is (HFI_TF_NFLOWS + ep->hfi_num_send_rdma) * // chunk size (psm3_mq_max_window_rv(mq, 1) after // psmi_mq_initialize_params) - if (PSMI_IS_GPU_ENABLED && psmi_parse_gpudirect() ) { + if (PSM3_GPU_IS_ENABLED && psmi_parse_gpudirect() ) { psm3_getenv("PSM3_RV_GPU_CACHE_SIZE", "kernel space GPU cache size" " (MBs, 0 lets rv module decide) [0]", @@ -2634,7 +2630,7 @@ unsigned psmi_parse_gpudirect_rv_gpu_cache_size(int reload) return saved; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif // PSM_HAVE_GPU #ifdef PSM_HAVE_REG_MR /* Send DMA Enable */ @@ -2817,32 +2813,13 @@ void psm3_print_rank_identify(void) if (identify_shown) return; -#ifdef PSM_CUDA - char cudart_ver[64] = "unknown"; - if (cuda_runtime_ver) - snprintf(cudart_ver, sizeof(cudart_ver), "%d.%d", - cuda_runtime_ver / 1000, (cuda_runtime_ver % 1000) / 10); - snprintf(accel_vers, sizeof(accel_vers), "%s %s CUDA Runtime %s built against interface %d.%d\n", - psm3_get_mylabel(), psm3_ident_tag, - cudart_ver, CUDA_VERSION / 1000, (CUDA_VERSION % 1000) / 10); -#elif defined(PSM_ONEAPI) - char ze_api_ver[64] = "unknown"; - char ze_loader_ver[64] = "unknown"; - if (zel_api_version) - snprintf(ze_api_ver, sizeof(ze_api_ver), "%d.%d", - ZE_MAJOR_VERSION(zel_api_version), ZE_MINOR_VERSION(zel_api_version)); - if (zel_lib_version.major || zel_lib_version.minor || zel_lib_version.patch) - snprintf(ze_loader_ver, sizeof(ze_loader_ver), "v%d.%d.%d", - zel_lib_version.major, zel_lib_version.minor, zel_lib_version.patch); - snprintf(accel_vers, sizeof(accel_vers), "%s %s Level-Zero Runtime %s (%s) built against interface %d.%d\n", - psm3_get_mylabel(), psm3_ident_tag, - ze_api_ver, ze_loader_ver, - ZE_MAJOR_VERSION(ZE_API_VERSION_CURRENT), ZE_MINOR_VERSION(ZE_API_VERSION_CURRENT)); +#ifdef PSM_HAVE_GPU + PSM3_GPU_IDENTIFY(accel_vers, sizeof(accel_vers)); #endif identify_shown = 1; strcat(strcat(ofed_delta," built for IEFS OFA DELTA "),psm3_IEFS_version); - psm3_print_identify("%s %s PSM3 v%d.%d%s%s\n" + psm3_print_identify("%s %s PSM3 v%d.%d"PSM3_GPU_TYPES"%s\n" "%s %s location %s\n" "%s %s build 
date %s\n" "%s %s src checksum %s\n" @@ -2853,13 +2830,6 @@ void psm3_print_rank_identify(void) "%s %s CPU Core %d NUMA %d PID %d\n", psm3_get_mylabel(), psm3_ident_tag, PSM2_VERNO_MAJOR,PSM2_VERNO_MINOR, -#ifdef PSM_CUDA - "-cuda", -#elif defined(PSM_ONEAPI) - "-oneapi-ze", -#else - "", -#endif (strcmp(psm3_IEFS_version,"") != 0) ? ofed_delta : "", psm3_get_mylabel(), psm3_ident_tag, dladdr(psm3_init, &info_psm) ? @@ -4032,6 +4002,28 @@ psmi_coreopt_ctl(const void *core_obj, int optname, epaddr->usr_ep_ctxt = optval; } break; + case PSM2_CORE_OPT_EP_CUDA_PERMITTED: + { + psm2_ep_t ep_core = (psm2_ep_t)core_obj; + if (!ep_core) + return psm3_handle_error(NULL, PSM2_PARAM_ERR, "Invalid endpoint"); + + if (*optlen < sizeof(bool)) { + err = psm3_handle_error(NULL, PSM2_PARAM_ERR, + "Option len insufficient for bool (%"PRIu64")", *optlen); + *optlen = sizeof(bool); + return err; + } + + PSM_EP_FOR_EACH_MCTXT(ep_core, ep) { + err = get + ? PSM3_GPU_GET_CUDA_PERMITTED(ep, (bool *)optval) + : PSM3_GPU_SET_CUDA_PERMITTED(ep, *(bool *)optval); + if (err) + return err; + } + } + break; default: /* Unknown/unrecognized option */ err = psm3_handle_error(NULL, @@ -5067,12 +5059,51 @@ void psm3_touch_mmap(void *m, size_t bytes) void psm3_memcpy(void *dest, const void *src, uint32_t len) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (len && PSMI_IS_GPU_ENABLED && - (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *)src))) { +#ifdef PSM_HAVE_GPU + if (len && + (PSM3_IS_GPU_MEM(dest) || PSM3_IS_GPU_MEM((void *)src))) { PSM3_GPU_MEMCPY(dest, src, len); return; } #endif memcpy(dest, src, len); } + +void psm3_ep_memcpy(psm2_ep_t ep, void *dest, const void *src, uint32_t len) +{ +#ifdef PSM_HAVE_GPU + // if CUDA is disallowed, attempt gdrcopy instead + if_pf (!len) + return; + + const bool src_is_gpu = PSM3_IS_GPU_MEM(src); + const bool dest_is_gpu = PSM3_IS_GPU_MEM(dest); + + if (src_is_gpu || dest_is_gpu) { + // if the GPU HAL provides memcpy, prefer it + if (PSM3_GPU_IS_MEMCPY_PERMITTED(ep)) { + PSM3_GPU_MEMCPY(dest, src, len); + return; + } + + // otherwise, avoid GPU-driven memcpy paths by mapping the + // device buffer and issuing a CPU driven gdrcopy + if (src_is_gpu) { + src = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)src, len, 0, ep); + psmi_assert_always(src); + } + if (dest_is_gpu) { + dest = psmi_hal_gdr_convert_gpu_to_host_addr( + (unsigned long)dest, len, 0, ep); + psmi_assert_always(dest); + } + + // buffers cpu-accessible; fall through to host memcpy + } +#else + // no GPU support: fall through to host memcpy +#endif + + memcpy(dest, src, len); +} diff --git a/prov/psm3/psm3/psm_utils.h b/prov/psm3/psm3/psm_utils.h index 57742fc39ea..fc33bb25b5b 100644 --- a/prov/psm3/psm3/psm_utils.h +++ b/prov/psm3/psm3/psm_utils.h @@ -437,7 +437,7 @@ void psm3_print_identify(const char *fmt, ...) 
\ #ifdef PSM_HAVE_REG_MR unsigned psm3_parse_senddma(void); #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU unsigned psmi_parse_gpudirect(void); unsigned psmi_parse_gpudirect_rdma_send_limit(int force); unsigned psmi_parse_gpudirect_rdma_recv_limit(int force); @@ -455,36 +455,6 @@ void psm3_syslog(psm2_ep_t ep, int to_console, int level, void *psm3_memcpyo(void *dst, const void *src, size_t n); uint32_t psm3_crc(unsigned char *buf, int len); -/* - * Internal CPUID detection - */ -#define CPUID_FAMILY_MASK 0x00000f00 -#define CPUID_MODEL_MASK 0x000000f0 -#define CPUID_EXMODEL_MASK 0x000f0000 - -/* - * CPUID return values - */ -#define CPUID_FAMILY_XEON 0x00000600 -/* - * cpuid function 0, returns "GeniuneIntel" in EBX,ECX,EDX - * due to Little Endian and Hex it is not so obvious - */ -#define CPUID_GENUINE_INTEL_EBX 0x756e6547 /* "uneG" - Little Endian "Genu" */ -#define CPUID_GENUINE_INTEL_ECX 0x6c65746e /* "Ieni" - Little Endian "ineI" */ -#define CPUID_GENUINE_INTEL_EDX 0x49656e69 /* "letn" - Little Endian "ntel" */ - -/* - * These values are internal only, not real register values - */ -#define CPUID_GENUINE_INTEL 0xf0000000 -#define CPUID_MODEL_UNDEFINED -1 - -/* - * Global model so we can tune defaults better for specific cpu's - */ -extern uint32_t psm3_cpu_model; - /* * Diagnostics, all in psm_diags.c */ diff --git a/prov/psm3/psm3/psm_verbs_mr.c b/prov/psm3/psm3/psm_verbs_mr.c index fa8fdf39499..d17d3fd5177 100644 --- a/prov/psm3/psm3/psm_verbs_mr.c +++ b/prov/psm3/psm3/psm_verbs_mr.c @@ -246,7 +246,7 @@ struct psm2_mr_cache { uint32_t limit_nonpri_inuse; uint64_t limit_nonpri_inuse_bytes; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t limit_nonpri_gpu_inuse_bytes; #endif psm3_rv_t rv; @@ -309,7 +309,7 @@ struct psm2_mr_cache { uint64_t inuse_send_bytes; uint64_t max_inuse_send_bytes; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t gpu_inuse_bytes; uint64_t max_gpu_inuse_bytes; uint64_t gpu_inuse_recv_bytes; @@ -323,7 +323,7 @@ struct psm2_mr_cache { #ifdef PSM_HAVE_RNDV_MOD struct psm3_rv_cache_stats rv_stats; // statistics from rv module // will remain 0 if rv not open -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct psm3_rv_gpu_cache_stats rv_gpu_stats; // GPU statistics from rv module // will remain 0 if rv not open #endif @@ -393,13 +393,11 @@ static int mr_cache_key_cmp(const struct psm3_verbs_mr *a, return -1; else if (a->access > b->access) return 1; -#ifdef PSM_ONEAPI - if (a->alloc_id < b->alloc_id) - return -1; - else if (a->alloc_id > b->alloc_id) - return 1; -#endif +#ifdef PSM_HAVE_GPU + return PSM3_GPU_CMP_MR(&a->gpu_specific, &b->gpu_specific); +#else return 0; +#endif } // rbtree.c uses these defines to establish some of it's code and @@ -766,7 +764,7 @@ static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length, { INC_STAT(cache, inuse, max_inuse); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) ADD_STAT(cache, length, gpu_inuse_bytes, max_gpu_inuse_bytes); else @@ -778,7 +776,7 @@ static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length, if (access & IBV_ACCESS_REMOTE_WRITE) { INC_STAT(cache, inuse_recv, max_inuse_recv); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) ADD_STAT(cache, length, gpu_inuse_recv_bytes, 
max_gpu_inuse_recv_bytes); else @@ -788,7 +786,7 @@ static void update_stats_inc_inuse(psm2_mr_cache_t cache, uint64_t length, } else { INC_STAT(cache, inuse_send, max_inuse_send); #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) ADD_STAT(cache, length, gpu_inuse_send_bytes, max_gpu_inuse_send_bytes); else @@ -805,7 +803,7 @@ static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length, { cache->inuse--; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) cache->gpu_inuse_bytes -= length; else @@ -817,7 +815,7 @@ static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length, if (access & IBV_ACCESS_REMOTE_WRITE) { cache->inuse_recv--; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) cache->gpu_inuse_recv_bytes -= length; else @@ -827,7 +825,7 @@ static void update_stats_dec_inuse(psm2_mr_cache_t cache, uint64_t length, } else { cache->inuse_send--; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) cache->gpu_inuse_send_bytes -= length; else @@ -869,7 +867,7 @@ static void update_stats_inc_full(psm2_mr_cache_t cache, bool priority, psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, uint32_t limit_entries, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) @@ -885,20 +883,20 @@ psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, return PSM2_PARAM_ERR; } cache->limit_nonpri_inuse_bytes = (uint64_t)ep->rv_mr_cache_size*MEGABYTE - pri_size; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { // For GPU, due to GdrCopy, we can't undersize cache. // Otherwise RDMA MRs could consume all the // cache space and leave a gdrcopy pin/mmap stuck // retrying indefinitely. 
If we want to allow undersize // GPU cache, we need to have gdrcopy pin/mmap failures // also invoke progress functions to release MRs - if (psm3_min_gpu_bar_size()) { - uint64_t max_recommend = psm3_min_gpu_bar_size() - 32*MEGABYTE; + if (PSM3_GPU_MIN_BAR_SIZE()) { + uint64_t max_recommend = PSM3_GPU_MIN_BAR_SIZE() - 32*MEGABYTE; if ((uint64_t)ep->rv_gpu_cache_size*MEGABYTE >= max_recommend) { _HFI_INFO("Warning: PSM3_RV_GPU_CACHE_SIZE=%u too large for smallest GPU's BAR size of %"PRIu64" (< %"PRIu64" total of endpoint-rail-qp recommended)\n", ep->rv_gpu_cache_size, - (psm3_min_gpu_bar_size() + MEGABYTE-1)/MEGABYTE, + (PSM3_GPU_MIN_BAR_SIZE() + MEGABYTE-1)/MEGABYTE, max_recommend/MEGABYTE); } } @@ -911,7 +909,7 @@ psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, } _HFI_MMDBG("CPU cache %u GPU cache %u\n", ep->rv_mr_cache_size, ep->rv_gpu_cache_size); -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ return PSM2_OK; } #endif // PSM_HAVE_RNDV_MOD @@ -924,14 +922,14 @@ psm2_error_t set_cache_limit_nonpri_rv_kern(psm2_mr_cache_t cache, psm2_error_t set_cache_limit_nonpri_user(psm2_mr_cache_t cache, uint32_t limit_entries, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) { cache->limit_nonpri_inuse_bytes = cache->limit_bytes - pri_size; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // N/A to GPU cache->limit_nonpri_gpu_inuse_bytes = 0; #endif @@ -944,14 +942,14 @@ psm2_error_t set_cache_limit_nonpri_user(psm2_mr_cache_t cache, psm2_error_t set_cache_limit_nonpri_none(psm2_mr_cache_t cache, uint32_t limit_entries, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) { cache->limit_nonpri_inuse_bytes = UINT64_MAX; #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // N/A to GPU cache->limit_nonpri_gpu_inuse_bytes = 0; #endif @@ -964,7 +962,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, uint32_t limit_entries, uint8_t cache_mode, uint32_t limit_size_mb, uint32_t pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ) @@ -1010,7 +1008,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_not_user; cache->release_mr_fn = psm3_verbs_release_mr_not_user; err = set_cache_limit_nonpri_none(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1021,7 +1019,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_not_user; cache->release_mr_fn = psm3_verbs_release_mr_not_user; err = set_cache_limit_nonpri_rv_kern(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1032,7 +1030,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_user; cache->release_mr_fn = psm3_verbs_release_mr_user; err = set_cache_limit_nonpri_user(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1042,7 +1040,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->reg_mr_fn = psm3_verbs_reg_mr_not_user; cache->release_mr_fn = 
psm3_verbs_release_mr_user_noinval; err = set_cache_limit_nonpri_user(cache, limit_entries, pri_entries, pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , gpu_pri_size #endif ); @@ -1060,7 +1058,7 @@ psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, cache->cmd_fd = ep->cmd_fd; #endif // PSM_HAVE_RNDV_MOD cache->pd = ep->pd; -#if defined(PSM_HAVE_RNDV_MOD) && (defined(PSM_CUDA) || defined(PSM_ONEAPI)) +#if defined(PSM_HAVE_RNDV_MOD) && defined(PSM_HAVE_GPU) _HFI_MMDBG("cache alloc: limit_entries=%u limit_bytes=%"PRIu64" limit_nonpri_inuse=%u limit_nonpri_inuse_bytes=%"PRIu64" limit_nonpri_gpu_inuse_bytes=%"PRIu64", pri_entries=%u pri_size=%"PRIu64" gpu_pri_size=%"PRIu64"\n", cache->limit_entries, cache->limit_bytes, cache->limit_nonpri_inuse, cache->limit_nonpri_inuse_bytes, cache->limit_nonpri_gpu_inuse_bytes, @@ -1128,7 +1126,7 @@ int psm3_verbs_mr_cache_allows_user_mr(psm2_mr_cache_t cache) static inline int have_nonpri_space(psm2_mr_cache_t cache, uint64_t length, int access) { #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (access & IBV_ACCESS_IS_GPU_ADDR) return (cache->inuse < cache->limit_nonpri_inuse && cache->gpu_inuse_bytes + length < cache->limit_nonpri_gpu_inuse_bytes); @@ -1356,13 +1354,13 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, // user space QPs for everything mrc->mr.rv_mr = psm3_rv_reg_mem(cache->rv, cache->cmd_fd, cache->pd, key->addr, key->length, key->access -#ifdef PSM_ONEAPI - , key->alloc_id +#ifdef PSM_HAVE_GPU + , &key->gpu_specific #endif ); if (! mrc->mr.rv_mr) { save_errno = errno; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (save_errno == ENOMEM && priority) (void)psm3_gpu_evict_some(cache->ep, key->length, key->access); #endif @@ -1377,13 +1375,13 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, (key->access&IBV_ACCESS_RDMA)?NULL :cache->pd, key->addr, key->length, key->access -#ifdef PSM_ONEAPI - , key->alloc_id +#ifdef PSM_HAVE_GPU + , &key->gpu_specific #endif ); if (! mrc->mr.rv_mr) { save_errno = errno; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (save_errno == ENOMEM && priority) (void)psm3_gpu_evict_some(cache->ep, key->length, key->access); #endif @@ -1411,8 +1409,8 @@ static psm3_verbs_mr_t prep_and_reg_mr(psm2_mr_cache_t cache, mrc->addr = key->addr; mrc->length = key->length; mrc->access = key->access; -#ifdef PSM_ONEAPI - mrc->alloc_id = key->alloc_id; +#ifdef PSM_HAVE_GPU + mrc->gpu_specific = key->gpu_specific; #endif ADD_STAT(cache, mrc->length, registered_bytes, max_registered_bytes); /* Reset the fail counter */ @@ -1636,80 +1634,21 @@ struct psm3_verbs_mr * psm3_verbs_reg_mr(psm2_mr_cache_t cache, bool priority, return NULL; } #else -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - psmi_assert(!!(access & IBV_ACCESS_IS_GPU_ADDR) == (PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(addr))); -#ifdef PSM_ONEAPI - if (access & IBV_ACCESS_IS_GPU_ADDR) { -#define MAX_USER_MR_SIZE (32 * 1024) - void *base; - size_t len; - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, - (const void *)addr, &base, &len); - /* - * Need to register MR with base address and total length. - * However, for Mellanox cards, the max buffer size for a - * user MR registered through the rv module is 32k bytes. - * Otherwise, it will fail with IB_WC_MW_BIND_ERR. 
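/*
 * Illustrative sketch (not part of the patch): the OneAPI-specific rounding
 * removed in this hunk (now hidden behind PSM3_GPU_ROUNDUP_RV_REG_MR) widens
 * a requested registration to a chunk of the underlying allocation, capped by
 * a device limit, so nearby sub-buffers of one allocation can share an MR.
 * A simplified, hypothetical version of that chunking, assuming limit != 0:
 */
#include <stdint.h>

/* Expand [*addr, *addr + *len) to the limit-sized chunk of the allocation
 * [base, base + alloc_len) that contains it, but only when the request still
 * fits entirely inside that chunk; otherwise leave the request unchanged. */
static void roundup_reg_mr_sketch(uint64_t base, uint64_t alloc_len,
				  uint64_t limit, uint64_t *addr, uint64_t *len)
{
	uint64_t end = base + alloc_len;		/* one past the allocation */
	uint64_t offset = *addr - base;			/* request offset in allocation */
	uint64_t start = base + (offset / limit) * limit; /* chunk start */
	uint64_t chunk = end - start;
	if (chunk > limit)
		chunk = limit;
	if (*addr + *len <= start + chunk) {		/* request fits in the chunk */
		*addr = start;
		*len = chunk;
	}
}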
For fast - * registration MR through RV (kernel MR and GPU MR), there - * is also a upper limit (max_fast_reg_page_list_len) imposed - * by the underlying RDMA device (eg 256MB for mlx5). - */ - if (strncasecmp(cache->ep->dev_name, "mlx5_0", 3) == 0 && - !(access & IBV_ACCESS_KERNEL)) { - if (len > MAX_USER_MR_SIZE) { - /* - * Change only if the buffer stays in the first - * 32k - */ - if (((char *)addr + length) <= - ((char *)base + MAX_USER_MR_SIZE)) { - addr = base; - length = MAX_USER_MR_SIZE; - } - } else { - addr = base; - length = len; - } - } else { - uint64_t start, end; - uint64_t mr_len; - uint64_t offset; - uint64_t limit = cache->ep->verbs_ep.max_fmr_size; - - /* Buffer end + 1 */ - end = (uint64_t)base + len; - /* Offset of the requested buffer chunk */ - offset = (uint64_t)addr - (uint64_t)base; - /* Start address of next MR */ - start = (uint64_t)base + (offset / limit) * limit; - mr_len = end - start; - if (mr_len > limit) - mr_len = limit; - /* - * Change only if the chunk does not cross the - * (start + mr_len) boundary, Otherwise, - * Just register the requested chunk. - */ - if (((uint64_t)addr + length) <= (start + mr_len)) - { - addr = (void *)start; - length = mr_len; - } - } - } -#endif /* PSM_ONEAPI */ -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#ifdef PSM_HAVE_GPU + psmi_assert(!!(access & IBV_ACCESS_IS_GPU_ADDR) == (PSM3_IS_GPU_MEM(addr))); + if (access & IBV_ACCESS_IS_GPU_ADDR) + PSM3_GPU_ROUNDUP_RV_REG_MR(cache->ep, &addr, &length, access); +#endif /* PSM_HAVE_GPU */ #endif /* PSM_HAVE_RNDV_MOD */ struct psm3_verbs_mr key = { // our search key .addr = addr, .length = length, .access = access, -#ifdef PSM_ONEAPI - .alloc_id = (access & IBV_ACCESS_IS_GPU_ADDR)? - psm3_oneapi_ze_get_alloc_id(addr, NULL) : 0 -#endif }; +#ifdef PSM_HAVE_GPU + PSM3_GPU_INIT_MR(addr, length, access, &key.gpu_specific); +#endif _HFI_MMDBG("pri %d "MRC_FMT"\n", priority, MR_OUT_MRC(&key)); return (*cache->reg_mr_fn)(cache, priority, &key); @@ -1817,9 +1756,9 @@ void psm3_verbs_release_mr(struct psm3_verbs_mr *mrc) void psm3_verbs_free_mr_cache(psm2_mr_cache_t cache) { // don't pollute stats with our shutdown activity -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_HAVE_RNDV_MOD - if (cache->rv && PSMI_IS_GPU_ENABLED) + if (cache->rv && PSM3_GPU_IS_ENABLED) psm3_stats_deregister_type(PSMI_STATSTYPE_MR_CACHE, &cache->rv_gpu_stats); #endif @@ -2310,11 +2249,11 @@ static uint64_t mr_cache_rv_miss_rate(void *context) return 0; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static uint64_t mr_cache_rv_gpu_limit_size(void *context) { psm2_mr_cache_t cache = container_of(context, struct psm2_mr_cache, rv_gpu_stats); - if (cache->rv && PSMI_IS_GPU_ENABLED ) { + if (cache->rv && PSM3_GPU_IS_ENABLED ) { // this is a little sly, we know the stats processing routines will // call the accessors in the order from the entries list // so we use the 1st of the rv statistics accessors to get @@ -2447,7 +2386,7 @@ static uint64_t mr_cache_rv_gpu_miss_rate_mmap(void *context) else return 0; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #endif // PSM_HAVE_RNDV_MOD @@ -2478,7 +2417,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) MPSPAWN_STATS_REDUCTION_ALL, NULL, &cache->limit_nonpri_inuse_bytes), #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECL("limit_nonpri_gpu_inuse_bytes", "Limit of total registered non-priority GPU MR bytes in cache", 
MPSPAWN_STATS_REDUCTION_ALL, @@ -2572,7 +2511,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) &cache->umr_cache.stats.max_dereg_queued_cnt), #endif /* UMR_CACHE */ #ifdef PSM_HAVE_RNDV_MOD -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("gpu_inuse_bytes", "Current registered GPU MR bytes with an active IO", &cache->gpu_inuse_bytes), @@ -2778,7 +2717,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) PSMI_HOWMANY(entries), psm3_epid_fmt_internal(cache->ep->epid, 0), cache, cache->ep->dev_name); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #ifdef PSM_HAVE_RNDV_MOD struct psmi_stats_entry gpu_entries[] = { PSMI_STATS_DECL_HELP("Kernel RV GPU Cache Configuration:"), @@ -3009,7 +2948,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) "Number of GPU RDMA write bytes successfully posted", (uint64_t*)&cache->rv_gpu_stats.gpu_post_write_bytes), }; - if (cache->rv && PSMI_IS_GPU_ENABLED && cache->ep->rv_gpu_cache_size) { + if (cache->rv && PSM3_GPU_IS_ENABLED && cache->ep->rv_gpu_cache_size) { psm3_stats_register_type("MR_GPU_Cache_Statistics", "Kernel RV GPU MR and mmap cache for an endpoint in the process\n" "When Direct GPU transfers are enabled, an additional " @@ -3031,7 +2970,7 @@ static void register_cache_stats(psm2_mr_cache_t cache) &cache->rv_gpu_stats, cache->ep->dev_name); } #endif /* PSM_HAVE_RNDV_MOD */ -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ } #endif /* PSM_HAVE_REG_MR */ diff --git a/prov/psm3/psm3/psm_verbs_mr.h b/prov/psm3/psm3/psm_verbs_mr.h index 83c34944a84..e04bc759e13 100644 --- a/prov/psm3/psm3/psm_verbs_mr.h +++ b/prov/psm3/psm3/psm_verbs_mr.h @@ -191,8 +191,8 @@ struct psm3_verbs_mr { void *addr; uint64_t length; uint32_t access; -#if defined(PSM_ONEAPI) - uint64_t alloc_id; +#ifdef PSM_HAVE_GPU + union psm3_verbs_mr_gpu_specific gpu_specific; #endif // below is for queue of cache entries available for reuse (refcount==0) // only used when cache_mode==1 @@ -211,11 +211,11 @@ extern unsigned psm3_mr_cache_debug; #define MR_OUT_RANGE(addr, len) (uint64_t)(uintptr_t)(addr), \ (uint64_t)(uintptr_t)(addr)+(uint64_t)(len)-1, \ (uint64_t)(len) -#ifdef PSM_ONEAPI -#define MRC_FMT "0x%"PRIx64":0x%"PRIx64" (len 0x%"PRIx64") id %"PRIu64 \ - " access 0x%x" -#define MR_OUT_MRC(mrc) MR_OUT_RANGE((mrc)->addr, (mrc)->length), \ - (mrc)->alloc_id, (mrc)->access +#ifdef PSM_HAVE_GPU +#define MRC_FMT "0x%"PRIx64":0x%"PRIx64" (len 0x%"PRIx64") access 0x%x" \ + PSM3_GPU_MRC_FMT +#define MR_OUT_MRC(mrc) MR_OUT_RANGE((mrc)->addr, (mrc)->length), (mrc)->access \ + PSM3_GPU_OUT_MRC(&(mrc)->gpu_specific) #else #define MRC_FMT "0x%"PRIx64":0x%"PRIx64" (len 0x%"PRIx64") access 0x%x" #define MR_OUT_MRC(mrc) MR_OUT_RANGE((mrc)->addr, (mrc)->length), (mrc)->access @@ -225,7 +225,7 @@ extern psm2_mr_cache_t psm3_verbs_alloc_mr_cache(psm2_ep_t ep, uint32_t limit_entries, uint8_t cache_mode, uint32_t limit_size_mb, uint32_t limit_pri_entries, uint64_t pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , uint64_t gpu_pri_size #endif ); diff --git a/prov/psm3/psm3/ptl.h b/prov/psm3/psm3/ptl.h index 44110636411..f6ca3e0d98b 100644 --- a/prov/psm3/psm3/ptl.h +++ b/prov/psm3/psm3/ptl.h @@ -133,7 +133,7 @@ struct ptl_arg { struct ptl_strategy_stats { uint64_t tiny_cpu_isend; uint64_t tiny_cpu_isend_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t tiny_gdrcopy_isend; uint64_t tiny_gdrcopy_isend_bytes; uint64_t tiny_cuCopy_isend; @@ -141,7 
+141,7 @@ struct ptl_strategy_stats { #endif uint64_t tiny_cpu_send; uint64_t tiny_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t tiny_gdrcopy_send; uint64_t tiny_gdrcopy_send_bytes; uint64_t tiny_cuCopy_send; @@ -152,7 +152,7 @@ struct ptl_strategy_stats { uint64_t tiny_cpu_recv_bytes; uint64_t tiny_sysbuf_recv; /* to unexpected Q sysbuf */ /* incl 0 byte */ uint64_t tiny_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t tiny_gdrcopy_recv; uint64_t tiny_gdrcopy_recv_bytes; uint64_t tiny_cuCopy_recv; @@ -163,7 +163,7 @@ struct ptl_strategy_stats { uint64_t short_copy_cpu_isend_bytes; uint64_t short_dma_cpu_isend; uint64_t short_dma_cpu_isend_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t short_gdrcopy_isend; uint64_t short_gdrcopy_isend_bytes; uint64_t short_cuCopy_send; @@ -176,7 +176,7 @@ struct ptl_strategy_stats { uint64_t short_dma_cpu_send; uint64_t short_dma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t short_gdrcopy_send; uint64_t short_gdrcopy_send_bytes; uint64_t short_cuCopy_isend; @@ -189,7 +189,7 @@ struct ptl_strategy_stats { uint64_t short_cpu_recv_bytes; uint64_t short_sysbuf_recv; /* to unexpected Q sysbuf */ uint64_t short_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t short_gdrcopy_recv; uint64_t short_gdrcopy_recv_bytes; uint64_t short_cuCopy_recv; @@ -202,7 +202,7 @@ struct ptl_strategy_stats { uint64_t eager_dma_cpu_isend_bytes; uint64_t eager_sysbuf_recv; /* to unexpected Q sysbuf */ uint64_t eager_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t eager_cuCopy_isend; uint64_t eager_cuCopy_isend_bytes; uint64_t eager_gdr_isend; @@ -212,7 +212,7 @@ struct ptl_strategy_stats { uint64_t eager_copy_cpu_send_bytes; uint64_t eager_dma_cpu_send; uint64_t eager_dma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t eager_cuCopy_send; uint64_t eager_cuCopy_send_bytes; uint64_t eager_gdr_send; @@ -221,7 +221,7 @@ struct ptl_strategy_stats { uint64_t eager_cpu_recv; uint64_t eager_cpu_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t eager_gdrcopy_recv; uint64_t eager_gdrcopy_recv_bytes; uint64_t eager_cuCopy_recv; @@ -230,13 +230,13 @@ struct ptl_strategy_stats { uint64_t rndv_cpu_isend; uint64_t rndv_cpu_isend_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_gpu_isend; uint64_t rndv_gpu_isend_bytes; #endif uint64_t rndv_cpu_send; uint64_t rndv_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_gpu_send; uint64_t rndv_gpu_send_bytes; #endif @@ -246,7 +246,7 @@ struct ptl_strategy_stats { uint64_t rndv_rts_cpu_recv_bytes; uint64_t rndv_rts_sysbuf_recv; uint64_t rndv_rts_sysbuf_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_rts_cuCopy_recv; uint64_t rndv_rts_cuCopy_recv_bytes; #endif @@ -261,7 +261,7 @@ struct ptl_strategy_stats { uint64_t rndv_long_cpu_recv_bytes; uint64_t rndv_long_gpu_recv; /* per RTS */ uint64_t rndv_long_gpu_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_long_cuCopy_recv; uint64_t rndv_long_cuCopy_recv_bytes; uint64_t rndv_long_gdrcopy_recv; @@ -274,7 +274,7 @@ struct ptl_strategy_stats { uint64_t rndv_long_copy_cpu_send_bytes; uint64_t 
rndv_long_dma_cpu_send; /* per CTS (1 per RTS) */ uint64_t rndv_long_dma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_long_cuCopy_send; /* per CTS (1 per RTS) */ uint64_t rndv_long_cuCopy_send_bytes; uint64_t rndv_long_gdrcopy_send; /* per CTS (1 per RTS) */ @@ -286,7 +286,7 @@ struct ptl_strategy_stats { /* RDMA approach selected by receiver */ uint64_t rndv_rdma_cpu_recv; /* per RTS */ uint64_t rndv_rdma_cpu_recv_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_rdma_gdr_recv; /* per RTS */ uint64_t rndv_rdma_gdr_recv_bytes; uint64_t rndv_rdma_hbuf_recv; /* per RTS */ @@ -297,7 +297,7 @@ struct ptl_strategy_stats { /* RDMA may use >= 1 CTS per RTS */ uint64_t rndv_rdma_cpu_send; /* per CTS */ uint64_t rndv_rdma_cpu_send_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t rndv_rdma_gdr_send; /* per CTS */ uint64_t rndv_rdma_gdr_send_bytes; uint64_t rndv_rdma_hbuf_send; /* per CTS */ diff --git a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c deleted file mode 100644 index bc4b4798b16..00000000000 --- a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2016 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2016 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -*/ - -#ifdef PSM_CUDA - -#include "psm_user.h" -#include "am_cuda_memhandle_cache.h" - -/* - * rbtree cruft - */ -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start virtual address */ - CUipcMemHandle cuda_ipc_handle; /* cuda ipc mem handle */ - CUdeviceptr cuda_ipc_dev_ptr;/* Cuda device pointer */ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ -}__attribute__ ((aligned (128))) rbtree_cuda_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_cuda_memhandle_cache_map_pl_t; - -static psm2_error_t am_cuda_memhandle_mpool_alloc( - am_cuda_memhandle_cache_t cache, uint32_t memcache_size); - -/* - * Custom comparator - */ -typedef rbtree_cuda_memhandle_cache_mapitem_pl_t cuda_cache_item; - -static int cuda_cache_key_cmp(const cuda_cache_item *a, const cuda_cache_item *b) -{ - // we use epid as part of cache key so multi-ep and multi-process jobs - // can have a better cache hit rate. In some cases we may end up with - // cache entries for the same buffer with different epid's all within the - // same multi-ep rank, but this does no harm other than to waste some - // cache space. By including epid in key_cmp we have a chance to have - // separate cache entries for the same sbuf address in different - // sender's GPU virtual address space. - switch (psm3_epid_cmp_internal(a->epid, b->epid)) { - case -1: return -1; - case 1: return 1; - default: - break; - } - - // The sender has used cuMemGetAddressRange to normalize the address - // so we can simply compare the start address of the allocation. - // Note cuIpcOpenMemHandle only needs the start address as well, so we - // ignore length - if (a->start < b->start) - return -1; - if (b->start < a->start) - return 1; - - return 0; -} - - -/* - * Necessary rbtree cruft - */ -#define RBTREE_MI_PL rbtree_cuda_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_cuda_memhandle_cache_map_pl_t -#define RBTREE_CMP(a,b) cuda_cache_key_cmp((a), (b)) -#define RBTREE_ASSERT psmi_assert -#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) -#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR - -#include "psm3_rbtree.h" -#include "psm3_rbtree.c" - -/* - * Convenience rbtree cruft - */ -#define NELEMS(cache) ((cache)->map.payload.nelems) - -#define IHEAD(cache) ((cache)->map.root) -#define LAST(cache) (IHEAD(cache)->payload.i_prev) -#define FIRST(cache) (IHEAD(cache)->payload.i_next) -#define INEXT(x) ((x)->payload.i_next) -#define IPREV(x) ((x)->payload.i_prev) - -/* - * Actual module data - */ -struct am_cuda_memhandle_cache { - cl_qmap_t map; - mpool_t mpool; - uint32_t size; - psm2_mq_stats_t *stats; -}; - -static void print_cuda_memhandle_cache_stats(psm2_mq_stats_t *stats) -{ - _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", - stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, - stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, - stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, - stats->gpu_ipc_cache_clear); -} - -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache fini mpool is detroyed which in turn calls this callback - * which helps in closing all memhandles. 
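/*
 * Illustrative sketch (not part of the patch): the comparator in the deleted
 * cache above orders entries first by sender epid and then by the normalized
 * base address of the allocation, so each peer's GPU address space gets its
 * own entries. A generic shape of that two-level key compare, with a
 * hypothetical epid_cmp_sketch() standing in for psm3_epid_cmp_internal():
 */
#include <stdint.h>

struct ipc_key_sketch {
	uint64_t epid;		/* stand-in for psm2_epid_t */
	uint64_t start;		/* allocation base address on the sender */
};

static int epid_cmp_sketch(uint64_t a, uint64_t b)
{ return (a < b) ? -1 : (a > b) ? 1 : 0; }

static int ipc_key_cmp_sketch(const struct ipc_key_sketch *a,
			      const struct ipc_key_sketch *b)
{
	int r = epid_cmp_sketch(a->epid, b->epid);	/* group by sending endpoint */
	if (r)
		return r;
	/* within one sender, the normalized start address identifies the allocation */
	if (a->start < b->start)
		return -1;
	if (b->start < a->start)
		return 1;
	return 0;
}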
- */ -static void -psmi_cuda_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - } -} - -/* - * Creating mempool for cuda memhandle cache nodes. - */ -static psm2_error_t -am_cuda_memhandle_mpool_alloc(am_cuda_memhandle_cache_t cache, - uint32_t memcache_size) -{ - psm2_error_t err; - if (memcache_size < 1) - return PSM2_PARAM_ERR; - - cache->size = memcache_size; - /* Creating a memory pool of size PSM3_CUDA_MEMCACHE_SIZE - * which includes the Root and NIL items - */ - cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), - cache->size, - cache->size, 0, - UNDEFINED, NULL, NULL, - psmi_cuda_memhandle_cache_alloc_func, - NULL); - if (cache->mpool == NULL) { - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate CUDA host receive buffer pool"); - return err; - } - return PSM2_OK; -} - -/* - * allocate and initialize memhandle cache - * including rbtree. - */ -psm2_error_t am_cuda_memhandle_cache_alloc(am_cuda_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats) -{ - cl_map_item_t *root = NULL, *nil_item = NULL; - - *cachep = (am_cuda_memhandle_cache_t)psmi_calloc( - NULL, UNDEFINED, 1, sizeof(**cachep)); - if (! *cachep) - return PSM2_NO_MEMORY; - - psm2_error_t err = am_cuda_memhandle_mpool_alloc(*cachep, memcache_size); - if (err != PSM2_OK) - goto fail; - - root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (root == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (nil_item == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - nil_item->payload.start = 0; - nil_item->payload.epid = psm3_epid_zeroed_internal(); - ips_cl_qmap_init(&(*cachep)->map,root,nil_item); - NELEMS(*cachep) = 0; - - (*cachep)->stats = stats; - - stats->gpu_ipc_cache_limit = memcache_size; - stats->gpu_ipc_cache_nelems = 0; - stats->gpu_ipc_cache_max_nelems = 0; - stats->gpu_ipc_cache_hit = 0; - stats->gpu_ipc_cache_miss = 0; - stats->gpu_ipc_cache_evict = 0; - stats->gpu_ipc_cache_remove = 0; - stats->gpu_ipc_cache_clear = 0; - - return PSM2_OK; - -fail: - if (nil_item) - psmi_free(nil_item); - if (root) - psmi_free(root); - if ((*cachep)->mpool) - psm3_mpool_destroy((*cachep)->mpool); - psmi_free(*cachep); - return err; -} - -void am_cuda_memhandle_cache_free(am_cuda_memhandle_cache_t cache) -{ - print_cuda_memhandle_cache_stats(cache->stats); - - if (cache->map.nil_item) - psmi_free(cache->map.nil_item); - if (cache->map.root) - psmi_free(cache->map.root); - if (cache->mpool) - psm3_mpool_destroy(cache->mpool); - psmi_free(cache); -} - -/* - * Insert at the head of Idleq. - */ -static void -am_cuda_idleq_insert(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == NULL) { - FIRST(cache) = memcache_item; - LAST(cache) = memcache_item; - return; - } - INEXT(FIRST(cache)) = memcache_item; - IPREV(memcache_item) = FIRST(cache); - FIRST(cache) = memcache_item; - INEXT(FIRST(cache)) = NULL; - return; -} - -/* - * Remove least recent used element. 
- */ -static void -am_cuda_idleq_remove_last(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (!INEXT(memcache_item)) { - LAST(cache) = NULL; - FIRST(cache) = NULL; - } else { - LAST(cache) = INEXT(memcache_item); - IPREV(LAST(cache)) = NULL; - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_cuda_idleq_remove(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (LAST(cache) == memcache_item) { - am_cuda_idleq_remove_last(cache, memcache_item); - } else if (FIRST(cache) == memcache_item) { - FIRST(cache) = IPREV(memcache_item); - INEXT(FIRST(cache)) = NULL; - } else { - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_cuda_idleq_reorder(am_cuda_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { - return; - } - am_cuda_idleq_remove(cache, memcache_item); - am_cuda_idleq_insert(cache, memcache_item); - return; -} - -/* - * After a successful cache hit, item is validated by doing a - * memcmp on the handle stored and the handle we receive from the - * sender. If the validation fails the item is removed from the idleq, - * the rbtree, is put back into the mpool and cuIpcCloseMemHandle function - * is called. - * Cuda ipcMemHandles for distinct allocations are unique, even if the - * allocation was at the same address. So this check catches stale cache - * entries. - */ -static psm2_error_t -am_cuda_memhandle_cache_validate(am_cuda_memhandle_cache_t cache, - cl_map_item_t* memcache_item, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid) -{ - psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); - psmi_assert(sbuf == memcache_item->payload.start); - if (0 == memcmp(handle, &memcache_item->payload.cuda_ipc_handle, - sizeof(CUipcMemHandle))) { - return PSM2_OK; - } - _HFI_DBG("cache collision: new entry start=%lu\n", sbuf); - - cache->stats->gpu_ipc_cache_remove++; - ips_cl_qmap_remove_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems--; - PSMI_CUDA_CALL(cuIpcCloseMemHandle, - memcache_item->payload.cuda_ipc_dev_ptr); - am_cuda_idleq_remove(cache, memcache_item); - memset(memcache_item, 0, sizeof(*memcache_item)); - psm3_mpool_put(memcache_item); - return PSM2_OK_NO_PROGRESS; -} - -/* - * Current eviction policy: Least Recently Used. 
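/*
 * Illustrative sketch (not part of the patch): the idle-queue helpers in the
 * deleted cache above keep entries on an intrusive recency list; a hit moves
 * the entry to the front and eviction takes the tail. A minimal doubly linked
 * LRU of that shape, independent of the rbtree the real cache also maintains:
 */
#include <stddef.h>

struct lru_node_sketch {
	struct lru_node_sketch *prev;	/* toward more recently used */
	struct lru_node_sketch *next;	/* toward less recently used */
};

struct lru_sketch {
	struct lru_node_sketch *first;	/* most recently used */
	struct lru_node_sketch *last;	/* least recently used: eviction victim */
};

static void lru_remove_sketch(struct lru_sketch *q, struct lru_node_sketch *n)
{
	if (n->prev) n->prev->next = n->next; else q->first = n->next;
	if (n->next) n->next->prev = n->prev; else q->last = n->prev;
	n->prev = n->next = NULL;
}

static void lru_insert_front_sketch(struct lru_sketch *q, struct lru_node_sketch *n)
{
	n->prev = NULL;
	n->next = q->first;
	if (q->first) q->first->prev = n; else q->last = n;
	q->first = n;
}

/* on a cache hit: refresh recency; on overflow: evict q->last via lru_remove_sketch() */
static void lru_touch_sketch(struct lru_sketch *q, struct lru_node_sketch *n)
{
	lru_remove_sketch(q, n);
	lru_insert_front_sketch(q, n);
}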
- */ -static void -am_cuda_memhandle_cache_evict(am_cuda_memhandle_cache_t cache) -{ - cache->stats->gpu_ipc_cache_evict++; - cl_map_item_t *p_item = LAST(cache); - _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=0x%llX,it=%p) from cuda_memhandle_cachemap.\n", - psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, - p_item->payload.cuda_ipc_dev_ptr, p_item); - ips_cl_qmap_remove_item(&cache->map, p_item); - cache->stats->gpu_ipc_cache_nelems--; - PSMI_CUDA_CALL(cuIpcCloseMemHandle, p_item->payload.cuda_ipc_dev_ptr); - am_cuda_idleq_remove_last(cache, p_item); - memset(p_item, 0, sizeof(*p_item)); - psm3_mpool_put(p_item); -} - -static psm2_error_t -am_cuda_memhandle_cache_register(am_cuda_memhandle_cache_t cache, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid, - CUdeviceptr cuda_ipc_dev_ptr) -{ - if (NELEMS(cache) == cache->size) - am_cuda_memhandle_cache_evict(cache); - - cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); - /* memcache_item cannot be NULL as we evict - * before the call to mpool_get. Check has - * been fixed to help with klockwork analysis. - */ - if (memcache_item == NULL) - return PSM2_NO_MEMORY; - memcache_item->payload.start = sbuf; - memcache_item->payload.cuda_ipc_handle = *handle; - memcache_item->payload.cuda_ipc_dev_ptr = cuda_ipc_dev_ptr; - memcache_item->payload.epid = epid; - ips_cl_qmap_insert_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems++; - if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) - cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; - am_cuda_idleq_insert(cache, memcache_item); - return PSM2_OK; -} - -static void am_cuda_memhandle_cache_clear(am_cuda_memhandle_cache_t cache) -{ - _HFI_DBG("Closing all handles, clearing cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); - while (NELEMS(cache)) { - am_cuda_memhandle_cache_evict(cache); - } - cache->stats->gpu_ipc_cache_clear++; - _HFI_DBG("Closed all handles, cleared cuda_memhandle_cachemap and idleq. NELEMS=%u\n", NELEMS(cache)); -} - -/* - * The key used to search the cache is the senders buf address pointer and - * epid. The sender will have used cuMemGetAddressRange - * to find the start of the memory containing the buffer (supplied as sbuf). - * Upon match, we must validate the entry we find and may need to replace it. - */ -CUdeviceptr -am_cuda_memhandle_acquire(am_cuda_memhandle_cache_t cache, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid) -{ - _HFI_VDBG("sbuf=%lu,handle=%p,epid=%s\n", - sbuf, handle, psm3_epid_fmt_internal(epid, 0)); - - CUdeviceptr cuda_ipc_dev_ptr; - if(! cache) { - PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, - *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - return cuda_ipc_dev_ptr; - } - - cuda_cache_item key = { - .start = (unsigned long) sbuf, - .epid = epid - }; - - /* - * preconditions: - * 1) buffer [start,epid) may or may not be in cachemap already - * 2) there are no duplicate entries in cachemap - * postconditions: - * 1) buffer is in cachemap with same handle, epid - * 2) there are no duplicate entries in cachemap - * - * The key used to search the cache is the senders buf address pointer - * and epid. - * Upon a succesful hit in the cache, additional validation is required - * as the handle could be stale. 
- */ - cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); - if (p_item->payload.start) { - // confirm the entry for sbuf matches the handle and is not stale - if (am_cuda_memhandle_cache_validate(cache, p_item, sbuf, handle, epid) == PSM2_OK) { - cache->stats->gpu_ipc_cache_hit++; - am_cuda_idleq_reorder(cache, p_item); - return p_item->payload.cuda_ipc_dev_ptr; - } - - // buffer found was stale am_cuda_memhandle_cache_validate() - // closed and removed existing entry. - // Should find no more duplicates -#ifdef PSM_DEBUG - p_item = ips_cl_qmap_searchv(&cache->map, &key); - psmi_assert(! p_item->payload.start); -#endif - } - cache->stats->gpu_ipc_cache_miss++; - - CUresult cudaerr; - PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_ALREADY_MAPPED, cuIpcOpenMemHandle, - &cuda_ipc_dev_ptr, *handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - - if (cudaerr == CUDA_ERROR_ALREADY_MAPPED) { - // remote memory already mapped. Close all handles, clear cache, - // and try again - am_cuda_memhandle_cache_clear(cache); - PSMI_CUDA_CALL(cuIpcOpenMemHandle, &cuda_ipc_dev_ptr, *handle, - CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS); - } - - am_cuda_memhandle_cache_register(cache, sbuf, handle, - epid, cuda_ipc_dev_ptr); - return cuda_ipc_dev_ptr; -} - -void -am_cuda_memhandle_release(am_cuda_memhandle_cache_t cache, - CUdeviceptr cuda_ipc_dev_ptr) -{ - if(! cache) - PSMI_CUDA_CALL(cuIpcCloseMemHandle, cuda_ipc_dev_ptr); - return; -} - -#endif diff --git a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h b/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h deleted file mode 100644 index 4b1cf744545..00000000000 --- a/prov/psm3/psm3/ptl_am/am_cuda_memhandle_cache.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2016 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2016 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
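/*
 * Illustrative sketch (not part of the patch): am_cuda_memhandle_acquire()
 * above looks up the sender's (epid, base address), revalidates the stored
 * IPC handle on a hit, and only opens a new mapping on a miss or a stale
 * entry. A deliberately tiny one-slot version of that control flow, with a
 * hypothetical open_ipc_sketch() standing in for cuIpcOpenMemHandle():
 */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct ipc_handle_sketch { unsigned char bytes[64]; };	/* stand-in for CUipcMemHandle */

static struct {
	bool valid;
	uint64_t epid, start;			/* cache key */
	struct ipc_handle_sketch handle;	/* handle seen last time */
	void *mapped;				/* mapping returned last time */
} slot_sketch;

static void *open_ipc_sketch(const struct ipc_handle_sketch *h)
{ return (void *)h; }				/* placeholder "mapping" */

static void *acquire_sketch(uint64_t epid, uint64_t start,
			    const struct ipc_handle_sketch *h)
{
	if (slot_sketch.valid && slot_sketch.epid == epid &&
	    slot_sketch.start == start) {
		if (!memcmp(h, &slot_sketch.handle, sizeof(*h)))
			return slot_sketch.mapped;	/* hit: reuse mapping */
		slot_sketch.valid = false;		/* stale: new allocation at same address */
	}
	void *mapped = open_ipc_sketch(h);		/* miss or stale: open again */
	slot_sketch.valid = true;
	slot_sketch.epid = epid;
	slot_sketch.start = start;
	slot_sketch.handle = *h;
	slot_sketch.mapped = mapped;
	return mapped;
}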
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifdef PSM_CUDA - -#ifndef _AM_CUDA_MEMHANDLE_CACHE_H -#define _AM_CUDA_MEMHANDLE_CACHE_H - -#include "psm_user.h" -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define CUDA_MEMHANDLE_CACHE_SIZE 64 - -struct am_cuda_memhandle_cache; // opaque since contains rbtree fields -typedef struct am_cuda_memhandle_cache *am_cuda_memhandle_cache_t; - -psm2_error_t am_cuda_memhandle_cache_alloc(am_cuda_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats); - -CUdeviceptr -am_cuda_memhandle_acquire(am_cuda_memhandle_cache_t cache, - uintptr_t sbuf, CUipcMemHandle* handle, - psm2_epid_t epid); -void -am_cuda_memhandle_release(am_cuda_memhandle_cache_t cache, - CUdeviceptr cuda_ipc_dev_ptr); - -void am_cuda_memhandle_cache_free(am_cuda_memhandle_cache_t cache); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* _AM_CUDA_MEMHANDLE_CACHE_H */ - -#endif /* PSM_CUDA */ diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c deleted file mode 100644 index ac561c6d32f..00000000000 --- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.c +++ /dev/null @@ -1,696 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2022 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2022 Intel Corporation. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifdef PSM_ONEAPI - -#include "psm_user.h" -#include "psm_am_internal.h" -#include "am_oneapi_memhandle_cache.h" -#include -#include -#ifdef HAVE_DRM -#include -#include -#endif -#ifdef HAVE_LIBDRM -#include -#include -#endif -#ifdef PSM_HAVE_PIDFD -#include -#endif - -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -/* - * rbtree cruft - */ -struct _cl_map_item; - -typedef struct -{ - unsigned long start; /* start(base) virtual address - in peer process */ - uint32_t ze_handle; /* Sender's GEM handle or fd */ - uint64_t alloc_id; /* ze alloc_id */ - void *buf_ptr; /* buffer pointer in this - process */ - psm2_epid_t epid; - struct _cl_map_item* i_prev; /* idle queue previous */ - struct _cl_map_item* i_next; /* idle queue next */ - am_ze_memhandle_cache_t cache; /* only for gem_handle close */ -}__attribute__ ((aligned (128))) rbtree_ze_memhandle_cache_mapitem_pl_t; - -typedef struct { - uint32_t nelems; /* number of elements in the cache */ -} rbtree_ze_memhandle_cache_map_pl_t; - -static psm2_error_t am_ze_memhandle_mpool_alloc( - am_ze_memhandle_cache_t cache, uint32_t memcache_size); -static void am_ze_memhandle_delete(void *buf_ptr); - -/* - * Custom comparator - */ -typedef rbtree_ze_memhandle_cache_mapitem_pl_t ze_cache_item; - -static int ze_cache_key_cmp(const ze_cache_item *a, const ze_cache_item *b) -{ - // we use epid as part of cache key so multi-ep and multi-process jobs - // can have a better cache hit rate. In some cases we may end up with - // cache entries for the same buffer with different epid's all within the - // same multi-ep rank, but this does no harm other than to waste some - // cache space. By including epid in key_cmp we have a chance to have - // separate cache entries for the same sbuf address in different - // sender's GPU virtual address space. - switch (psm3_epid_cmp_internal(a->epid, b->epid)) { - case -1: return -1; - case 1: return 1; - default: - break; - } - - // The sender has used zeMemGetAddressRange to normalize the address - // so we can simply compare the start address of the allocation. 
- // Note zeMemOpenIpcHandle only needs the start address as well, so we - // ignore length - if (a->start < b->start) - return -1; - if (b->start < a->start) - return 1; - - return 0; -} - - -/* - * Necessary rbtree cruft - */ -#define RBTREE_MI_PL rbtree_ze_memhandle_cache_mapitem_pl_t -#define RBTREE_MAP_PL rbtree_ze_memhandle_cache_map_pl_t -#define RBTREE_CMP(a,b) ze_cache_key_cmp((a), (b)) -#define RBTREE_ASSERT psmi_assert -#define RBTREE_MAP_COUNT(PAYLOAD_PTR) ((PAYLOAD_PTR)->nelems) -#define RBTREE_NO_EMIT_IPS_CL_QMAP_PREDECESSOR - -#include "psm3_rbtree.h" -#include "psm3_rbtree.c" - -/* - * Convenience rbtree cruft - */ -#define NELEMS(cache) ((cache)->map.payload.nelems) - -#define IHEAD(cache) ((cache)->map.root) -#define LAST(cache) (IHEAD(cache)->payload.i_prev) -#define FIRST(cache) (IHEAD(cache)->payload.i_next) -#define INEXT(x) ((x)->payload.i_next) -#define IPREV(x) ((x)->payload.i_prev) - -/* - * Actual module data - */ -struct am_ze_memhandle_cache { - cl_qmap_t map; - mpool_t mpool; - uint32_t size; - psm2_mq_stats_t *stats; -}; - -static void print_ze_memhandle_cache_stats(psm2_mq_stats_t *stats) -{ - _HFI_DBG("limit=%lu,maxelems=%lu,hit=%lu,miss=%lu,evict=%lu,remove=%lu,clear=%lu\n", - stats->gpu_ipc_cache_limit, stats->gpu_ipc_cache_max_nelems, - stats->gpu_ipc_cache_hit, stats->gpu_ipc_cache_miss, - stats->gpu_ipc_cache_evict, stats->gpu_ipc_cache_remove, - stats->gpu_ipc_cache_clear); -} - -/* - * This is the callback function when mempool are resized or destroyed. - * Upon calling cache free mpool is destroyed which in turn calls this callback - * which helps in closing all memhandles. - * TBD - only called for !is_alloc when destroying so could avoid keeping - * cache pointer in memcache_item. But when GEM_CLOSE is not needed - * memhandle_delete won't need destroyng flag and can remove cache pointer then - */ -static void -psmi_ze_memhandle_cache_alloc_func(int is_alloc, void* context, void* obj) -{ - cl_map_item_t* memcache_item = (cl_map_item_t*)obj; - if (!is_alloc) { - if(memcache_item->payload.start) - am_ze_memhandle_delete(memcache_item->payload.buf_ptr); - } -} - -/* - * Creating mempool for ze memhandle cache nodes. - */ -static psm2_error_t -am_ze_memhandle_mpool_alloc(am_ze_memhandle_cache_t cache, - uint32_t memcache_size) -{ - psm2_error_t err; - if (memcache_size < 1) - return PSM2_PARAM_ERR; - - cache->size = memcache_size; - /* Creating a memory pool of size PSM3_ONEAPI_MEMCACHE_SIZE - * which includes the Root and NIL items - */ - cache->mpool = psm3_mpool_create_for_gpu(sizeof(cl_map_item_t), - cache->size, - cache->size, 0, - UNDEFINED, NULL, NULL, - psmi_ze_memhandle_cache_alloc_func, - NULL); - if (cache->mpool == NULL) { - err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_NO_MEMORY, - "Couldn't allocate ONEAPI host receive buffer pool"); - return err; - } - return PSM2_OK; -} - -/* - * allocate and initialize memhandle cache - * including rbtree. - */ -psm2_error_t am_ze_memhandle_cache_alloc(am_ze_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats) -{ - cl_map_item_t *root = NULL, *nil_item = NULL; - - *cachep = (am_ze_memhandle_cache_t)psmi_calloc( - NULL, UNDEFINED, 1, sizeof(**cachep)); - if (! 
*cachep) - return PSM2_NO_MEMORY; - - psm2_error_t err = am_ze_memhandle_mpool_alloc(*cachep, memcache_size); - if (err != PSM2_OK) - return err; - - root = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (root == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - nil_item = (cl_map_item_t *)psmi_calloc(NULL, UNDEFINED, 1, sizeof(cl_map_item_t)); - if (nil_item == NULL) { - err = PSM2_NO_MEMORY; - goto fail; - } - - nil_item->payload.start = 0; - nil_item->payload.epid = psm3_epid_zeroed_internal(); - ips_cl_qmap_init(&(*cachep)->map,root,nil_item); - NELEMS(*cachep) = 0; - - (*cachep)->stats = stats; - - stats->gpu_ipc_cache_limit = memcache_size; - stats->gpu_ipc_cache_nelems = 0; - stats->gpu_ipc_cache_max_nelems = 0; - stats->gpu_ipc_cache_hit = 0; - stats->gpu_ipc_cache_miss = 0; - stats->gpu_ipc_cache_evict = 0; - stats->gpu_ipc_cache_remove = 0; - stats->gpu_ipc_cache_clear = 0; - - return PSM2_OK; - -fail: - if (nil_item) - psmi_free(nil_item); - if (root) - psmi_free(root); - if ((*cachep)->mpool) - psm3_mpool_destroy((*cachep)->mpool); - psmi_free(*cachep); - return err; -} - -void am_ze_memhandle_cache_free(am_ze_memhandle_cache_t cache) -{ - print_ze_memhandle_cache_stats(cache->stats); - - if (cache->map.nil_item) - psmi_free(cache->map.nil_item); - if (cache->map.root) - psmi_free(cache->map.root); - if (cache->mpool) - psm3_mpool_destroy(cache->mpool); - psmi_free(cache); -} - -/* - * Insert at the head of Idleq. - */ -static void -am_ze_idleq_insert(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == NULL) { - FIRST(cache) = memcache_item; - LAST(cache) = memcache_item; - return; - } - INEXT(FIRST(cache)) = memcache_item; - IPREV(memcache_item) = FIRST(cache); - FIRST(cache) = memcache_item; - INEXT(FIRST(cache)) = NULL; - return; -} - -/* - * Remove least recent used element. - */ -static void -am_ze_idleq_remove_last(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (!INEXT(memcache_item)) { - LAST(cache) = NULL; - FIRST(cache) = NULL; - } else { - LAST(cache) = INEXT(memcache_item); - IPREV(LAST(cache)) = NULL; - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_ze_idleq_remove(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (LAST(cache) == memcache_item) { - am_ze_idleq_remove_last(cache, memcache_item); - } else if (FIRST(cache) == memcache_item) { - FIRST(cache) = IPREV(memcache_item); - INEXT(FIRST(cache)) = NULL; - } else { - INEXT(IPREV(memcache_item)) = INEXT(memcache_item); - IPREV(INEXT(memcache_item)) = IPREV(memcache_item); - } - // Null-out now-removed memcache_item's next and prev pointers out of - // an abundance of caution - INEXT(memcache_item) = IPREV(memcache_item) = NULL; -} - -static void -am_ze_idleq_reorder(am_ze_memhandle_cache_t cache, cl_map_item_t* memcache_item) -{ - if (FIRST(cache) == memcache_item && LAST(cache) == memcache_item ) { - return; - } - am_ze_idleq_remove(cache, memcache_item); - am_ze_idleq_insert(cache, memcache_item); - return; -} - -/* - * After a successful cache hit, item is validated by doing a - * memcmp on the handle stored and the handle we receive from the - * sender. If the validation fails the item is removed from the idleq, - * the rbtree, is put back into the mpool and ZeMemCloseIpcHandle function - * is called. 
- * Level Zero's alloc_id will be unique per allocation, even if the allocation - * was at the same address. In some cases, but not always, the ipc_handle - * will also be different. So we validate both, although just checking alloc_id - * would be sufficient. - */ - -static psm2_error_t -am_ze_memhandle_cache_validate(am_ze_memhandle_cache_t cache, - cl_map_item_t* memcache_item, - uintptr_t sbuf, uint32_t handle, - psm2_epid_t epid, uint64_t alloc_id) -{ - psmi_assert(!psm3_epid_cmp_internal(epid, memcache_item->payload.epid)); - psmi_assert(sbuf == memcache_item->payload.start); - if (handle == memcache_item->payload.ze_handle && - alloc_id == memcache_item->payload.alloc_id) { - return PSM2_OK; - } - _HFI_DBG("cache remove stale entry: new start=%lu,handle=%u,alloc_id=%lu\n", - sbuf, handle, alloc_id); - - cache->stats->gpu_ipc_cache_remove++; - ips_cl_qmap_remove_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems--; - am_ze_memhandle_delete(memcache_item->payload.buf_ptr); - am_ze_idleq_remove(cache, memcache_item); - memset(memcache_item, 0, sizeof(*memcache_item)); - psm3_mpool_put(memcache_item); - return PSM2_OK_NO_PROGRESS; -} - -/* - * Current eviction policy: Least Recently Used. - */ -static void -am_ze_memhandle_cache_evict(am_ze_memhandle_cache_t cache) -{ - cache->stats->gpu_ipc_cache_evict++; - cl_map_item_t *p_item = LAST(cache); - _HFI_VDBG("Removing (epid=%s,start=%lu,dev_ptr=%p,it=%p) from ze_memhandle_cachemap.\n", - psm3_epid_fmt_internal(p_item->payload.epid, 0), p_item->payload.start, - p_item->payload.buf_ptr, p_item); - ips_cl_qmap_remove_item(&cache->map, p_item); - cache->stats->gpu_ipc_cache_nelems--; - am_ze_memhandle_delete(p_item->payload.buf_ptr); - am_ze_idleq_remove_last(cache, p_item); - memset(p_item, 0, sizeof(*p_item)); - psm3_mpool_put(p_item); -} - -static psm2_error_t -am_ze_memhandle_cache_register(am_ze_memhandle_cache_t cache, - uintptr_t sbuf, uint32_t handle, - psm2_epid_t epid, - void *buf_ptr, uint64_t alloc_id) -{ - if (NELEMS(cache) == cache->size) - am_ze_memhandle_cache_evict(cache); - - cl_map_item_t* memcache_item = psm3_mpool_get(cache->mpool); - /* memcache_item cannot be NULL as we evict - * before the call to mpool_get. Check has - * been fixed to help with klockwork analysis. 
- */ - if (memcache_item == NULL) - return PSM2_NO_MEMORY; - memcache_item->payload.start = sbuf; - memcache_item->payload.ze_handle = handle; - memcache_item->payload.buf_ptr = buf_ptr; - memcache_item->payload.alloc_id = alloc_id; - memcache_item->payload.epid = epid; - memcache_item->payload.cache = cache; - ips_cl_qmap_insert_item(&cache->map, memcache_item); - cache->stats->gpu_ipc_cache_nelems++; - if (cache->stats->gpu_ipc_cache_nelems > cache->stats->gpu_ipc_cache_max_nelems) - cache->stats->gpu_ipc_cache_max_nelems = cache->stats->gpu_ipc_cache_nelems; - am_ze_idleq_insert(cache, memcache_item); - _HFI_VDBG("registered: handle %u sbuf 0x%lx ptr %p alloc_id %lu\n", - handle, sbuf, buf_ptr, alloc_id); - return PSM2_OK; -} - -#ifndef PSM_HAVE_PIDFD -static inline psm2_error_t am_ze_prepare_fds_for_ipc_import( - uint32_t gem_handle, int device_index, int *ipc_fd, - psm2_epaddr_t epaddr) -{ - am_epaddr_t *am_epaddr = (am_epaddr_t*)epaddr; - int fd; - struct drm_prime_handle open_fd = {0, 0, -1}; - - if (device_index >= num_ze_devices) { - _HFI_ERROR("am_ze_memhandle_acquire received invalid device_index from peer: %d\n", - device_index); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "device_index " - "invalid - received from peer: %d", - device_index); - return PSM2_INTERNAL_ERR; - } - fd = am_epaddr->peer_fds[device_index]; - cur_ze_dev = &ze_devices[device_index]; - open_fd.flags = DRM_CLOEXEC | DRM_RDWR; - open_fd.handle = gem_handle; - if (ioctl(fd, DRM_IOCTL_PRIME_HANDLE_TO_FD, &open_fd) < 0) { - _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_HANDLE_TO_FD: %s\n", strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "ioctl " - "failed for DRM_IOCTL_PRIME_HANDLE_TO_FD errno=%d", - errno); - return PSM2_INTERNAL_ERR; - } - *ipc_fd = open_fd.fd; - - return PSM2_OK; -} -#else -static inline psm2_error_t am_ze_prepare_fds_for_ipc_import( - uint32_t handle, int device_index, int *ipc_fd, - psm2_epaddr_t epaddr) -{ - int fd; - am_epaddr_t *am_epaddr = (am_epaddr_t *)epaddr; - - fd = syscall(__NR_pidfd_getfd, am_epaddr->pidfd, handle, 0); - if (fd < 0) { - _HFI_ERROR("pidfd_getfd failed %d: %s\n", fd, strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "pidfd_getfd failed errno=%d (%s)", - errno, strerror(errno)); - return PSM2_INTERNAL_ERR; - } - *ipc_fd = fd; - - return PSM2_OK; -} -#endif /* PSM_HAVE_PIDFD */ -#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ - -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -static void *am_ze_import_ipc_buf(uint32_t fd, uint8_t alloc_type) -{ - ze_external_memory_import_fd_t import_desc = {}; - void *ze_ipc_buf = NULL; - - import_desc.stype = ZE_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMPORT_FD; - import_desc.flags = ZE_EXTERNAL_MEMORY_TYPE_FLAG_DMA_BUF; - import_desc.fd = fd; - - switch(alloc_type) { - case ZE_MEMORY_TYPE_HOST: - { - ze_host_mem_alloc_desc_t host_desc = {}; - - host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC; - host_desc.pNext = &import_desc; - /* size & alignment are not used since this is an import.*/ - PSMI_ONEAPI_ZE_CALL(zeMemAllocHost, ze_context, &host_desc, - 0, 0, &ze_ipc_buf); - } - break; - case ZE_MEMORY_TYPE_DEVICE: - { - ze_device_mem_alloc_desc_t dev_desc = {}; - - dev_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC; - dev_desc.pNext = &import_desc; - /* size & alignment are not used since this is an import. 
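/*
 * Illustrative sketch, not part of the patch: the PSM_HAVE_PIDFD branch above
 * duplicates the sender's dma-buf file descriptor straight out of the peer
 * process with pidfd_getfd(2), while the fallback branch goes through DRM
 * PRIME ioctls on pre-exchanged device fds.  The sketch assumes a kernel that
 * provides pidfd_open(2)/pidfd_getfd(2) (>= 5.6), that <sys/syscall.h> exposes
 * the SYS_pidfd_* numbers, and that peer_pid and the fd number in the peer's
 * fd table were exchanged beforehand (here via the shm connect protocol and
 * the RTS payload).
 */
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

static int fetch_peer_fd(pid_t peer_pid, int peer_fd)
{
	int pidfd, fd;

	/* typically opened once per connection and cached */
	pidfd = syscall(SYS_pidfd_open, peer_pid, 0);
	if (pidfd < 0) {
		fprintf(stderr, "pidfd_open(%d) failed: %s\n",
			(int)peer_pid, strerror(errno));
		return -1;
	}

	/* duplicate the peer's descriptor into our own fd table */
	fd = syscall(SYS_pidfd_getfd, pidfd, peer_fd, 0);
	if (fd < 0)
		fprintf(stderr, "pidfd_getfd failed: %s\n", strerror(errno));

	close(pidfd);
	return fd;	/* ready for the dma-buf import path shown above */
}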
*/ - PSMI_ONEAPI_ZE_CALL(zeMemAllocDevice, ze_context, &dev_desc, - 0, 0, cur_ze_dev->dev, &ze_ipc_buf); - } - break; - default: - _HFI_ERROR("Invalid alloc_type %u for fd %u\n", - alloc_type, fd); - return NULL; - } - - return ze_ipc_buf; -} -#endif /* defined(HAVE_DRM) || defined(HAVE_LIBDRM) */ - -/* - * The key used to search the cache is the senders buf address pointer and - * epid. The sender will have used zeMemGetAddressRange - * to find the start of the memory containing the buffer (supplied as sbuf) - * Upon match, we must validate the entry we find and may need to replace it. - */ -void * -am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache, - uintptr_t sbuf, uint32_t handle, - psm2_epaddr_t epaddr, int device_index, - uint64_t alloc_id, uint8_t alloc_type) -{ - void *buf_ptr = NULL; - psm2_epid_t epid = epaddr->epid; -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - int ipc_fd = -1; -#endif - _HFI_VDBG("sbuf=%lu,handle=%u,epid=%s\n", - sbuf, handle, psm3_epid_fmt_internal(epid, 0)); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - - if (!cache) { - if (am_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, - epaddr) == PSM2_OK) { - buf_ptr = am_ze_import_ipc_buf(ipc_fd, alloc_type); - if (ipc_fd >= 0) { - if (close(ipc_fd) < 0) { - _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "close " - "failed for ipc_fd %d errno=%d", - ipc_fd, errno); - return NULL; - } - } - } - return buf_ptr; - } - - ze_cache_item key = { - .start = (unsigned long) sbuf, - .epid = epid - }; - - /* - * preconditions: - * 1) buffer [start,epid) may or may not be in cache->map already - * 2) there are no duplicate entries in cache->map - * postconditions: - * 1) buffer is in cache->map with same handle, epid, alloc_id - * 2) there are no duplicate entries in cache->map - * - * The key used to search the cache is the senders buf address pointer - * and epid. - * Upon a succesful hit in the cache, additional validation is required - * as the handle or alloc_id could be stale. - */ - cl_map_item_t *p_item = ips_cl_qmap_searchv(&cache->map, &key); - if (p_item->payload.start) { - // confirm the entry for sbuf matches the handle and is not stale - if (am_ze_memhandle_cache_validate(cache, p_item, sbuf, handle, - epid, alloc_id) == - PSM2_OK) { - cache->stats->gpu_ipc_cache_hit++; - am_ze_idleq_reorder(cache, p_item); - return p_item->payload.buf_ptr; - } - - // buffer found was stale am_oneapi_memhandle_cache_validate() - // closed and removed existing entry. - // Should find no more duplicates -#ifdef PSM_DEBUG - p_item = ips_cl_qmap_searchv(&cache->map, &key); - psmi_assert(! 
p_item->payload.start); -#endif - } - cache->stats->gpu_ipc_cache_miss++; - - if (am_ze_prepare_fds_for_ipc_import(handle, device_index, &ipc_fd, - epaddr) == PSM2_OK) { - buf_ptr = am_ze_import_ipc_buf(ipc_fd, alloc_type); - if (ipc_fd >= 0) { - if (close(ipc_fd) < 0) { - _HFI_ERROR("close failed for ipc_fd: %s\n", strerror(errno)); - psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, - "close " - "failed for ipc_fd %d errno=%d", - ipc_fd, errno); - return NULL; - } - } - if (!buf_ptr) - return NULL; - } else { - return NULL; - } - - am_ze_memhandle_cache_register(cache, sbuf, handle, epid, buf_ptr, - alloc_id); - return buf_ptr; -#else // if no drm, set up to return NULL as oneapi ipc handles don't work without drm - buf_ptr = NULL; - return buf_ptr; -#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) - -} - -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -void am_ze_memhandle_delete(void *buf_ptr) -{ - /* Release the reference to the buffer */ - PSMI_ONEAPI_ZE_CALL(zeMemFree, ze_context, buf_ptr); - -#ifndef PSM_HAVE_PIDFD - /* - * If pidfd is not used, we need to call GEM_CLOSE ioctl to remove the - * GEM handle from the handle cache of the peer device file's - * private file data in the kernel to avoid handle leak. However, we - * will have a potential risk condition that will fail a later request: - * (1) 3 requests with buf1, buf2, and buf1 are sent from sender side. - * Requests 1 and 3 uses the same buffer and therefore have the - * same gem_handle1. - * (2) buf1 is received and put into cache; - * (3) buf2 is received and buf1 is evicted from cache due to some - * condition (small cache size). As a result, gem_handle1 is closed - * through GEM_CLOSE ioctl. buf2 is put into cache. - * (4) Request 3 (with buf1) is received and HANDLE_TO_FD ioctl will - * fail because the gem_handle has been removed from peer device - * file's handle cache. - * For this reason, we prefer to leak the GEM handle over calling - * GEM_CLOSE. - */ -#endif -} -#endif /* HAVE_DRM or HAVE_LIBDRM */ - -void -am_ze_memhandle_release(am_ze_memhandle_cache_t cache, - void *buf_ptr) -{ -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - if (!cache) - am_ze_memhandle_delete(buf_ptr); -#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) - return; -} - -#endif /* PSM_ONEAPI */ diff --git a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h b/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h deleted file mode 100644 index 12539540507..00000000000 --- a/prov/psm3/psm3/ptl_am/am_oneapi_memhandle_cache.h +++ /dev/null @@ -1,97 +0,0 @@ -/* - - This file is provided under a dual BSD/GPLv2 license. When using or - redistributing this file, you may do so under either license. - - GPL LICENSE SUMMARY - - Copyright(c) 2022 Intel Corporation. - - This program is free software; you can redistribute it and/or modify - it under the terms of version 2 of the GNU General Public License as - published by the Free Software Foundation. - - This program is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - Contact Information: - Intel Corporation, www.intel.com - - BSD LICENSE - - Copyright(c) 2022 Intel Corporation. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the - distribution. - * Neither the name of Intel Corporation nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -*/ - -#ifdef PSM_ONEAPI - -#ifndef _AM_ONEAPI_MEMHANDLE_H -#define _AM_ONEAPI_MEMHANDLE_H - -#include "psm_user.h" -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define ONEAPI_MEMHANDLE_CACHE_SIZE 64 - -struct am_ze_memhandle_cache; // opaque since contains rbtree fields -typedef struct am_ze_memhandle_cache *am_ze_memhandle_cache_t; - -struct am_oneapi_ze_ipc_info { - uint32_t handle; /* GEM handle or file descriptor */ - uint8_t alloc_type; /* allocation type */ -}; -typedef struct am_oneapi_ze_ipc_info *am_oneapi_ze_ipc_info_t; - -psm2_error_t am_ze_memhandle_cache_alloc(am_ze_memhandle_cache_t *cachep, - uint32_t memcache_size, - psm2_mq_stats_t *stats); - -void * -am_ze_memhandle_acquire(am_ze_memhandle_cache_t cache, - uintptr_t sbuf, uint32_t handle, - psm2_epaddr_t epaddr, int device_index, - uint64_t alloc_id, uint8_t alloc_type); -void -am_ze_memhandle_release(am_ze_memhandle_cache_t cache, void *buf_ptr); - -void am_ze_memhandle_cache_free(am_ze_memhandle_cache_t cache); - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* _AM_ONEAPI_MEMHANDLE_H */ - -#endif /* PSM_ONEAPI */ diff --git a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c index 89dbdd6cd87..722f9fdbb1b 100644 --- a/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c +++ b/prov/psm3/psm3/ptl_am/am_reqrep_shmem.c @@ -59,6 +59,7 @@ #include #include #include +#include #include "psm_user.h" #include "psm_mq_internal.h" @@ -66,27 +67,6 @@ #include "cmarw.h" #include "psmi_wrappers.h" -#ifdef PSM_CUDA -#include "am_cuda_memhandle_cache.h" -#endif - -#ifdef PSM_ONEAPI -#include "am_oneapi_memhandle_cache.h" -#ifdef HAVE_DRM -#include -#include -#include -#endif -#ifdef HAVE_LIBDRM -#include -#include -#include -#endif -#ifdef PSM_HAVE_PIDFD -#include -#endif -#endif - /* AMLONG_PAYLOAD is number of bytes available in a bulk packet for payload. 
*/ #define AMLONG_PAYLOAD(FifoLong) ((FifoLong) - sizeof(am_pkt_bulk_t)) @@ -169,9 +149,9 @@ static uint32_t create_extra_ep_data() { uint32_t ret = getpid(); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* PID is at maximum 22 bits */ - ret |= my_gpu_device << 22; + ret |= psm3_my_gpu_device << 22; #endif return ret; @@ -190,12 +170,14 @@ static void am_update_directory(struct am_ctl_nodeinfo *, size_t segsz); static void amsh_atexit() { - static ips_atomic_t atexit_once = { 0 }; + static atomic_int atexit_once = 0; + int expected = 0; + psm2_ep_t ep; struct ptl_am *ptl; /* bail out if previous value is non-zero */ - if (ips_atomic_cmpxchg(&atexit_once, 0, 1) != 0) + if (!atomic_compare_exchange_strong(&atexit_once, &expected, 1)) return; ep = psm3_opened_endpoint; @@ -363,16 +345,7 @@ psm2_error_t psm3_shm_create(ptl_t *ptl_gen) } memset((void *) mapptr, 0, segsz); /* touch all of my pages */ -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, mapptr, segsz, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, ze_driver, - mapptr, segsz); -#endif + PSM3_GPU_REGISTER_HOSTMEM(mapptr, segsz); /* Our own ep's info for ptl_am resides at the start of the shm object. Other processes need some of this info to @@ -421,36 +394,8 @@ psm2_error_t psm3_do_unmap(struct am_ctl_nodeinfo *nodeinfo) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, (void*)nodeinfo->amsh_shmbase); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister((void*)nodeinfo->amsh_shmbase); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, - // (void *)nodeinfo->amsh_shmbase); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - (void *)nodeinfo->amsh_shmbase); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } +#ifdef PSM_HAVE_GPU + PSM3_GPU_UNREGISTER_HOSTMEM((void*)nodeinfo->amsh_shmbase); #endif if (munmap((void *)nodeinfo->amsh_shmbase, am_ctl_sizeof_seg(nodeinfo))) { err = @@ -583,15 +528,8 @@ psm2_error_t psm3_shm_map_remote(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t *shm // read every page in segment so faulted into our address space psm3_touch_mmap(dest_mapptr, segsz); -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && check_have_cuda_ctxt()) - PSMI_CUDA_CALL(cuMemHostRegister, dest_mapptr, segsz, - CU_MEMHOSTALLOC_PORTABLE); -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) - PSMI_ONEAPI_ZE_CALL(zexDriverImportExternalPointer, 
ze_driver, - dest_mapptr, segsz); +#ifdef PSM_HAVE_GPU + PSM3_GPU_REGISTER_HOSTMEM(dest_mapptr, segsz); #endif shmidx = -1; @@ -732,36 +670,8 @@ psm2_error_t psm3_shm_detach(ptl_t *ptl_gen) shm_unlink(ptl->amsh_keyname); psmi_free(ptl->amsh_keyname); -#if defined(PSM_CUDA) && !defined(PSM3_NO_CUDA_REGISTER) - if (PSMI_IS_GPU_ENABLED && cu_ctxt) { - /* ignore NOT_REGISTERED in case cuda initialized late */ - /* ignore other errors as context could be destroyed before this */ - CUresult cudaerr; - //PSMI_CUDA_CALL_EXCEPT(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED, - // cuMemHostUnregister, (void*)shmbase); - psmi_count_cuMemHostUnregister++; - cudaerr = psmi_cuMemHostUnregister((void*)shmbase); - if (cudaerr) { - const char *pStr = NULL; - psmi_count_cuGetErrorString++; - psmi_cuGetErrorString(cudaerr, &pStr); - _HFI_DBG("CUDA failure: cuMemHostUnregister returned %d: %s\n", - cudaerr, pStr?pStr:"Unknown"); - } - } -#endif -#if defined(PSM_ONEAPI) && !defined(PSM3_NO_ONEAPI_IMPORT) - if (PSMI_IS_GPU_ENABLED) { - ze_result_t result; - //PSMI_ONEAPI_ZE_CALL(zexDriverReleaseImportedPointer, ze_driver, - // (void *)shmbase); - psmi_count_zexDriverReleaseImportedPointer++; - result = psmi_zexDriverReleaseImportedPointer(ze_driver, - (void *)shmbase); - if (result != ZE_RESULT_SUCCESS) { - _HFI_DBG("OneAPI Level Zero failure: zexDriverReleaseImportedPointer returned %d: %s\n", result, psmi_oneapi_ze_result_to_string(result)); - } - } +#ifdef PSM_HAVE_GPU + PSM3_GPU_UNREGISTER_HOSTMEM((void*)shmbase); #endif if (munmap((void *)shmbase, am_ctl_sizeof_block(ptl))) { err = @@ -882,26 +792,11 @@ amsh_epaddr_add(ptl_t *ptl_gen, psm2_epid_t epid, uint16_t shmidx, psm2_epaddr_t amaddr->return_shmidx = -1; amaddr->cstate_outgoing = AMSH_CSTATE_OUTGOING_NONE; amaddr->cstate_incoming = AMSH_CSTATE_INCOMING_NONE; -#ifdef PSM_ONEAPI -#ifdef PSM_HAVE_PIDFD - amaddr->pidfd = syscall(SYS_pidfd_open, ptl->am_ep[shmidx].pid, 0); - if (amaddr->pidfd < 0) { - _HFI_ERROR("pidfd_open failed: pid %u, ret %d (%s)\n", - ptl->am_ep[shmidx].pid, amaddr->pidfd, - strerror(errno)); +#ifdef PSM_HAVE_GPU + err = PSM3_GPU_SHM_EPADDR_ADD(ptl, amaddr); + if (err) goto fail; - } -#else - amaddr->num_peer_fds = 0; - { - int i; - for (i=0; i < MAX_ZE_DEVICES; i++) - amaddr->peer_fds[i] = -1; - } - amaddr->sock_connected_state = ZE_SOCK_NOT_CONNECTED; - amaddr->sock = -1; #endif -#endif /* PSM_ONEAPI */ /* other setup */ ptl->am_ep[shmidx].epaddr = epaddr; @@ -952,23 +847,8 @@ amsh_epaddr_update(ptl_t *ptl_gen, psm2_epaddr_t epaddr) return; } -struct ptl_connection_req { - int isdone; - int op; /* connect or disconnect */ - int numep; - int numep_left; - int phase; - - int *epid_mask; - const psm2_epid_t *epids; /* input epid list */ - psm2_epaddr_t *epaddr; - psm2_error_t *errors; /* inout errors */ - - /* Used for connect/disconnect */ - psm2_amarg_t args[6]; -}; - static + void amsh_free_epaddr(ptl_t *ptl_gen, psm2_epaddr_t epaddr) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; @@ -979,29 +859,11 @@ void amsh_free_epaddr(ptl_t *ptl_gen, psm2_epaddr_t epaddr) psmi_assert(ptl->am_ep[amaddr->shmidx].epaddr == epaddr); if (ptl->am_ep[amaddr->shmidx].epaddr == epaddr) ptl->am_ep[amaddr->shmidx].epaddr = NULL; -#ifdef PSM_ONEAPI -#ifdef PSM_HAVE_PIDFD - if (amaddr->pidfd >= 0) - close(amaddr->pidfd); -#else - { - int i; - for (i=0; i < MAX_ZE_DEVICES; i++) - if (amaddr->peer_fds[i] >= 0) - close(amaddr->peer_fds[i]); - } - if (amaddr->sock >= 0) - close(amaddr->sock); -#endif -#endif /* PSM_ONEAPI */ + PSM3_GPU_SHM_EPADDR_FREE(amaddr); 
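/*
 * Illustrative sketch, not part of the patch: amsh_atexit() above now uses
 * C11 <stdatomic.h> for its run-once guard instead of the provider's
 * ips_atomic_cmpxchg().  The same pattern in isolation:
 */
#include <stdatomic.h>
#include <stdio.h>

static void run_once(void)
{
	static atomic_int once = 0;
	int expected = 0;

	/* only the first caller swaps 0 -> 1 and performs the work */
	if (!atomic_compare_exchange_strong(&once, &expected, 1))
		return;

	printf("cleanup runs exactly once\n");
}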
psmi_free(epaddr); return; } -#define PTL_OP_CONNECT 0 -#define PTL_OP_DISCONNECT 1 -#define PTL_OP_ABORT 2 - static psm2_error_t amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ @@ -1009,17 +871,17 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ const int array_of_epid_mask[], psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, - struct ptl_connection_req **req_o) + struct am_ptl_connection_req **req_o) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i, cstate; psm2_epaddr_t epaddr; psm2_epid_t epid; - struct ptl_connection_req *req = NULL; + struct am_ptl_connection_req *req = NULL; - req = (struct ptl_connection_req *) + req = (struct am_ptl_connection_req *) psmi_calloc(ptl->ep, PER_PEER_ENDPOINT, 1, - sizeof(struct ptl_connection_req)); + sizeof(struct am_ptl_connection_req)); if (req == NULL) return PSM2_NO_MEMORY; req->isdone = 0; @@ -1043,7 +905,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ req->epid_mask[i] = AMSH_CMASK_NONE; /* no connect by default */ if (!array_of_epid_mask[i]) continue; - if (op == PTL_OP_CONNECT) { + if (op == AM_PTL_OP_CONNECT) { epid = array_of_epid[i]; /* Connect only to other processes reachable by shared memory. The self PTL handles loopback communication, so explicitly @@ -1068,12 +930,11 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { array_of_epaddr[i] = epaddr; array_of_errors[i] = PSM2_OK; -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - // set done so know to check in amsh_ep_connreq_poll_dev_fds - req->epid_mask[i] = AMSH_CMASK_DONE; -#endif -#endif + if (PSM3_GPU_SHM_DEV_FDS_NEEDED()) { + // set done so know to check in + // PSM3_GPU_SHM_DEV_FDS_CONNEQ_POLL + req->epid_mask[i] = AMSH_CMASK_DONE; + } } else { psmi_assert(cstate == AMSH_CSTATE_OUTGOING_NONE); @@ -1092,7 +953,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ psmi_assert(epaddr != NULL); _HFI_CONNDBG("Disconnect force=%d epid %s\n", - (op == PTL_OP_ABORT), psm3_epid_fmt_internal(epaddr->epid, 0)); + (op == AM_PTL_OP_ABORT), psm3_epid_fmt_internal(epaddr->epid, 0)); cstate = ((am_epaddr_t *) epaddr)->cstate_outgoing; if (cstate == AMSH_CSTATE_OUTGOING_ESTABLISHED) { req->epid_mask[i] = AMSH_CMASK_PREREQ; @@ -1109,7 +970,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ if (req->numep_left == 0) { /* nothing to do */ psmi_free(req->epid_mask); psmi_free(req); - if (op != PTL_OP_ABORT) { + if (op != AM_PTL_OP_ABORT) { _HFI_CONNDBG("Nothing to connect, bump up phase\n"); ptl->connect_phase++; } @@ -1123,7 +984,7 @@ amsh_ep_connreq_init(ptl_t *ptl_gen, int op, /* connect, disconnect or abort */ static psm2_error_t -amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) +amsh_ep_connreq_poll(ptl_t *ptl_gen, struct am_ptl_connection_req *req) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; int i, j, cstate; @@ -1137,7 +998,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) psmi_assert_always(ptl->connect_phase == req->phase); - if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + if (req->op == AM_PTL_OP_DISCONNECT || req->op == AM_PTL_OP_ABORT) { for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE || req->epid_mask[i] == AMSH_CMASK_DONE) @@ -1164,7 +1025,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) psmi_assert(shmidx != 
(uint16_t)-1); req->args[2].u32w0 = create_extra_ep_data(); req->args[2].u32w1 = PSM2_OK; - if (req->op != PTL_OP_ABORT) + if (req->op != AM_PTL_OP_ABORT) req->args[3].u64w0 = (uint64_t) (uintptr_t) &req->errors[i]; else @@ -1229,12 +1090,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) ((am_epaddr_t *) epaddr)->cstate_outgoing = AMSH_CSTATE_OUTGOING_ESTABLISHED; req->epid_mask[i] = AMSH_CMASK_DONE; -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - if (PSMI_IS_GPU_ENABLED) - psm3_send_dev_fds(ptl_gen, epaddr); -#endif -#endif + PSM3_GPU_SHM_DEV_FDS_SEND(ptl, (struct am_epaddr *)epaddr); continue; } } @@ -1341,7 +1197,7 @@ amsh_ep_connreq_poll(ptl_t *ptl_gen, struct ptl_connection_req *req) static psm2_error_t -amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) +amsh_ep_connreq_fini(ptl_t *ptl_gen, struct am_ptl_connection_req *req) { struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err = PSM2_OK; @@ -1355,13 +1211,13 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) /* This prevents future connect replies from referencing data structures * that disappeared. For abort we aren't waiting for DISC_REP so * we want to keep same phase so we accept them after this function */ - if (req->op != PTL_OP_ABORT) + if (req->op != AM_PTL_OP_ABORT) ptl->connect_phase++; /* First process any leftovers in postreq or prereq */ for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE - || req->op == PTL_OP_ABORT) + || req->op == AM_PTL_OP_ABORT) continue; else if (req->epid_mask[i] == AMSH_CMASK_POSTREQ) { int cstate; @@ -1370,20 +1226,12 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) if (cstate == AMSH_CSTATE_OUTGOING_REPLIED) { ((am_epaddr_t *) req->epaddr[i])->cstate_outgoing = AMSH_CSTATE_OUTGOING_ESTABLISHED; -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD // late connect establish, check once to // see if have GPU dev fds, if not, this one // missed the timelimit and timesout - if (PSMI_IS_GPU_ENABLED && req->op == PTL_OP_CONNECT) - _HFI_CONNDBG("late established, special GPU dev FDs poll\n"); - if (PSMI_IS_GPU_ENABLED && req->op == PTL_OP_CONNECT && - PSM2_OK != psm3_check_dev_fds_exchanged(ptl_gen, - req->epaddr[i])) + if (PSM3_GPU_SHM_DEV_FDS_CHECK_EXCHANGED(ptl, req, i) != PSM2_OK) req->errors[i] = PSM2_TIMEOUT; else -#endif -#endif req->numep_left--; } else { /* never actually got reply */ req->errors[i] = PSM2_TIMEOUT; @@ -1403,7 +1251,7 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) for (i = 0; i < req->numep; i++) { if (req->epid_mask[i] == AMSH_CMASK_NONE) continue; - if (req->op == PTL_OP_ABORT + if (req->op == AM_PTL_OP_ABORT && req->epid_mask[i] != AMSH_CMASK_DONE) { req->epid_mask[i] = AMSH_CMASK_DONE; continue; @@ -1415,7 +1263,7 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) /* Only free epaddr if they have disconnected from us */ int cstate = ((am_epaddr_t *) req->epaddr[i])->cstate_incoming; if (cstate == AMSH_CSTATE_INCOMING_DISC_REQUESTED) { - if (req->op == PTL_OP_DISCONNECT || req->op == PTL_OP_ABORT) { + if (req->op == AM_PTL_OP_DISCONNECT || req->op == AM_PTL_OP_ABORT) { psmi_assert(req->epaddr[i] != NULL); amsh_free_epaddr(ptl_gen, req->epaddr[i]); req->epaddr[i] = NULL; @@ -1429,39 +1277,6 @@ amsh_ep_connreq_fini(ptl_t *ptl_gen, struct ptl_connection_req *req) return err; } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD -// check if all successful epid/epaddr in req have exchanged GPU dev FDs -// when called it assumes all 
the good epid have completed so it does not -// check failed epid and just treats them as done for this phase -// return: -// PSM2_OK - all that can be done are done -// PSM2_OK_NO_PROGRESS - more to be done -static -psm2_error_t -amsh_ep_connreq_poll_dev_fds(ptl_t *ptl_gen, struct ptl_connection_req *req) -{ - int num_left = 0; - int i; - - for (i = 0; i < req->numep; i++) { - if (req->epid_mask[i] == AMSH_CMASK_NONE) - continue; - if (req->epid_mask[i] != AMSH_CMASK_DONE || req->errors[i]) - continue; - psmi_assert(req->epaddr[i]); - psmi_assert(! psm3_epid_zero_internal(req->epaddr[i]->epid)); - if (PSM2_OK != psm3_check_dev_fds_exchanged(ptl_gen, req->epaddr[i])) - num_left++; - } - if (num_left == 0) - return PSM2_OK; - else - return PSM2_OK_NO_PROGRESS; // not done everyone yet -} -#endif -#endif /* PSM_ONEAPI */ - /* Wrapper for 2.0's use of connect/disconnect. The plan is to move the * init/poll/fini interface up to the PTL level for 2.2 */ #define CONNREQ_ZERO_POLLS_BEFORE_YIELD 20 @@ -1477,7 +1292,7 @@ amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, struct ptl_am *ptl = (struct ptl_am *)ptl_gen; psm2_error_t err; uint64_t t_start; - struct ptl_connection_req *req; + struct am_ptl_connection_req *req; int num_polls_noprogress = 0; static int shm_polite_attach = -1; @@ -1503,7 +1318,7 @@ amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, * there was an error */ return err; - if (op == PTL_OP_ABORT) { + if (op == AM_PTL_OP_ABORT) { int i; /* loop a couple times only, ignore timeout */ /* this will move from PREREQ to POSTREQ and check once @@ -1529,23 +1344,15 @@ amsh_ep_connreq_wrap(ptl_t *ptl_gen, int op, do { psm3_poll_internal(ptl->ep, 1, 0); err = amsh_ep_connreq_poll(ptl_gen, req); - if (err == PSM2_OK) -#ifndef PSM_ONEAPI - break; /* Finished before timeout */ -#elif !defined(PSM_HAVE_PIDFD) - { - if (PSMI_IS_GPU_ENABLED && req->op == PTL_OP_CONNECT) { - if (amsh_ep_connreq_poll_dev_fds(ptl_gen, req) == PSM2_OK) { + if (err == PSM2_OK) { + if (PSM3_GPU_IS_ENABLED && req->op == AM_PTL_OP_CONNECT) { + if (PSM3_GPU_SHM_DEV_FDS_CONNREQ_POLL(ptl, req) == PSM2_OK) break; /* Finished before timeout */ - } else { + else PSMI_YIELD(ptl->ep->mq->progress_lock); - } } else break; } -#else - break; -#endif else if (err != PSM2_OK_NO_PROGRESS) { psmi_free(req->epid_mask); psmi_free(req); @@ -1578,7 +1385,7 @@ amsh_ep_connect(ptl_t *ptl, psm2_error_t *array_of_errors, psm2_epaddr_t *array_of_epaddr, uint64_t timeout_ns) { - return amsh_ep_connreq_wrap(ptl, PTL_OP_CONNECT, numep, array_of_epid, + return amsh_ep_connreq_wrap(ptl, AM_PTL_OP_CONNECT, numep, array_of_epid, array_of_epid_mask, array_of_errors, array_of_epaddr, timeout_ns); } @@ -1591,7 +1398,7 @@ amsh_ep_disconnect(ptl_t *ptl, int force, int numep, psm2_error_t array_of_errors[], uint64_t timeout_ns) { return amsh_ep_connreq_wrap(ptl, - force ? PTL_OP_ABORT : PTL_OP_DISCONNECT, + force ? 
AM_PTL_OP_ABORT : AM_PTL_OP_DISCONNECT, numep, NULL, array_of_epaddr_mask, array_of_errors, array_of_epaddr, @@ -1746,14 +1553,8 @@ amsh_poll_internal_inner(ptl_t *ptl_gen, int replyonly, } while (!QISEMPTY(ptl->reqH.head->flag)); } } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - // play err safe, callers ignore errors or expect just OK or NO_PROGRESS - if (((struct ptl_am *)ptl_gen)->ep->need_dev_fds_poll - && psm3_poll_dev_fds_exchange(ptl_gen) != PSM2_OK_NO_PROGRESS) - err = PSM2_OK; -#endif -#endif + + err = PSM3_GPU_SHM_DEV_FDS_POLL((struct ptl_am *)ptl_gen, err); if (is_internal) { if (err == PSM2_OK) /* some progress, no yields */ @@ -2236,25 +2037,8 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, psm2_mq_tag_t *tag, const void *buf, uint32_t len) { -#ifdef PSM_ONEAPI psm2_amarg_t args[6]; -#else - psm2_amarg_t args[5]; -#endif psm2_error_t err = PSM2_OK; -#ifdef PSM_ONEAPI -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) -#ifndef PSM_HAVE_PIDFD - int fd; - int *devfds; - int numfds; - int device_index = 0; -#endif - uint64_t handle_fd = 0; - size_t total; -#endif -#endif - args[0].u32w0 = MQ_MSG_LONGRTS; args[0].u32w1 = len; @@ -2271,124 +2055,27 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, req->req_data.send_msglen = len; req->send_msgoff = 0; -#ifdef PSM_CUDA - /* If the send buffer is on gpu, we create a cuda IPC +#ifdef PSM_HAVE_GPU + /* If the send buffer is on gpu, we create a GPU IPC * handle and send it as payload in the RTS */ if (req->is_buf_gpu_mem) { - CUdeviceptr buf_base_ptr; - PSMI_CUDA_CALL(cuMemGetAddressRange, &buf_base_ptr, NULL, (CUdeviceptr)buf); - - /* Offset in GPU buffer from which we copy data, we have to - * send it separetly because this offset is lost - * when cuIpcGetMemHandle is called */ - req->cuda_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); - args[2].u32w0 = (uint32_t)req->cuda_ipc_offset; - - PSMI_CUDA_CALL(cuIpcGetMemHandle, - &req->cuda_ipc_handle, - (CUdeviceptr) buf); - if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psm3_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), NULL, 0); - } else { - psm3_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 5, (void*)&req->cuda_ipc_handle, - sizeof(CUipcMemHandle), 0); - } - req->cuda_ipc_handle_attached = 1; - } else -#elif defined(PSM_ONEAPI) - /* If the send buffer is on gpu, we create a oneapi IPC - * handle and send it as payload in the RTS */ - if (req->is_buf_gpu_mem) { -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - void *buf_base_ptr; -#ifndef PSM_HAVE_PIDFD - struct drm_prime_handle open_fd = {0, 0, 0}; -#endif - uint64_t alloc_id; - struct am_oneapi_ze_ipc_info info; - -#ifndef PSM_HAVE_PIDFD - devfds = psm3_ze_get_dev_fds(&numfds); - device_index = cur_ze_dev - ze_devices; /* index (offset) in table */ - args[5].u32w0 = device_index; - fd = devfds[device_index]; -#endif - PSMI_ONEAPI_ZE_CALL(zeMemGetAddressRange, ze_context, buf, &buf_base_ptr, &total); - - /* Offset in GPU buffer from which we copy data, we have to - * send it separetly because this offset is lost - * when zeMemGetIpcHandle is called */ - req->ze_ipc_offset = (uint32_t)((uintptr_t)buf - (uintptr_t)buf_base_ptr); - args[2].u32w0 = (uint32_t)req->ze_ipc_offset; - alloc_id = psm3_oneapi_ze_get_alloc_id(buf_base_ptr, &info.alloc_type); -#ifndef PSM_HAVE_PIDFD - args[5].u32w1 = (uint32_t)alloc_id; /* 32-bit for now */ -#else - args[5].u64w0 = alloc_id; -#endif - - 
PSMI_ONEAPI_ZE_CALL(zeMemGetIpcHandle, - ze_context, - (const void *)buf_base_ptr, - &req->ipc_handle); -#ifdef PSM_HAVE_ONEAPI_ZE_PUT_IPCHANDLE - PSMI_ONEAPI_ZE_CALL(zeMemGetFileDescriptorFromIpcHandleExp, ze_context, req->ipc_handle, &handle_fd); -#else - memcpy(&handle_fd, &req->ipc_handle, sizeof(uint32_t)); -#endif - req->ze_handle_attached = 1; -#ifndef PSM_HAVE_PIDFD - open_fd.fd = (uint32_t)handle_fd; - if (ioctl(fd, DRM_IOCTL_PRIME_FD_TO_HANDLE, &open_fd) < 0) { - struct ptl_am *ptl_am = (struct ptl_am *)ptl; - _HFI_ERROR("ioctl failed for DRM_IOCTL_PRIME_FD_TO_HANDLE: for fd %d: %s", open_fd.fd, strerror(errno)); - psm3_handle_error(ptl_am->ep, PSM2_INTERNAL_ERR, - "ioctl " - "failed for DRM_IOCTL_PRIME_FD_TO_HANDLE for fd %d: errno=%d", - open_fd.fd, errno); - err = PSM2_INTERNAL_ERR; + void *payload; + size_t payload_size; + union am_gpu_rts_payload info; + int narg; + err = PSM3_GPU_SHM_BUILD_RTS((struct ptl_am *)ptl, req, &narg, args, &payload, &payload_size, &info); + if (err) goto fail; - } - _HFI_VDBG("FD_TO_HANDLE: buf %p total 0x%lx base %p alloc_id %lu gem_handle %u\n", - buf, total, buf_base_ptr, alloc_id, open_fd.handle); - info.handle = open_fd.handle; if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psm3_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), NULL, 0); + args, narg, payload, payload_size, NULL, 0); } else { psm3_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), 0); + args, narg, payload, payload_size, 0); } - // for DRM approach once we have the open_fd we could - // PutIpcHandle(ipc_handle) since open_fd has a reference - // however since that is a legacy mode, we focus on the - // prefered mode and have both delay the Put until CTS received -#else - info.handle = (uint32_t)handle_fd; - if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { - psm3_am_reqq_add(AMREQUEST_SHORT, ptl, - epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), NULL, 0); - } else { - psm3_amsh_short_request(ptl, epaddr, mq_handler_hidx, - args, 6, (void *)&info, - sizeof(info), 0); - } -#endif /* PSM_HAVE_PIDFD */ -#else // if no drm, error out as oneapi ipc handles don't work without drm - err = PSM2_INTERNAL_ERR; - goto fail; -#endif // defined(HAVE_DRM) || defined(HAVE_LIBDRM) } else -#endif // defined(PSM_ONEAPI) +#endif /* PSM_HAVE_GPU */ if (req->flags_internal & PSMI_REQ_FLAG_FASTPATH) { psm3_am_reqq_add(AMREQUEST_SHORT, ptl, epaddr, mq_handler_hidx, args, 5, NULL, 0, NULL, 0); @@ -2402,10 +2089,8 @@ amsh_mq_rndv(ptl_t *ptl, psm2_mq_t mq, psm2_mq_req_t req, mq->stats.tx_rndv_num++; // tx_rndv_bytes tabulated when get CTS -#ifdef PSM_ONEAPI -#if !defined(PSM_HAVE_PIDFD) || !(defined(HAVE_DRM) || defined(HAVE_LIBDRM)) +#ifdef PSM_HAVE_GPU fail: -#endif #endif return err; } @@ -2485,9 +2170,9 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, psm2_amarg_t args[3]; psm2_error_t err = PSM2_OK; int is_blocking = (req == NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; - int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & gpu_p2p_supported(); + int ep_supports_p2p = (1 << ((am_epaddr_t *) epaddr)->gpuid) & PSM3_GPU_P2P_SUPPORTED(); if (PSM3_IS_BUFFER_GPU_MEM(ubuf, len)) { gpu_mem = 1; @@ -2503,7 +2188,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, goto do_rendezvous; } } else -#endif +#endif /* PSM_HAVE_GPU */ /* SENDSYNC gets priority, assume not used for MPI_isend 
w/INJECT */ /* otherwise use eager for INJECT as caller is waiting */ if ((flags_user & (PSM2_MQ_FLAG_SENDSYNC|PSM2_MQ_FLAG_INJECT)) @@ -2531,14 +2216,12 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, * mq->completed_q */ req->flags_internal |= (flags_internal | PSMI_REQ_FLAG_IS_INTERNAL); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void *host_buf = NULL; req->is_buf_gpu_mem = gpu_mem; if (req->is_buf_gpu_mem) { -#ifdef PSM_CUDA - psmi_cuda_set_attr_sync_memops(ubuf); -#endif + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); /* Use host buffer for blocking requests if GPU P2P is * unsupported between endpoints. @@ -2553,7 +2236,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, req->is_buf_gpu_mem = 0; } } -#endif +#endif /* PSM_HAVE_GPU */ err = amsh_mq_rndv(epaddr->ptlctl->ptl, mq, req, epaddr, tag, ubuf, len); @@ -2561,7 +2244,7 @@ amsh_mq_send_inner(psm2_mq_t mq, psm2_mq_req_t req, psm2_epaddr_t epaddr, err = psm3_mq_wait_internal(&req); } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (err == PSM2_OK && host_buf) psmi_free(host_buf); #endif @@ -2664,15 +2347,15 @@ int psm3_get_kassist_mode(int first_ep) union psmi_envvar_val env_kassist; const char *PSM3_KASSIST_MODE_HELP = "PSM Shared memory kernel assist mode " "(cma-put, cma-get, none)"; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // GPU limits KASSIST choices to cma-get or none const char *PSM3_KASSIST_MODE_GPU_HELP = "PSM Shared memory kernel assist mode " "(cma-get, none)"; #endif if (!psm3_getenv("PSM3_KASSIST_MODE", -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - PSMI_IS_GPU_ENABLED? +#ifdef PSM_HAVE_GPU + PSM3_GPU_IS_ENABLED? PSM3_KASSIST_MODE_GPU_HELP:PSM3_KASSIST_MODE_HELP, #else PSM3_KASSIST_MODE_HELP, @@ -2683,8 +2366,8 @@ int psm3_get_kassist_mode(int first_ep) &env_kassist)) { char *s = env_kassist.e_str; if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - ! PSMI_IS_GPU_ENABLED && +#ifdef PSM_HAVE_GPU + ! 
PSM3_GPU_IS_ENABLED && #endif strcasecmp(s, "cma-put") == 0) mode = PSM3_KASSIST_CMA_PUT; @@ -2783,12 +2466,7 @@ amsh_conn_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, ((am_epaddr_t *) epaddr)->pid = pid; ((am_epaddr_t *) epaddr)->gpuid = gpuid; } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - if (PSMI_IS_GPU_ENABLED) - psm3_send_dev_fds(ptl_gen, epaddr); -#endif -#endif + PSM3_GPU_SHM_DEV_FDS_SEND(ptl, (struct am_epaddr *)epaddr); /* Rewrite args */ ptl->connect_incoming++; @@ -3030,18 +2708,6 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) amsh_fifo_getconfig(ptl); -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - ptl->ep->ze_ipc_socket = -1; - if (PSMI_IS_GPU_ENABLED) { - if ((err = psm3_ze_init_ipc_socket(ptl_gen)) != PSM2_OK) - goto fail; - if ((err = psm3_ze_init_fds()) != PSM2_OK) - goto fail; - } -#endif -#endif - memset(&ptl->amsh_empty_shortpkt, 0, sizeof(ptl->amsh_empty_shortpkt)); memset(&ptl->psmi_am_reqq_fifo, 0, sizeof(ptl->psmi_am_reqq_fifo)); @@ -3095,49 +2761,9 @@ amsh_init(psm2_ep_t ep, ptl_t *ptl_gen, ptl_ctl_t *ctl) ctl->epaddr_stats_init = NULL; ctl->epaddr_stats_get = NULL; #endif -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) { - union psmi_envvar_val env_memcache_enabled; - psm3_getenv("PSM3_CUDA_MEMCACHE_ENABLED", - "PSM cuda ipc memhandle cache enabled (default is enabled)", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - 1, &env_memcache_enabled); - if (env_memcache_enabled.e_uint) { - union psmi_envvar_val env_memcache_size; - psm3_getenv("PSM3_CUDA_MEMCACHE_SIZE", - "Size of the cuda ipc memhandle cache ", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - CUDA_MEMHANDLE_CACHE_SIZE, &env_memcache_size); - if ((err = am_cuda_memhandle_cache_alloc(&ptl->memhandle_cache, - env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) - goto fail; - } - } -#endif -#ifdef PSM_ONEAPI - if (PSMI_IS_GPU_ENABLED) { - union psmi_envvar_val env_memcache_enabled; - psm3_getenv("PSM3_ONEAPI_MEMCACHE_ENABLED", - "PSM oneapi ipc memhandle cache enabled (default is enabled)", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - 1, &env_memcache_enabled); - if (env_memcache_enabled.e_uint) { - union psmi_envvar_val env_memcache_size; - psm3_getenv("PSM3_ONEAPI_MEMCACHE_SIZE", - "Size of the oneapi ipc memhandle cache ", - PSMI_ENVVAR_LEVEL_HIDDEN, PSMI_ENVVAR_TYPE_UINT, - (union psmi_envvar_val) - ONEAPI_MEMHANDLE_CACHE_SIZE, &env_memcache_size); -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - if ((err = am_ze_memhandle_cache_alloc(&ptl->memhandle_cache, - env_memcache_size.e_uint, &ep->mq->stats) != PSM2_OK)) - goto fail; -#endif - } - } +#ifdef PSM_HAVE_GPU + if ((err = PSM3_GPU_SHM_INIT(ptl, &ep->mq->stats)) != PSM2_OK) + goto fail; #endif fail: return err; @@ -3235,15 +2861,6 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) goto fail; } -#ifdef PSM_ONEAPI -#ifndef PSM_HAVE_PIDFD - if (PSMI_IS_GPU_ENABLED && (err_seg = psm3_sock_detach(ptl_gen))) { - err = err_seg; - goto fail; - } -#endif -#endif - /* This prevents poll calls between now and the point where the endpoint is * deallocated to reference memory that disappeared */ ptl->repH.head = &ptl->amsh_empty_shortpkt; @@ -3252,20 +2869,9 @@ static psm2_error_t amsh_fini(ptl_t *ptl_gen, int force, uint64_t timeout_ns) if (ptl->am_ep) psmi_free(ptl->am_ep); -#ifdef PSM_CUDA - if (ptl->memhandle_cache) - am_cuda_memhandle_cache_free(ptl->memhandle_cache); - ptl->memhandle_cache = NULL; -#endif 
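/*
 * Illustrative sketch, not part of the patch: the CUDA branch removed from
 * amsh_mq_rndv() above (now behind PSM3_GPU_SHM_BUILD_RTS) exported a CUDA IPC
 * handle for the send buffer's allocation and carried the buffer's offset from
 * the allocation base in the RTS, because cuIpcGetMemHandle() only identifies
 * the base.  A minimal stand-alone version of both halves, using the CUDA
 * driver API directly (error handling trimmed):
 */
#include <cuda.h>
#include <stdint.h>

/* sender: describe a device buffer so a peer process can map it */
static void export_for_peer(CUdeviceptr buf, CUipcMemHandle *handle,
			    uint32_t *offset)
{
	CUdeviceptr base;

	cuMemGetAddressRange(&base, NULL, buf);	/* allocation start */
	*offset = (uint32_t)(buf - base);	/* lost by the handle itself */
	cuIpcGetMemHandle(handle, buf);		/* refers to the base */
}

/* receiver: map the peer's allocation and recover the exact pointer */
static CUdeviceptr import_from_peer(CUipcMemHandle handle, uint32_t offset)
{
	CUdeviceptr base;

	cuIpcOpenMemHandle(&base, handle, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
	return base + offset;	/* close later with cuIpcCloseMemHandle(base) */
}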
-#ifdef PSM_ONEAPI -#if defined(HAVE_DRM) || defined(HAVE_LIBDRM) - if (ptl->memhandle_cache) - am_ze_memhandle_cache_free(ptl->memhandle_cache); -#endif - ptl->memhandle_cache = NULL; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && ptl->gpu_bounce_buf) +#ifdef PSM_HAVE_GPU + PSM3_GPU_SHM_FINALIZE(ptl); + if (PSM3_GPU_IS_ENABLED && ptl->gpu_bounce_buf) PSM3_GPU_HOST_FREE(ptl->gpu_bounce_buf); #endif return PSM2_OK; diff --git a/prov/psm3/psm3/ptl_am/psm_am_internal.h b/prov/psm3/psm3/ptl_am/psm_am_internal.h index 0796dbee9e9..091ae3e3edb 100644 --- a/prov/psm3/psm3/ptl_am/psm_am_internal.h +++ b/prov/psm3/psm3/ptl_am/psm_am_internal.h @@ -58,22 +58,9 @@ #include "am_config.h" #include "../psm_am_internal.h" -#ifdef PSM_CUDA -#include "am_cuda_memhandle_cache.h" -#endif -#ifdef PSM_ONEAPI -#include "am_oneapi_memhandle_cache.h" -#endif #define AMSH_DIRBLOCK_SIZE 128 -#ifdef PSM_ONEAPI -/* sock_connected_state state definitions */ -#define ZE_SOCK_NOT_CONNECTED 0 -#define ZE_SOCK_DEV_FDS_SENT 1 -#define ZE_SOCK_DEV_FDS_SENT_AND_RECD 2 -#endif - typedef struct am_epaddr { /* @@ -84,15 +71,8 @@ struct am_epaddr { uint16_t shmidx; uint16_t return_shmidx; -#ifdef PSM_ONEAPI -#ifdef PSM_HAVE_PIDFD - int pidfd; -#else - int num_peer_fds; - int peer_fds[MAX_ZE_DEVICES]; - int sock_connected_state; - int sock; -#endif +#ifdef PSM_HAVE_GPU + union am_epaddr_gpu_specific gpu_specific; #endif uint32_t cstate_outgoing:3; uint32_t cstate_incoming:3; @@ -105,6 +85,26 @@ struct am_epaddr { uint32_t gpuid:4; } am_epaddr_t; +struct am_ptl_connection_req { + int isdone; + int op; /* connect or disconnect */ + int numep; + int numep_left; + int phase; + + int *epid_mask; + const psm2_epid_t *epids; /* input epid list */ + psm2_epaddr_t *epaddr; + psm2_error_t *errors; /* inout errors */ + + /* Used for connect/disconnect */ + psm2_amarg_t args[6]; +}; + +#define AM_PTL_OP_CONNECT 0 +#define AM_PTL_OP_DISCONNECT 1 +#define AM_PTL_OP_ABORT 2 + /* Up to NSHORT_ARGS are supported via am_pkt_short_t; the remaining arguments are passed using space in am_pkt_bulk_t. One additional argument is added for passing the internal ptl_am handler index. */ @@ -466,13 +466,9 @@ struct ptl_am { struct am_ctl_nodeinfo *self_nodeinfo; /* our local advertized shm */ struct am_ctl_nodeinfo *am_ep; /* local array w/copy of each peer's info */ -#ifdef PSM_CUDA - am_cuda_memhandle_cache_t memhandle_cache; -#endif -#ifdef PSM_ONEAPI - am_ze_memhandle_cache_t memhandle_cache; -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU + union ptl_am_gpu_specific gpu_specific; + void *memhandle_cache; #define AMSH_GPU_BOUNCE_BUF_SZ (256*1024) void *gpu_bounce_buf; // for H to D #endif diff --git a/prov/psm3/psm3/ptl_am/ptl.c b/prov/psm3/psm3/ptl_am/ptl.c index a6af3c356ac..b37ac175357 100644 --- a/prov/psm3/psm3/ptl_am/ptl.c +++ b/prov/psm3/psm3/ptl_am/ptl.c @@ -59,13 +59,6 @@ #include "psm_am_internal.h" #include "cmarw.h" -#ifdef PSM_CUDA -#include "am_cuda_memhandle_cache.h" -#endif -#ifdef PSM_ONEAPI -#include "am_oneapi_memhandle_cache.h" -#endif - #ifdef PSM_FI /* * fault injection for psm3_cma_get() and psm3_cma_put(). 
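/*
 * Illustrative sketch, not part of the patch: the PSM3_GPU_* macros and the
 * *_gpu_specific unions used above hide the per-vendor (CUDA / oneAPI Level
 * Zero) code behind a single dispatch point defined elsewhere in this patch.
 * That real definition is not shown in these hunks, so every name below is
 * hypothetical; this is only the general shape of such a HAL.
 */
#include <stddef.h>

struct hypothetical_gpu_ops {
	int  (*register_hostmem)(void *buf, size_t len);
	void (*unregister_hostmem)(const void *buf);
	int  (*shm_init)(void *ptl, void *stats);
	void (*shm_finalize)(void *ptl);
};

/* one table per backend, selected once at initialization time */
extern struct hypothetical_gpu_ops hypothetical_cuda_ops;
extern struct hypothetical_gpu_ops hypothetical_ze_ops;
static struct hypothetical_gpu_ops *gpu_ops;	/* NULL when no GPU */

/* call sites stay vendor-neutral, as in the rewritten code above */
#define HYPOTHETICAL_GPU_REGISTER_HOSTMEM(buf, len) \
	(gpu_ops ? gpu_ops->register_hostmem((buf), (len)) : 0)
#define HYPOTHETICAL_GPU_UNREGISTER_HOSTMEM(buf) \
	do { if (gpu_ops) gpu_ops->unregister_hostmem(buf); } while (0)

/* per-connection vendor state likewise collapses into one union; the member
 * layout here is only a guess based on the fields removed above */
union hypothetical_gpu_epaddr_state {
	struct { int pidfd; int sock; } ze;
	/* other backends add their own members */
};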
@@ -110,63 +103,9 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, _HFI_VDBG("[shm][rndv][recv] req=%p dest=%p len=%d tok=%p\n", req, req->req_data.buf, req->req_data.recv_msglen, tok); -#ifdef PSM_CUDA - if (req->cuda_ipc_handle_attached) { - - CUdeviceptr cuda_ipc_dev_ptr = am_cuda_memhandle_acquire( - ptl->memhandle_cache, - req->rts_sbuf - req->cuda_ipc_offset, - (CUipcMemHandle*)&req->cuda_ipc_handle, - req->rts_peer->epid); - cuda_ipc_dev_ptr = cuda_ipc_dev_ptr + req->cuda_ipc_offset; - /* cuMemcpy into the receive side buffer - * based on its location */ - if (req->is_buf_gpu_mem) { - PSM3_GPU_MEMCPY_DTOD(req->req_data.buf, cuda_ipc_dev_ptr, - req->req_data.recv_msglen); - PSM3_GPU_SYNCHRONIZE_MEMCPY(); - } else { - PSM3_GPU_MEMCPY_DTOH(req->req_data.buf, cuda_ipc_dev_ptr, - req->req_data.recv_msglen); - } +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_SHM_RTSMATCH(ptl, req)) { gpu_ipc_send_completion = 1; - am_cuda_memhandle_release(ptl->memhandle_cache, - cuda_ipc_dev_ptr - req->cuda_ipc_offset); - req->cuda_ipc_handle_attached = 0; - goto send_cts; - } -#endif -#ifdef PSM_ONEAPI - if (req->ze_handle_attached) { - void *buf_ptr = am_ze_memhandle_acquire( - ptl->memhandle_cache, - req->rts_sbuf - req->ze_ipc_offset, req->ze_handle, - req->rts_peer, -#ifndef PSM_HAVE_PIDFD - req->ze_device_index, req->ze_alloc_id, -#else - 0, req->ze_alloc_id, -#endif - req->ze_alloc_type); - psmi_assert_always(buf_ptr != NULL); - buf_ptr = (uint8_t *)buf_ptr + req->ze_ipc_offset; - /* zeMemcpy into the receive side buffer - * based on its location */ - _HFI_VDBG("Copying src %p (offset 0x%x) dst %p msg_len %u\n", - buf_ptr, req->ze_ipc_offset, - req->req_data.buf, req->req_data.recv_msglen); - if (req->is_buf_gpu_mem) { - PSM3_GPU_MEMCPY_DTOD(req->req_data.buf, buf_ptr, - req->req_data.recv_msglen); - PSM3_GPU_SYNCHRONIZE_MEMCPY(); - } else { - PSM3_GPU_MEMCPY_DTOH(req->req_data.buf, buf_ptr, - req->req_data.recv_msglen); - } - gpu_ipc_send_completion = 1; - am_ze_memhandle_release(ptl->memhandle_cache, - (uint8_t *)buf_ptr - req->ze_ipc_offset); - req->ze_handle_attached = 0; goto send_cts; } #endif @@ -175,7 +114,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, if ((ptl->kassist_mode & PSM3_KASSIST_GET) && req->req_data.recv_msglen > 0 && (pid = psm3_epaddr_pid(epaddr))) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* If the buffer on the send side is on the host, * we alloc a bounce buffer, use kassist and then * do a cuMemcpy if the buffer on the recv side @@ -213,7 +152,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, /* Cuda library has recent optimizations where they do * not guarantee synchronus nature for Host to Device * copies for msg sizes less than 64k. The event record - * and synchronize calls are to guarentee completion. + * and synchronize calls are to guarantee completion. */ PSM3_GPU_SYNCHRONIZE_MEMCPY(); } else { @@ -230,7 +169,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); } -#else +#else /* PSM_HAVE_GPU */ /* cma can be done in handler context or not. 
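/*
 * Illustrative sketch, not part of the patch: the cma-get / cma-put
 * kernel-assist modes used by the rendezvous path above are built on Linux
 * cross-memory attach (process_vm_readv/process_vm_writev), which copies
 * directly between the address spaces of two cooperating processes given the
 * peer's pid and a pointer valid in the peer.  A bare-bones "get" equivalent:
 */
#define _GNU_SOURCE
#include <sys/uio.h>
#include <sys/types.h>

/* pull nbytes from remote_addr in process pid into our local buffer */
static ssize_t cma_get_example(pid_t pid, const void *remote_addr,
			       void *local_buf, size_t nbytes)
{
	struct iovec local = { .iov_base = local_buf, .iov_len = nbytes };
	struct iovec remote = { .iov_base = (void *)remote_addr,
				.iov_len = nbytes };

	/* may return fewer bytes than requested; callers loop as needed */
	return process_vm_readv(pid, &local, 1, &remote, 1, 0);
}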
*/ size_t nbytes; #ifdef PSM_FI @@ -243,7 +182,7 @@ ptl_handle_rtsmatch_request(psm2_mq_req_t req, int was_posted, if (nbytes == -1) goto fail_cma; psmi_assert_always(nbytes == req->req_data.recv_msglen); -#endif +#endif /* PSM_HAVE_GPU */ cma_succeed = 1; } @@ -330,7 +269,7 @@ psm3_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, default:{ void *sreq = (void *)(uintptr_t) args[3].u64w0; uintptr_t sbuf = (uintptr_t) args[4].u64w0; -#ifdef PSM_ONEAPI +#ifdef PSM_HAVE_GPU psmi_assert(narg == 5 || narg == 6); #else psmi_assert(narg == 5); @@ -343,38 +282,13 @@ psm3_am_mq_handler(void *toki, psm2_amarg_t *args, int narg, void *buf, req->rts_peer = tok->tok.epaddr_incoming; req->ptl_req_ptr = sreq; req->rts_sbuf = sbuf; -#ifdef PSM_CUDA - /* Payload in RTS would mean an IPC handle has been +#ifdef PSM_HAVE_GPU + /* Payload in RTS would mean a GPU IPC handle has been * sent. This would also mean the sender has to * send from a GPU buffer */ - if (buf && len > 0) { - req->cuda_ipc_handle = *((CUipcMemHandle*)buf); - req->cuda_ipc_handle_attached = 1; - req->cuda_ipc_offset = args[2].u32w0; - } -#endif -#ifdef PSM_ONEAPI - /* Payload in RTS would mean an IPC handle has been - * sent. This would also mean the sender has to - * send from a GPU buffer - */ - if (buf && len > 0) { - am_oneapi_ze_ipc_info_t info; - - psmi_assert(narg == 6); - info = (am_oneapi_ze_ipc_info_t)buf; - req->ze_handle = info->handle; - req->ze_alloc_type = info->alloc_type; - req->ze_handle_attached = 1; - req->ze_ipc_offset = args[2].u32w0; -#ifndef PSM_HAVE_PIDFD - req->ze_device_index = args[5].u32w0; - req->ze_alloc_id = args[5].u32w1; -#else - req->ze_alloc_id = args[5].u64w0; -#endif - } + if (buf && len > 0) + PSM3_GPU_SHM_PROCESS_RTS(req, buf, len, narg, args); #endif if (rc == MQ_RET_MATCH_OK) /* we are in handler context, issue a reply */ @@ -397,7 +311,7 @@ psm3_am_mq_handler_data(void *toki, psm2_amarg_t *args, int narg, void *buf, psm2_epaddr_t epaddr = (psm2_epaddr_t) tok->tok.epaddr_incoming; psm2_mq_req_t req = mq_eager_match(tok->mq, epaddr, 0); /* using seqnum 0 */ psmi_assert_always(req != NULL); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm3_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len, 0, NULL); #else psm3_mq_handle_data(tok->mq, req, args[2].u32w0, buf, len); @@ -419,34 +333,19 @@ psm3_am_mq_handler_rtsmatch(void *toki, psm2_amarg_t *args, int narg, void *buf, ptl_t *ptl = tok->ptl; psm2_mq_req_t sreq = (psm2_mq_req_t) (uintptr_t) args[0].u64w0; -#ifdef PSM_CUDA - /* If send side req has a cuda ipc handle attached then as soon as we - * get a CTS, we can assume the data has been copied and receiver now - * has a reference for the ipc handle for any receiver handle caching - */ - if (sreq->cuda_ipc_handle_attached) { - sreq->cuda_ipc_handle_attached = 0; - sreq->mq->stats.tx_shm_bytes += sreq->req_data.send_msglen; - sreq->mq->stats.tx_rndv_bytes += sreq->req_data.send_msglen; - psm3_mq_handle_rts_complete(sreq); - return; - } -#endif -#ifdef PSM_ONEAPI - /* If send side req has an ipc handle attached then as soon as we +#ifdef PSM_HAVE_GPU + /* If send side req has a GPU IPC handle attached then as soon as we * get a CTS, we can assume the data has been copied and receiver now * has a reference for the ipc handle for any receiver handle caching */ - if (sreq->ze_handle_attached) { - psm3_put_ipc_handle(sreq->req_data.buf - sreq->ze_ipc_offset, - sreq->ipc_handle); - sreq->ze_handle_attached = 0; + if (PSM3_GPU_SHM_PROCESS_CTS(sreq)) { 
sreq->mq->stats.tx_shm_bytes += sreq->req_data.send_msglen; sreq->mq->stats.tx_rndv_bytes += sreq->req_data.send_msglen; psm3_mq_handle_rts_complete(sreq); return; } #endif + void *dest = (void *)(uintptr_t) args[2].u64w0; uint32_t msglen = args[3].u32w0; psm2_amarg_t rarg[1]; diff --git a/prov/psm3/psm3/ptl_ips/ips_config.h b/prov/psm3/psm3/ptl_ips/ips_config.h index 1a253aa4a23..e66a651ed6c 100644 --- a/prov/psm3/psm3/ptl_ips/ips_config.h +++ b/prov/psm3/psm3/ptl_ips/ips_config.h @@ -69,6 +69,10 @@ #define IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT 128 #define IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT 16 +#define IPS_PROTO_FLOW_CREDITS_RC_MIN_DEFAULT 768 +#define IPS_PROTO_FLOW_CREDITS_RC_MAX_DEFAULT 960 +#define IPS_PROTO_FLOW_CREDITS_RC_MAX 2048 + /* Send retransmission */ #define IPS_PROTO_SPIO_RETRY_US_DEFAULT 2 /* in uS */ @@ -116,7 +120,7 @@ #define IPS_FAULTINJ_UFFD_REGISTER 1000 /* 1 every X uffd REGISTER ENOMEM */ #endif #endif /* PSM_HAVE_REG_MR */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define IPS_FAULTINJ_GDRMMAP 100 /* 1 every X GPU pin and mmap ENOMEM */ #define IPS_FAULTINJ_GPU_REG_MR 100 /* 1 every X GPU reg_mr */ #endif diff --git a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h index 221706ade25..3e65d85d864 100644 --- a/prov/psm3/psm3/ptl_ips/ips_expected_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_expected_proto.h @@ -124,19 +124,14 @@ struct ips_protoexp { /* services pend_getreqsq and pend_err_chk_rdma_resp */ struct psmi_timer timer_getreqs; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU STAILQ_HEAD(ips_tid_get_gpupend, /* pending GPU transfers */ ips_tid_get_request) gpupend_getreqsq; struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_recv_cfg; struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_small_recv_cfg; mpool_t gpu_hostbuf_pool_recv; mpool_t gpu_hostbuf_pool_small_recv; -#endif -#ifdef PSM_CUDA - CUstream cudastream_recv; -#elif defined(PSM_ONEAPI) - /* Will not be usd if psm3_oneapi_immed_async_copy */ - ze_command_queue_handle_t cq_recvs[MAX_ZE_DEVICES]; + union ips_protoexp_gpu_specific gpu_specific; #endif }; @@ -194,7 +189,7 @@ struct ips_tid_send_desc { uint8_t reserved:7; #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* As size of gpu_hostbuf is less than equal to window size, * there is a guarantee that the maximum number of host bufs we * would need to attach to a tidsendc would be 2 @@ -239,7 +234,7 @@ struct ips_tid_recv_desc { uint32_t tidflow_nswap_gen; psmi_seqnum_t tidflow_genseq; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf *gpu_hostbuf; uint8_t is_ptr_gpu_backed; #endif @@ -282,7 +277,7 @@ struct ips_tid_get_request { uint32_t tidgr_bytesdone; uint32_t tidgr_flags; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_hostbuf_used; uint32_t tidgr_gpu_bytesdone; STAILQ_HEAD(ips_tid_getreq_gpu_hostbuf_pend, /* pending exp. 
sends */ @@ -363,7 +358,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, ips_tid_session_list *tid_list, uint32_t tid_list_size); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // buffers for GPU send copy pipeline struct ips_gpu_hostbuf* psm3_ips_allocate_send_chb(struct ips_proto *proto, uint32_t nbytes, int allow_temp); diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.c b/prov/psm3/psm3/ptl_ips/ips_proto.c index 372dd75ea56..3705b052672 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto.c @@ -86,11 +86,6 @@ // to play safe we set max credit to 16384 #define IPS_MAX_CREDIT 16384 -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) -uint32_t gpudirect_rdma_send_limit; -uint32_t gpudirect_rdma_recv_limit; -#endif - static void ctrlq_init(struct ips_ctrlq *ctrlq, struct ips_proto *proto); #ifdef PSM_HAVE_REG_MR @@ -98,18 +93,19 @@ static psm2_error_t proto_sdma_init(struct ips_proto *proto); #endif static psm2_error_t ips_proto_register_stats(struct ips_proto *proto); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psmi_gpu_hostbuf_alloc_func(int is_alloc, void *context, void *obj) { struct ips_gpu_hostbuf *icb = (struct ips_gpu_hostbuf *)obj; if (is_alloc) { PSM3_GPU_HOSTBUF_LAZY_INIT(icb); + icb->host_buf = NULL; } else { PSM3_GPU_HOSTBUF_DESTROY(icb); } return; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ static int parse_flow_credits(const char *str, size_t errstr_size, char errstr[], @@ -191,18 +187,32 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, { /* Number of credits per flow */ union psmi_envvar_val env_flow_credits; +#ifdef PSM_VERBS + int min_credits = IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC ? + IPS_PROTO_FLOW_CREDITS_RC_MIN_DEFAULT : IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT; + int max_credits = IPS_PROTOEXP_FLAG_RDMA_QP(proto->ep->rdmamode) == IPS_PROTOEXP_FLAG_RDMA_USER_RC ? + IPS_PROTO_FLOW_CREDITS_RC_MAX_DEFAULT : IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT; int tvals[3] = { - min(IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT, num_of_send_desc), - min(IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT, num_of_send_desc), - IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT - }; + min(min_credits, num_of_send_desc), + min(max_credits, num_of_send_desc), + IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT + }; +#else + int tvals[3] = { + min(IPS_PROTO_FLOW_CREDITS_MIN_DEFAULT, num_of_send_desc), + min(IPS_PROTO_FLOW_CREDITS_MAX_DEFAULT, num_of_send_desc), + IPS_PROTO_FLOW_CREDITS_STEP_DEFAULT + }; +#endif char fcredits_def[32]; snprintf(fcredits_def, sizeof(fcredits_def), "%d:%d:%d", tvals[0], tvals[1], tvals[2]); (void)psm3_getenv_range("PSM3_FLOW_CREDITS", "Number of unacked packets (credits) per flow in ", "Specified as min:max:adjust where min and max is the range of credits,\n" - "and adjust is the adjustment amount for adjusting credits", + "and adjust is the adjustment amount for adjusting credits. For PSM3_RDMA=3,\n" + "adjust is ignored. 
Data send pauses when number of unacked packets is beyond\n" + "max credits, and send resumes when the number is below min credits", PSMI_ENVVAR_LEVEL_USER, PSMI_ENVVAR_TYPE_STR_TUPLES, (union psmi_envvar_val)fcredits_def, (union psmi_envvar_val)NULL, (union psmi_envvar_val)NULL, @@ -446,7 +456,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, goto fail; } if (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSM3_GPU_PREPARE_DTOH_MEMCPYS(proto); #endif if ((err = psm3_ips_protoexp_init(proto, protoexp_flags, @@ -474,59 +484,56 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, &proto->proto_am))) goto fail; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - is_gpudirect_enabled = psmi_parse_gpudirect(); - gpudirect_rdma_send_limit = psmi_parse_gpudirect_rdma_send_limit(0); - gpudirect_rdma_recv_limit = psmi_parse_gpudirect_rdma_recv_limit(0); +#ifdef PSM_HAVE_GPU + psm3_gpu_is_gpudirect_enabled = psmi_parse_gpudirect(); + psm3_gpu_gpudirect_rdma_send_limit = psmi_parse_gpudirect_rdma_send_limit(0); + psm3_gpu_gpudirect_rdma_recv_limit = psmi_parse_gpudirect_rdma_recv_limit(0); +#ifdef PSM_HAVE_RNDV_MOD if (psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT)) - is_driver_gpudirect_enabled = 1; + psm3_gpu_is_driver_gpudirect_enabled = 1; /* Check for mismatch between PSM3 and RV module */ -#ifdef PSM_CUDA - if (psmi_hal_has_cap(PSM_HAL_CAP_INTEL_GPU) && - !psmi_hal_has_cap(PSM_HAL_CAP_NVIDIA_GPU)) - is_driver_gpudirect_enabled = 0; + if (! psmi_hal_has_cap(PSM3_GPU_HAL_CAP_EXPECTED)) + psm3_gpu_is_driver_gpudirect_enabled = 0; #else - if (psmi_hal_has_cap(PSM_HAL_CAP_NVIDIA_GPU) && - !psmi_hal_has_cap(PSM_HAL_CAP_INTEL_GPU)) - is_driver_gpudirect_enabled = 0; + psm3_gpu_is_driver_gpudirect_enabled = 0; #endif - if (! is_gpudirect_enabled) { - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; - } else if (PSMI_IS_GPU_DISABLED) { -#ifdef PSM_CUDA - // should not happen since we don't dynamically disable CUDA - _HFI_INFO("WARNING: Non-CUDA application, PSM3_GPUDIRECT option ignored\n"); -#else - // should not happen since we don't dynamically disable ONEAPI_ZE - _HFI_INFO("WARNING: Non-ONEAPI_ZE application, PSM3_GPUDIRECT option ignored\n"); -#endif - is_gpudirect_enabled = 0; - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; - } else if (!device_support_gpudirect()) { + if (! psm3_gpu_is_gpudirect_enabled) { + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; + } else if (! PSM3_GPU_IS_ENABLED) { + // should not happen since we test psmi_parse_gpudirect earlier + // and it will trigger initialization of the proper GPU. Then + // we provide no disabling of the GPU per EP. + _HFI_INFO("WARNING: Non-GPU application, PSM3_GPUDIRECT option ignored\n"); + psm3_gpu_is_gpudirect_enabled = 0; + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; + } else if (!PSM3_GPU_GPUDIRECT_SUPPORTED()) { _HFI_INFO("WARNING: GPU device does not support GPU Direct, PSM3_GPUDIRECT option ignored\n"); - is_gpudirect_enabled = 0; - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; - } else if ( - PSMI_IS_DRIVER_GPUDIRECT_DISABLED) { + psm3_gpu_is_gpudirect_enabled = 0; + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; + } else if (! 
PSM3_GPU_IS_DRIVER_GPUDIRECT_ENABLED) { +#ifdef PSM_HAVE_RNDV_MOD + char buf[100]; + PSM3_GPU_RV_CAP_STRING(buf, sizeof(buf), PSM3_GPU_RV_CAPABILITY_EXPECTED); err = psm3_handle_error(PSMI_EP_NORETURN, PSM2_INTERNAL_ERR, -#ifdef PSM_CUDA - "Unable to start run, PSM3_GPUDIRECT requires rv module with CUDA support.\n"); + "Unable to start run, PSM3_GPUDIRECT requires rv module with %s support.\n", buf); #else - "Unable to start run, PSM3_GPUDIRECT requires rv module with ONEAPI_ZE support.\n"); + err = psm3_handle_error(PSMI_EP_NORETURN, + PSM2_INTERNAL_ERR, + "Unable to start run, PSM3_GPUDIRECT requires rv module with GPU support.\n"); #endif } else if (!(protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { // only GDR Copy and GPU Send DMA allowed - gpudirect_rdma_send_limit = gpudirect_rdma_recv_limit = 0; + psm3_gpu_gpudirect_rdma_send_limit = psm3_gpu_gpudirect_rdma_recv_limit = 0; } else { - if (gpudirect_rdma_send_limit) + if (psm3_gpu_gpudirect_rdma_send_limit) proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND; - if (gpudirect_rdma_recv_limit) + if (psm3_gpu_gpudirect_rdma_recv_limit) proto->flags |= IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV; } // from here forward can't use psmi_parse_gpudirect, - // must use is_gpudirect_enabled + // must use psm3_gpu_is_gpudirect_enabled /* The following cases need to be handled: * 1) GPU DIRECT is turned off but GDR COPY is turned on by the user or @@ -536,15 +543,15 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, * 2) GPU DIRECT is on but GDR COPY is turned off by the user - Leave *. this config as it is. */ - if (!is_gpudirect_enabled) - is_gdr_copy_enabled = gdr_copy_limit_send = - gdr_copy_limit_recv = 0; + if (!psm3_gpu_is_gpudirect_enabled) + psm3_gpu_is_gdr_copy_enabled = psm3_gpu_gdr_copy_limit_send = + psm3_gpu_gdr_copy_limit_recv = 0; /* technically this is not needed since we only consider GDRCopy Send * for TINY, SHORT, and single MTU RTS payload. But does no harm. 
*/ - gdr_copy_limit_send = min(gdr_copy_limit_send, proto->ep->mtu); + psm3_gpu_gdr_copy_limit_send = min(psm3_gpu_gdr_copy_limit_send, proto->ep->mtu); - if (PSMI_IS_GPU_ENABLED && + if (PSM3_GPU_IS_ENABLED && (protoexp_flags & IPS_PROTOEXP_FLAG_ENABLED)) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; @@ -613,7 +620,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, proto->gpu_hostbuf_send_cfg.bufsz, proto->gpu_prefetch_limit); } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR // we allocate MR cache here (as opposed to in protoexp) because @@ -629,7 +636,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, uint32_t default_cache_size_mb; // in megabytes uint32_t cache_pri_entries; uint64_t cache_pri_size; // in bytes -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint64_t cache_gpu_pri_size; // in bytes union psmi_envvar_val env_mr_cache_gpu_evict; #endif @@ -707,7 +714,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, PSMI_ENVVAR_TYPE_UINT, (union psmi_envvar_val)default_cache_entries, &env_mr_cache_entries); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // cache_gpu_pri_size only used to confirm RV GPU cache size // Without GPU Direct we will not register any GPU MRs // if we have GPU Direct w/o RDMA, no priority pin/MRs except @@ -716,17 +723,17 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // grow pri_entries to account for it // Note cache_pri_size == 0 if rdmamode not enabled cache_gpu_pri_size = 0; - if (PSMI_IS_GPU_ENABLED && is_gpudirect_enabled) { - if (gpudirect_rdma_send_limit || gpudirect_rdma_recv_limit) + if (PSM3_GPU_IS_ENABLED && psm3_gpu_is_gpudirect_enabled) { + if (psm3_gpu_gpudirect_rdma_send_limit || psm3_gpu_gpudirect_rdma_recv_limit) cache_gpu_pri_size = cache_pri_size; - if (gdr_copy_limit_send || gdr_copy_limit_recv) { + if (psm3_gpu_gdr_copy_limit_send || psm3_gpu_gdr_copy_limit_recv) { // min of one extra for GDRCopy - // largest recv with GDR copy is gdr_copy_limit_recv - // largest send with GDR copy is gdr_copy_limit_send + // largest recv with GDR copy is psm3_gpu_gdr_copy_limit_recv + // largest send with GDR copy is psm3_gpu_gdr_copy_limit_send cache_gpu_pri_size += ROUNDUP64P2(max(proto->epinfo.ep_mtu, - max(gdr_copy_limit_recv, - gdr_copy_limit_send)), + max(psm3_gpu_gdr_copy_limit_recv, + psm3_gpu_gdr_copy_limit_send)), PSMI_GPU_PAGESIZE); } psm3_getenv("PSM3_RV_GPU_CACHE_EVICT", @@ -737,13 +744,13 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, psm3_gpu_cache_evict = (uint64_t)env_mr_cache_gpu_evict.e_uint * 1024; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ proto->ep->mr_cache = proto->mr_cache = psm3_verbs_alloc_mr_cache(proto->ep, env_mr_cache_entries.e_uint, proto->ep->mr_cache_mode, env_mr_cache_size_mb.e_uint, cache_pri_entries, cache_pri_size -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , cache_gpu_pri_size #endif ); @@ -763,7 +770,7 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, _HFI_INFO("WARNING: Send DMA requires an MR Cache, disabling PSM3_SDMA\n"); proto->iovec_thresh_eager = proto->iovec_thresh_eager_blocking = ~0U; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU proto->iovec_gpu_thresh_eager = proto->iovec_gpu_thresh_eager_blocking = ~0U; #endif @@ -771,39 +778,34 @@ psm3_ips_proto_init(psm2_ep_t ep, const ptl_t *ptl, // without a real cache, Send DMA makes no sense psmi_assert(proto->ep->mr_cache_mode || 
proto->iovec_thresh_eager == ~0); psmi_assert(proto->ep->mr_cache_mode || proto->iovec_thresh_eager_blocking == ~0U); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // without a real cache, GPU Direct Send DMA makes no sense psmi_assert(proto->ep->mr_cache_mode || proto->iovec_gpu_thresh_eager == ~0); psmi_assert(proto->ep->mr_cache_mode || proto->iovec_gpu_thresh_eager_blocking == ~0U); #endif #endif /* PSM_HAVE_REG_MR */ -#ifdef PSM_CUDA - _HFI_DBG("Cuda %d GPU Direct support: driver %d GPU device %d\n", - is_cuda_enabled, is_driver_gpudirect_enabled, _device_support_gpudirect); -#elif defined(PSM_ONEAPI) - _HFI_DBG("OneAPI ZE %d GPU Direct support: driver %d GPU device %d\n", - is_oneapi_ze_enabled, is_driver_gpudirect_enabled, _device_support_gpudirect); -#endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU + _HFI_DBG("GPU ("PSM3_GPU_TYPES") Enabled %d (%s) GPU Direct support: driver %d GPU device %d\n", + PSM3_GPU_IS_ENABLED, PSM3_GPU_TYPE, psm3_gpu_is_driver_gpudirect_enabled, PSM3_GPU_GPUDIRECT_SUPPORTED()); _HFI_DBG("GDR Copy: %d limit send=%u recv=%u gpu_rndv=%u GPU RDMA flags=0x%x limit send=%u recv=%u\n", - is_gdr_copy_enabled, gdr_copy_limit_send, gdr_copy_limit_recv, + psm3_gpu_is_gdr_copy_enabled, psm3_gpu_gdr_copy_limit_send, psm3_gpu_gdr_copy_limit_recv, psm3_gpu_thresh_rndv, proto->flags & (IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV |IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND), - gpudirect_rdma_send_limit, gpudirect_rdma_recv_limit); + psm3_gpu_gpudirect_rdma_send_limit, psm3_gpu_gpudirect_rdma_recv_limit); #ifdef PSM_HAVE_REG_MR _HFI_DBG("send dma thresh: %u %u GPU send DMA thresh %u %u\n", proto->iovec_thresh_eager, proto->iovec_thresh_eager_blocking, proto->iovec_gpu_thresh_eager, proto->iovec_gpu_thresh_eager_blocking); #endif -#else /* PSM_CUDA || PSM_ONEAPI */ +#else /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR _HFI_DBG("send dma thresh: %u %u\n", proto->iovec_thresh_eager, proto->iovec_thresh_eager_blocking); #endif -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR _HFI_DBG("rdma: %u MR cache %u\n", proto->ep->rdmamode, proto->ep->mr_cache_mode); @@ -971,9 +973,7 @@ psm3_ips_proto_fini(struct ips_proto *proto) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) PSM3_GPU_SHUTDOWN_DTOH_MEMCPYS(proto); -#endif if ((err = psm3_ips_ibta_fini(proto))) goto fail; @@ -1038,8 +1038,8 @@ proto_sdma_init(struct ips_proto *proto) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (! is_gpudirect_enabled +#ifdef PSM_HAVE_GPU + if (! 
psm3_gpu_is_gpudirect_enabled || !psmi_hal_has_cap(PSM_HAL_CAP_GPUDIRECT_SDMA)) env_sdma.e_uint = 0; else @@ -1064,7 +1064,7 @@ proto_sdma_init(struct ips_proto *proto) env_hfiegr.e_uint; } } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ return err; } @@ -1200,7 +1200,7 @@ psm3_ips_proto_timer_ctrlq_callback(struct psmi_timer *timer, uint64_t t_cyc_exp cqe->msg_scb.flow, &cqe->msg_scb, cqe->msg_scb.cksum, 0, PSMI_TRUE, have_cksum, cqe->msg_scb.cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , 0 #endif ); @@ -1309,7 +1309,7 @@ psm3_ips_proto_send_ctrl_message(struct ips_flow *flow, uint8_t message_type, err = psmi_hal_transfer_frame(proto, flow, ctrlscb, payload, paylen, PSMI_TRUE, have_cksum, ctrlscb->cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , 0 #endif ); @@ -1492,7 +1492,7 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) scb->ips_lrh.flags & IPS_SEND_FLAG_PKTCKSUM, scb->cksum[0] -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , IS_TRANSFER_BUF_GPU_MEM(scb) #endif )) @@ -1529,9 +1529,9 @@ psm3_ips_proto_flow_flush_pio(struct ips_flow *flow, int *nflushed) #endif PSM2_LOG_PKT_STRM(PSM2_LOG_TX,&scb->ips_lrh,"PKT_STRM: payload_size=%d err: %d", scb->payload_size, err); - } else if (err == PSM2_TCP_DATA_SENT) { + } else if (err == PSM2_RELIABLE_DATA_SENT) { // no credits and timers - // TDB - implement credits for TCP + // TDB - implement credits for reliable send GENERIC_PERF_END(PSM_TX_SPEEDPATH_CTR); /* perf stats */ scb->scb_flags &= ~IPS_SEND_FLAG_PENDING; num_sent++; @@ -2036,7 +2036,7 @@ ips_proto_register_stats(struct ips_proto *proto) "also carry all or a portion of the message payload.\n" "Large Rendezvous messages may be broken into multiple " "window size chunks each with a separate CTS.\n" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU "When sending from a GPU application buffer the " "mechanisms include:\n" " - gdrcopy - Direct GPU copy via mmaping GPU memory\n" @@ -2051,7 +2051,7 @@ ips_proto_register_stats(struct ips_proto *proto) "application buffer when it posts the receive. 
" "With the exception of RDMA, all receive mechanisms " "involve some form of copy.\n" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU "When receiving into a GPU application buffer the " "mechanisms include:\n" " - gdrcopy - Direct GPU copy via mmaping GPU memory\n" @@ -2068,7 +2068,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("tiny_cpu_isend_bytes", "Tiny message bytes sent async from a CPU buffer", &proto->strat_stats.tiny_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("tiny_gdrcopy_isend", "Tiny messages sent async from a GPU buffer via GDR copy", &proto->strat_stats.tiny_gdrcopy_isend), @@ -2088,7 +2088,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("tiny_cpu_send_bytes", "Tiny message bytes sent sync from a CPU buffer", &proto->strat_stats.tiny_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("tiny_gdrcopy_send", "Tiny messages sent sync from a GPU buffer via GDR copy", &proto->strat_stats.tiny_gdrcopy_send), @@ -2114,7 +2114,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("tiny_sysbuf_recv_bytes", "Tiny message bytes received into a bounce buffer", &proto->strat_stats.tiny_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("tiny_gdrcopy_recv", "Tiny messages received into an application GPU buffer via GDR copy", &proto->strat_stats.tiny_gdrcopy_recv), @@ -2141,7 +2141,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("short_dma_cpu_isend_bytes", "Short message bytes sent async from a CPU buffer via send DMA", &proto->strat_stats.short_dma_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("short_gdrcopy_isend", "Short messages sent async from a GPU buffer via GDR copy", &proto->strat_stats.short_gdrcopy_isend), @@ -2173,7 +2173,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("short_dma_cpu_send_bytes", "Short message bytes sent sync from a CPU buffer via send DMA", &proto->strat_stats.short_dma_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("short_gdrcopy_send", "Short messages sent sync from a GPU buffer via GDR copy", &proto->strat_stats.short_gdrcopy_send), @@ -2206,7 +2206,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("short_sysbuf_recv_bytes", "Short message bytes received into a bounce buffer", &proto->strat_stats.short_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("short_gdrcopy_recv", "Short messages received into an application GPU buffer via GDR copy", &proto->strat_stats.short_gdrcopy_recv), @@ -2233,7 +2233,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("eager_dma_cpu_isend_bytes", "Eager message bytes sent async from a CPU buffer via send DMA", &proto->strat_stats.eager_dma_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("eager_cuCopy_isend", "Eager messages sent async from a GPU buffer via GPU copy", &proto->strat_stats.eager_cuCopy_isend), @@ -2259,7 +2259,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("eager_dma_cpu_send_bytes", "Eager message bytes sent sync from a CPU buffer via send DMA", &proto->strat_stats.eager_dma_cpu_send_bytes), -#if 
defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("eager_cuCopy_send", "Eager messages sent sync from a GPU buffer via GPU copy", &proto->strat_stats.eager_cuCopy_send), @@ -2286,7 +2286,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("eager_sysbuf_recv_bytes", "Eager message bytes received into a bounce buffer", &proto->strat_stats.eager_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("eager_gdrcopy_recv", "Eager messages received into an application GPU buffer via GDR copy", &proto->strat_stats.eager_gdrcopy_recv), @@ -2307,7 +2307,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_cpu_isend_bytes", "Rendezvous message bytes sent async from a CPU buffer", &proto->strat_stats.rndv_cpu_isend_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_gpu_isend", "Rendezvous messages sent async from a GPU buffer", &proto->strat_stats.rndv_gpu_isend), @@ -2321,7 +2321,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_cpu_send_bytes", "Rendezvous message bytes sent sync from a CPU buffer", &proto->strat_stats.rndv_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_gpu_send", "Rendezvous messages sent sync from a GPU buffer", &proto->strat_stats.rndv_gpu_send), @@ -2342,7 +2342,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_rts_sysbuf_recv_bytes", "RTS packet message bytes received into an bounce buffer", &proto->strat_stats.rndv_rts_sysbuf_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_rts_cuCopy_recv", "RTS packet messages received into an application GPU buffer via GPU copy", &proto->strat_stats.rndv_rts_cuCopy_recv), @@ -2363,7 +2363,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_long_cpu_recv_bytes", "Long Data rendezvous message bytes received into an application CPU buffer", &proto->strat_stats.rndv_long_cpu_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_long_cuCopy_recv", "Long Data rendezvous messages received into an application GPU buffer via GPU copy", &proto->strat_stats.rndv_long_cuCopy_recv), @@ -2390,7 +2390,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_long_dma_cpu_send_bytes", "Long Data rendezvous message bytes sent from a CPU buffer via send DMA", &proto->strat_stats.rndv_long_dma_cpu_send_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_long_cuCopy_send", "Long Data rendezvous messages sent from a GPU buffer via GPU copy", &proto->strat_stats.rndv_long_cuCopy_send), @@ -2417,7 +2417,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_rdma_cpu_recv_bytes", "RDMA rendezvous message bytes received direct into a CPU buffer", &proto->strat_stats.rndv_rdma_cpu_recv_bytes), -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_rdma_gdr_recv", "RDMA rendezvous messages received direct into a GPU buffer", &proto->strat_stats.rndv_rdma_gdr_recv), @@ -2437,7 +2437,7 @@ ips_proto_register_stats(struct ips_proto *proto) PSMI_STATS_DECLU64("rndv_rdma_cpu_send_bytes", "RDMA rendezvous message bytes sent from a CPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_cpu_send_bytes), -#if 
defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_STATS_DECLU64("rndv_rdma_gdr_send", "RDMA rendezvous messages sent from a GPU buffer via send RDMA", &proto->strat_stats.rndv_rdma_gdr_send), diff --git a/prov/psm3/psm3/ptl_ips/ips_proto.h b/prov/psm3/psm3/ptl_ips/ips_proto.h index 47bf7a50c1d..8c81c4dcb49 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto.h @@ -94,8 +94,8 @@ struct ips_epinfo { uint8_t ep_lmc; enum psm3_ibv_rate ep_link_rate; uint16_t ep_sl; /* PSM3_NIC_SL only when path record not used */ - uint32_t ep_mtu; // PSM payload after potential hdr & PSM3_MTU decrease - // or TCP increase beyond wire size + uint32_t ep_mtu; // PSM payload after potential hdr & PSM3_MTU adjustment + // for TCP and RC it can be beyond wire size uint16_t ep_pkey; /* PSM3_PKEY only when path record not used */ uint64_t ep_timeout_ack; /* PSM3_ERRCHK_TIMEOUT if no path record */ uint64_t ep_timeout_ack_max; @@ -356,7 +356,7 @@ struct ips_proto { uint32_t iovec_thresh_eager_blocking; #endif #ifdef PSM_HAVE_REG_MR -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU uint32_t iovec_gpu_thresh_eager; uint32_t iovec_gpu_thresh_eager_blocking; #endif @@ -431,21 +431,12 @@ struct ips_proto { void *opp_ctxt; struct opp_api opp_fn; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_send_cfg; struct ips_gpu_hostbuf_mpool_cb_context gpu_hostbuf_small_send_cfg; mpool_t gpu_hostbuf_pool_send; mpool_t gpu_hostbuf_pool_small_send; -#endif - -#ifdef PSM_CUDA - CUstream cudastream_send; -#elif defined(PSM_ONEAPI) - /* Will not be used if psm3_oneapi_immed_async_copy */ - ze_command_queue_handle_t cq_sends[MAX_ZE_DEVICES]; -#endif - -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) + union ips_proto_gpu_specific gpu_specific; unsigned gpu_prefetch_limit; #endif /* @@ -656,6 +647,21 @@ struct ips_epaddr { uint32_t use_max_inline_data; uint8_t rc_connected; + + // MR for flow->recv_seq_num + struct ibv_mr *recv_seq_mr; + // remote flow->recv_seq_num addr and rkey + uint64_t remote_recv_seq_addr; + uint32_t remote_recv_seq_rkey; + // psn num of remote flow->recv_seq_num + uint32_t remote_recv_psn; + // MR for remote flow->recv_seq_num storage + struct ibv_mr *remote_recv_psn_mr; + // indicare whether we have outstanding RDMA Read for + // remote flow->recv_seq_num + uint8_t remote_seq_outstanding; + // congestion control count + uint16_t cc_count; #endif /* USE_RC */ } verbs; #endif /* PSM_VERBS */ @@ -838,7 +844,7 @@ MOCK_DCL_EPILOGUE(psm3_ips_ibta_init); psm2_error_t psm3_ips_ibta_fini(struct ips_proto *proto); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_ALWAYS_INLINE( uint32_t ips_gpu_next_window(uint32_t max_window, uint32_t offset, uint32_t len)) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_connect.h b/prov/psm3/psm3/ptl_ips/ips_proto_connect.h index 51f1f9affcb..a586f9dff1a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_connect.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_connect.h @@ -123,15 +123,21 @@ struct ips_connect_reqrep { uint8_t reserved[16]; // 64b aligned // fields below can be zero depending on rdmamode - // TBD - we could combine the RDMA=1 and RDMA=2,3 - // sets of fields below into a union and save space - // or make room for more reserved space - - // For rndv module connection establishment, PSM3_RDMA=1 - // zero if no rndv mod RDMA - union ibv_gid gid; // sender's gid - uint32_t rv_index; // senders process index - uint32_t resv; // alignment 
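
Taken together, the new verbs fields above (recv_seq_mr, remote_recv_seq_addr/rkey, remote_recv_psn, remote_seq_outstanding) and the recv_addr/recv_rkey exchanged at connect time let a PSM3_RDMA=3 sender learn the peer's flow->recv_seq_num via RDMA Read and throttle on a PSN window rather than per-packet credit bookkeeping. The following is a minimal sketch of that pause/resume rule only; flow_credit_state, psn_distance and flow_can_send are hypothetical names, not the provider's structures, and 24-bit PSN arithmetic is assumed.

/* Sketch only: gate sending on the gap between our next send PSN and the
 * peer's last observed recv_seq_num (refreshed via RDMA Read). Limits
 * follow the PSM3_FLOW_CREDITS min:max pattern; 768:960 are the new RC
 * defaults added in this patch. */
#include <stdint.h>

struct flow_credit_state {
	uint32_t next_send_psn;   /* PSN of the next packet we would post */
	uint32_t remote_recv_psn; /* latest peer recv_seq_num we have seen */
	uint32_t credits_min;     /* resume threshold, e.g. 768 for RC */
	uint32_t credits_max;     /* pause threshold, e.g. 960 for RC */
	int      paused;
};

static inline uint32_t psn_distance(uint32_t a, uint32_t b)
{
	return (a - b) & 0xffffff;   /* 24-bit PSN space, wrap-safe */
}

static inline int flow_can_send(struct flow_credit_state *s)
{
	uint32_t inflight = psn_distance(s->next_send_psn, s->remote_recv_psn);

	if (inflight >= s->credits_max)
		s->paused = 1;           /* too many unacked packets: pause */
	else if (inflight <= s->credits_min)
		s->paused = 0;           /* backlog drained below min: resume */
	return !s->paused;
}

With the RC defaults above, sending would pause once roughly 960 packets are unacked and resume once the backlog drains below 768, matching the behavior described in the PSM3_FLOW_CREDITS help text for PSM3_RDMA=3.
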
+ union { + struct { + // For rndv module connection establishment, PSM3_RDMA=1 + // zero if no rndv mod RDMA + union ibv_gid gid; // sender's gid + uint32_t rv_index; // senders process index + uint32_t resv; // alignment + } rv; + struct { + // For PSM3_RDMA=3 only + uint64_t recv_addr; + uint32_t recv_rkey; + uint8_t resv[12]; + } urc; // user space RC + }; // For user space RC QP connection establishment // only set for USE_RC with PSM3_RDMA=2 or 3 diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c index 4cc1ebc701b..8b69b9d34d8 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_expected.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_expected.c @@ -102,7 +102,7 @@ static psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc); #endif // PSM_HAVE_RDMA static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psmi_gpu_run_prefetcher(struct ips_protoexp *protoexp, struct ips_tid_send_desc *tidsendc); @@ -252,8 +252,8 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, #endif #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { struct psmi_rlimit_mpool rlim = GPU_HOSTBUFFER_LIMITS; uint32_t maxsz, chunksz, max_elements; uint32_t pool_num_obj_max_total; @@ -323,7 +323,7 @@ MOCKABLE(psm3_ips_protoexp_init)(const struct ips_proto *proto, return err; fail: -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (protoexp != NULL && protoexp->gpu_hostbuf_pool_recv != NULL) psm3_mpool_destroy(protoexp->gpu_hostbuf_pool_recv); if (protoexp != NULL && protoexp->gpu_hostbuf_pool_small_recv != NULL) @@ -346,9 +346,8 @@ psm2_error_t psm3_ips_protoexp_fini(struct ips_protoexp *protoexp) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if(PSMI_IS_GPU_ENABLED && - !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) { +#ifdef PSM_HAVE_GPU + if (PSM3_GPU_IS_ENABLED) { psm3_mpool_destroy(protoexp->gpu_hostbuf_pool_small_recv); psm3_mpool_destroy(protoexp->gpu_hostbuf_pool_recv); PSM3_GPU_SHUTDOWN_HTOD_MEMCPYS(protoexp); @@ -483,12 +482,12 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, getreq->tidgr_bytesdone = 0; getreq->tidgr_flags = flags; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if ((req->is_buf_gpu_mem && !(protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV)) || ((req->is_buf_gpu_mem && (protoexp->proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_RECV) && - (length > gpudirect_rdma_recv_limit + (length > psm3_gpu_gpudirect_rdma_recv_limit || length & 0x03 || (uintptr_t)buf & 0x03 )))) { getreq->gpu_hostbuf_used = 1; @@ -505,14 +504,14 @@ psm3_ips_protoexp_tid_get_from_token(struct ips_protoexp *protoexp, #endif protoexp->proto->strat_stats.rndv_rdma_cpu_recv++; protoexp->proto->strat_stats.rndv_rdma_cpu_recv_bytes += length; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } } #endif /* nbytes is the bytes each channel should transfer. 
*/ count = ((ips_epaddr_t *) epaddr)->msgctl->ipsaddr_count; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) nbytes = PSMI_ALIGNUP((length + count - 1) / count, PSMI_GPU_PAGESIZE); else @@ -632,7 +631,7 @@ psm3_ips_protoexp_send_tid_grant(struct ips_tid_recv_desc *tidrecvc) } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psm3_ips_deallocate_send_chb(struct ips_gpu_hostbuf* chb, int reset) { if (chb->is_tempbuf) { @@ -672,7 +671,7 @@ ips_protoexp_tidsendc_complete(struct ips_tid_send_desc *tidsendc) } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->gpu_hostbuf_used) { if (tidsendc->gpu_num_buf == 1) { tidsendc->gpu_hostbuf[0]->bytes_read += @@ -1237,7 +1236,7 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, } #endif if (_HFI_PDBG_ON) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidrecvc->is_ptr_gpu_backed) _HFI_PDBG_DUMP_GPU_ALWAYS(tidrecvc->buffer, len); else @@ -1269,7 +1268,7 @@ int ips_protoexp_handle_immed_data(struct ips_proto *proto, uint64_t conn_ref, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static psm2_error_t psmi_gpu_reclaim_hostbufs(struct ips_tid_get_request *getreq) @@ -1565,7 +1564,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, _HFI_MMDBG("tidsendc created userbuf %p buffer %p length %u\n", tidsendc->userbuf, tidsendc->buffer, tidsendc->length); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Matching on previous prefetches and initiating next prefetch */ struct ips_gpu_hostbuf *chb = NULL, *chb_next = NULL; psm2_chb_match_type_t rc = PSMI_GPU_CONTINUE; @@ -1638,7 +1637,7 @@ psm3_ips_tid_send_handle_tidreq(struct ips_protoexp *protoexp, protoexp->proto->strat_stats.rndv_rdma_gdr_send++; protoexp->proto->strat_stats.rndv_rdma_gdr_send_bytes += tid_list->tsess_length; } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { protoexp->proto->strat_stats.rndv_rdma_cpu_send++; protoexp->proto->strat_stats.rndv_rdma_cpu_send_bytes += tid_list->tsess_length; @@ -1716,7 +1715,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) // no need to register again err = PSM2_OK; } else if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ! 
tidsendc->mqreq->gpu_hostbuf_used && #endif // separate MR cache's per EP, so this confirms we have the same EP @@ -1730,7 +1729,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) _HFI_MMDBG("CTS send chunk register send: %p %u bytes\n", tidsendc->buffer , tidsendc->length); tidsendc->mr = psm3_verbs_reg_mr(proto->mr_cache, 1, tidsendc->buffer, tidsendc->length, IBV_ACCESS_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU | (PSM3_GPU_ADDR_SEND_MR(tidsendc->mqreq) ?IBV_ACCESS_IS_GPU_ADDR:0) #endif @@ -1775,7 +1774,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) } if (err == PSM2_OK) { if (_HFI_PDBG_ON) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidsendc->mqreq->is_buf_gpu_mem && !tidsendc->mqreq->gpu_hostbuf_used) _HFI_PDBG_DUMP_GPU_ALWAYS(tidsendc->buffer, tidsendc->tid_list.tsess_length); else @@ -1803,7 +1802,7 @@ psm2_error_t ips_tid_issue_rdma_write(struct ips_tid_send_desc *tidsendc) } if (err == PSM2_OK) { if (_HFI_PDBG_ON) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidsendc->mqreq->is_buf_gpu_mem && !tidsendc->mqreq->gpu_hostbuf_used) _HFI_PDBG_DUMP_GPU_ALWAYS(tidsendc->buffer, tidsendc->tid_list.tsess_length); else @@ -1840,12 +1839,12 @@ static psm2_error_t ips_tid_send_exp(struct ips_tid_send_desc *tidsendc) { psm2_error_t err = PSM2_OK; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_protoexp *protoexp = tidsendc->protoexp; #endif _HFI_MMDBG("ips_tid_send_exp\n"); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf *chb, *chb_next; uint32_t offset_in_chb, i; // wait for async copies into needed prefetcher chb's to finish @@ -2005,7 +2004,7 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, ips_scb_t *grantscb; #ifdef PSM_VERBS psm2_mq_req_t req = getreq->tidgr_req; -#elif defined(PSM_CUDA) || defined(PSM_ONEAPI) +#elif defined(PSM_HAVE_GPU) psm2_mq_req_t req = getreq->tidgr_req; #endif #if defined(PSM_VERBS) @@ -2046,7 +2045,7 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, tidrecvc->mr = NULL; // be safe,but should be NULL since clear on release #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) tidrecvc->is_ptr_gpu_backed = !getreq->gpu_hostbuf_used; else @@ -2095,17 +2094,17 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, getreq->tidgr_offset); tidrecvc->gpu_hostbuf = NULL; } -#else // PSM_CUDA || PSM_ONEAPI +#else /* PSM_HAVE_GPU */ tidrecvc->buffer = (void *)((uintptr_t) getreq->tidgr_lbuf + getreq->tidgr_offset); -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #if defined(PSM_SOCKETS) && PSMI_HAL_INST_CNT == 1 psmi_assert_always(0); // should not get here #elif defined(PSM_VERBS) // separate MR cache's per EP, so this confirms we have the same EP if ( -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU ! 
getreq->gpu_hostbuf_used && #endif req->mr && req->mr->cache == proto->mr_cache) { @@ -2115,12 +2114,12 @@ ips_tid_recv_alloc(struct ips_protoexp *protoexp, _HFI_MMDBG("CTS chunk register recv: %p %u bytes\n", tidrecvc->buffer, nbytes_this); tidrecvc->mr = psm3_verbs_reg_mr(proto->mr_cache, 1, tidrecvc->buffer, nbytes_this, IBV_ACCESS_RDMA|IBV_ACCESS_REMOTE_WRITE -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - | (PSM3_GPU_ADDR_RECV_MR(tidrecvc, getreq)?IBV_ACCESS_IS_GPU_ADDR:0) +#ifdef PSM_HAVE_GPU + | (PSM3_GPU_ADDR_RECV_MR(tidrecvc, getreq->gpu_hostbuf_used)?IBV_ACCESS_IS_GPU_ADDR:0) #endif ); if (! tidrecvc->mr) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (chb) psm3_mpool_put(chb); #endif @@ -2220,7 +2219,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) #endif #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* due to unaligned recv using hostbuf, must always do this */ { /* Before processing pending TID requests, first try to free up @@ -2289,7 +2288,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (getreq->gpu_hostbuf_used) { /* If this is a large transfer, we may be able to * start reclaiming before all of the data is sent. */ @@ -2322,7 +2321,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) * async cuda copies to fill it, so the extra CTS is minimal * impact to the sender. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm2_mq_req_t req = getreq->tidgr_req; if (req->is_buf_gpu_mem){ if (((getreq->tidgr_offset + nbytes_this) < @@ -2392,7 +2391,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) getreq->tidgr_length); if (getreq->tidgr_offset == getreq->tidgr_length) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (getreq->gpu_hostbuf_used) { /* this completes the tid xfer setup. 
move to the pending cuda ops queue, @@ -2446,7 +2445,7 @@ ips_tid_pendtids_timer_callback(struct psmi_timer *timer, uint64_t current) } #ifdef PSM_HAVE_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) { @@ -2463,7 +2462,7 @@ void psmi_cudamemcpy_tid_to_device(struct ips_tid_recv_desc *tidrecvc) tidrecvc->gpu_hostbuf = NULL; ips_tid_pendtids_timer_callback(&tidrecvc->getreq->tidgr_protoexp->timer_getreqs,0); } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #endif // PSM_HAVE_RDMA #ifdef PSM_HAVE_RDMA @@ -2479,7 +2478,7 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) psmi_assert(getreq != NULL); psmi_assert(tidrecvc->state == TIDRECVC_STATE_BUSY); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (tidrecvc->gpu_hostbuf) psmi_cudamemcpy_tid_to_device(tidrecvc); #endif @@ -2502,7 +2501,7 @@ psm2_error_t ips_tid_recv_free(struct ips_tid_recv_desc *tidrecvc) psm3_ips_tf_deallocate(&protoexp->tfc, tidrecvc->rdescid._desc_idx, 1); if (getreq->tidgr_bytesdone == getreq->tidgr_length) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* if cuda, we handle callbacks when the cuda xfer is done */ if (!getreq->gpu_hostbuf_used) { if (getreq->tidgr_callback) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c index a4f71ab8e5e..286b59507ca 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_mq.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_mq.c @@ -149,7 +149,7 @@ int ips_proto_mq_eager_complete(void *reqp, uint32_t nbytes) * completion notification sent to the sender, this is the only place * where send side chb's can be freed and put back into the mpool. */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU struct ips_gpu_hostbuf *chb; if (req->gpu_hostbuf_used) { while (!STAILQ_EMPTY(&req->sendreq_prefetch)) { @@ -202,8 +202,8 @@ ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) unsigned char *dest = vdest; const unsigned char *src = vsrc; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(vdest) || PSMI_IS_GPU_MEM(vsrc))) { +#ifdef PSM_HAVE_GPU + if ((PSM3_IS_GPU_MEM(vdest) || PSM3_IS_GPU_MEM(vsrc))) { PSM3_GPU_MEMCPY(vdest, vsrc, nchars); return; } @@ -223,7 +223,7 @@ ips_shortcpy(void *vdest, const void *vsrc, uint32_t nchars)) return; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU PSMI_ALWAYS_INLINE( void ips_shortcpy_host_mem(void *vdest, const void *vsrc, uint32_t nchars)) @@ -356,7 +356,7 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { // flags will get handled in pio transfer_frame // but use cuMemcpy instead of GDRCopy @@ -369,7 +369,7 @@ ips_ptl_mq_eager(struct ips_proto *proto, psm2_mq_req_t req, // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; } -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ buf += pktlen; offset += pktlen; @@ -463,7 +463,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, // small synchronous payload is sent in RTS itself // CTS becomes the synchronous ACK if (len <= flow->frag_size && -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU !req->is_buf_gpu_mem && #endif (psmi_hal_has_cap(PSM_HAL_CAP_NON_DW_PKT_SIZE) || !(len & 0x3))) { @@ -476,7 +476,7 @@ ips_ptl_mq_rndv(struct ips_proto 
*proto, psm2_mq_req_t req, req->send_msgoff = 0; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Used to indicate to the receiver that the send * is issued on a device buffer. This helps the * receiver select TID instead of using eager buffers. @@ -492,7 +492,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, (len > GPUDIRECT_THRESH_RV)) || ((proto->flags & IPS_PROTO_FLAG_GPUDIRECT_RDMA_SEND) && req->is_buf_gpu_mem && - (len > gpudirect_rdma_send_limit))) { + (len > psm3_gpu_gpudirect_rdma_send_limit))) { /* send from intermediate host buffer */ _HFI_VDBG("send from intermediate host buffer\n"); struct ips_gpu_hostbuf *chb; @@ -566,7 +566,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, // length, etc below) // // register buffer we will use as source for RDMA Write - // for PSM_CUDA/PSM_ONEAPI, a group of host bounce buffers may be used above + // for GPU, a group of host bounce buffers may be used above // ips_scb_buffer catches when RTS contains the data, in which case no // need for memory registration. While unlkely we also skip // registration for zero length sync messages @@ -576,14 +576,14 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, && proto->protoexp /* expected tid recieve enabled */ && ips_epaddr_rdma_connected(ipsaddr) && !req->mr -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - && (!PSMI_IS_GPU_ENABLED || len > GPUDIRECT_THRESH_RV) +#ifdef PSM_HAVE_GPU + && (!PSM3_GPU_IS_ENABLED || len > GPUDIRECT_THRESH_RV) && ! req->gpu_hostbuf_used #endif ) { req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, req->req_data.buf, req->req_data.send_msglen, IBV_ACCESS_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU | (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0) #endif ); @@ -610,7 +610,7 @@ ips_ptl_mq_rndv(struct ips_proto *proto, psm2_mq_req_t req, return err; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU static inline int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, uint32_t flags_user) @@ -623,7 +623,7 @@ int psm3_is_needed_rendezvous(struct ips_proto *proto, uint32_t len, return 0; } -#endif //PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ psm2_error_t @@ -637,9 +637,9 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ips_epaddr_t *ipsaddr; ips_scb_t *scb; psm2_mq_req_t req; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ req = psm3_mq_req_alloc(mq, MQE_TYPE_SEND); if_pf(req == NULL) @@ -666,12 +666,12 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user req->req_data.tag = *tag; req->req_data.context = context; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); req->gpu_hostbuf_used = 0; if (req->is_buf_gpu_mem) { gpu_mem = 1; - PSM3_MARK_BUF_SYNCHRONOUS(ubuf); + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); if (psm3_is_needed_rendezvous(proto, len, 0)) goto do_rendezvous; } @@ -692,7 +692,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void *user_buffer = ubuf; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, (uint32_t *) user_buffer, len); @@ -707,7 +707,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t 
flags_user * memcpy to move data between HFI resources * and the GPU */ - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len, 0, proto->ep))) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, @@ -763,11 +763,11 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user #ifdef PSM_HAVE_REG_MR int used_send_dma = 0; #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len , 0, proto->ep))) { /* init req so ips_proto_mq_eager_complete can unmap */ @@ -802,7 +802,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } } } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { #ifdef PSM_HAVE_REG_MR if (len > proto->iovec_thresh_eager) { @@ -894,7 +894,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user } else if (len <= mq->rndv_nic_thresh) { req->send_msgoff = 0; req->rts_peer = (psm2_epaddr_t) ipsaddr; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here @@ -913,7 +913,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user proto->strat_stats.eager_cuCopy_isend_bytes += len; } } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here @@ -943,7 +943,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user len, tag->tag[0], tag->tag[1], tag->tag[2], req); } else { /* skip eager accounting below */ do_rendezvous: -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { proto->strat_stats.rndv_gpu_isend++; proto->strat_stats.rndv_gpu_isend_bytes += len; @@ -969,7 +969,7 @@ psm3_ips_proto_mq_isend(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags_user mq->stats.tx_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { mq->stats.tx_eager_gpu_num++; mq->stats.tx_eager_gpu_bytes += len; @@ -992,7 +992,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ips_epaddr_t *ipsaddr; ips_scb_t *scb; -#if defined(PSM_CUDA) || defined (PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int gpu_mem = 0; #endif @@ -1010,10 +1010,10 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psmi_assert(proto->msgflowid < EP_FLOW_LAST); -#if defined(PSM_CUDA) || defined (PSM_ONEAPI) +#ifdef PSM_HAVE_GPU gpu_mem = PSM3_IS_BUFFER_GPU_MEM(ubuf, len); if (gpu_mem) { - PSM3_MARK_BUF_SYNCHRONOUS(ubuf); + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); if (psm3_is_needed_rendezvous(proto, len, flags)) goto do_rendezvous; } @@ -1033,7 +1033,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ipsaddr->msgctl->mq_send_seqnum); ipsaddr->msgctl->mq_send_seqnum++; ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU const void *user_buffer = ubuf; if (!gpu_mem) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, @@ -1049,7 +1049,7 @@ 
psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, * memcpy to move data between HFI resources * and the GPU */ - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len, 0, proto->ep))) { mq_copy_tiny_host_mem((uint32_t *) &scb->ips_lrh.hdr_data, @@ -1091,13 +1091,13 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, ips_scb_copy_tag(scb->ips_lrh.tag, tag->tag); const void * user_buffer = ubuf; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int converted = 0; if (gpu_mem) { // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; /* will use PIO */ - if (len <= gdr_copy_limit_send && + if (len <= psm3_gpu_gdr_copy_limit_send && NULL != (user_buffer = psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)ubuf, len, 0, proto->ep))) { converted = 1; @@ -1131,7 +1131,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, } } } else -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ { #ifdef PSM_HAVE_REG_MR if (len > proto->iovec_thresh_eager_blocking @@ -1209,7 +1209,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, psmi_assert(flow->transfer == PSM_TRANSFER_PIO); /* PIO and now have a bounce buffer */ /* copy to bounce buffer */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!gpu_mem || converted) { // host address ips_shortcpy_host_mem @@ -1245,7 +1245,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, if (err > PSM2_OK_NO_PROGRESS) return err; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->gpu_hostbuf_used = 0; if (gpu_mem) { req->is_buf_gpu_mem = 1; @@ -1269,7 +1269,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, req->is_buf_gpu_mem = 0; #else { -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here // non-priority MR and will fallback if can't register @@ -1319,7 +1319,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, req->flags_user = flags; req->flags_internal |= PSMI_REQ_FLAG_IS_INTERNAL; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { req->is_buf_gpu_mem = 1; proto->strat_stats.rndv_gpu_send++; @@ -1348,7 +1348,7 @@ psm3_ips_proto_mq_send(psm2_mq_t mq, psm2_epaddr_t mepaddr, uint32_t flags, mq->stats.tx_num++; mq->stats.tx_eager_num++; mq->stats.tx_eager_bytes += len; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (gpu_mem) { mq->stats.tx_eager_gpu_num++; mq->stats.tx_eager_gpu_bytes += len; @@ -1379,7 +1379,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) psmi_assert(req->req_data.recv_msglen == req->req_data.send_msglen); req->mq->stats.rx_user_num++; req->mq->stats.rx_user_bytes += req->req_data.recv_msglen; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Cases where we do not use TIDs: * 0) Received full message as payload to RTS, CTS is just an ack * 1) Recv on a host buffer, Send on a gpu buffer and len is <= 3 bytes @@ -1398,7 +1398,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) || ! 
ips_epaddr_rdma_connected((ips_epaddr_t *) epaddr) #endif ) { -#else // PSM_CUDA || PSM_ONEAPI +#else /* PSM_HAVE_GPU */ if (req->recv_msgoff >= req->req_data.recv_msglen || proto->protoexp == NULL /* no expected tid recieve */ #ifdef PSM_HAVE_REG_MR @@ -1406,7 +1406,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #endif || req->req_data.recv_msglen <= proto->mq->rndv_nic_thresh /* less rv theshold */ ) { /* no expected tid recieve */ -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR //do_long_data: #endif @@ -1415,7 +1415,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) /* there is no order requirement, try to push CTS request * directly, if fails, then queue it for later try. */ _HFI_VDBG("pushing CTS recv off %u len %u" -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU " rGPU %u sGPU %u" #endif " rv thresh %u" @@ -1424,7 +1424,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #endif " epaddr %p RDMA %u\n", req->recv_msgoff, req->req_data.recv_msglen, -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU req->is_buf_gpu_mem, req->is_sendbuf_gpu_mem, #endif proto->mq->rndv_nic_thresh, @@ -1435,7 +1435,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) if (req->recv_msgoff < req->req_data.recv_msglen) { // RTS did not have the message as payload -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { proto->strat_stats.rndv_long_gpu_recv++; proto->strat_stats.rndv_long_gpu_recv_bytes += req->req_data.recv_msglen; @@ -1443,7 +1443,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) #endif proto->strat_stats.rndv_long_cpu_recv++; proto->strat_stats.rndv_long_cpu_recv_bytes += req->req_data.recv_msglen; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } #endif } @@ -1483,7 +1483,7 @@ ips_proto_mq_rts_match_callback(psm2_mq_req_t req, int was_posted) // various sized messages which may arrive in the buffer #ifdef PSM_HAVE_REG_MR psmi_assert(req->req_data.send_msglen); // 0 len uses LONG_DATA above -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // for GPU receive buffer we need to sort things out at a lower level // since may use a host bounce buffer for RDMA and need to register it if (! 
req->is_buf_gpu_mem) { @@ -1571,7 +1571,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) uint32_t nbytes_left = req->req_data.send_msglen - req->recv_msgoff; uint32_t nbytes_this, chunk_size; uint32_t frag_size, unaligned_bytes; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int converted = 0; #endif struct ips_flow *flow; @@ -1585,7 +1585,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) frag_size = flow->frag_size; chunk_size = min(proto->ep->chunk_max_segs*frag_size, proto->ep->chunk_max_size); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR // rare, but when RV connection not available, we @@ -1607,7 +1607,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) // for GPU send buffer <= 3, receiver can select // LONG DATA and we can use GDRCopy // must repin per attempt - if (req->req_data.send_msglen <= gdr_copy_limit_send && + if (req->req_data.send_msglen <= psm3_gpu_gdr_copy_limit_send && 0 != (buf = (uintptr_t)psmi_hal_gdr_convert_gpu_to_host_addr( (unsigned long)req->req_data.buf, req->req_data.send_msglen, 0, proto->ep))) { @@ -1620,7 +1620,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) proto->strat_stats.rndv_long_cuCopy_send_bytes += dostats*req->req_data.send_msglen; } } else { -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ #ifdef PSM_HAVE_REG_MR // TBD - no upper bound for send DMA here // non-priority MR and will fallback if can't register @@ -1636,9 +1636,9 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) #endif /* PSM_HAVE_REG_MR */ { proto->strat_stats.rndv_long_copy_cpu_send += dostats; - proto->strat_stats.rndv_long_copy_cpu_send_bytes += (uint64_t)dostats*req->req_data.send_msglen; + proto->strat_stats.rndv_long_copy_cpu_send_bytes += dostats*(uint64_t)req->req_data.send_msglen; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU } #endif @@ -1673,7 +1673,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) /* attached unaligned bytes into packet header */ unaligned_bytes = nbytes_left & 0x3; if (unaligned_bytes) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (!req->is_buf_gpu_mem || converted) mq_copy_tiny_host_mem((uint32_t *)&scb->ips_lrh.mdata, (uint32_t *)buf, unaligned_bytes); @@ -1700,7 +1700,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) ips_scb_flags(scb) |= IPS_SEND_FLAG_SEND_MR; } #endif -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // SDMA identifies GPU buffers itself. 
But PIO path needs flags if (req->is_buf_gpu_mem) { #ifdef PSM_HAVE_REG_MR @@ -1712,7 +1712,7 @@ psm3_ips_proto_mq_push_rts_data(struct ips_proto *proto, psm2_mq_req_t req) // TBD USER_BUF_GPU only useful for RTS ips_scb_flags(scb) |= IPS_SEND_FLAG_USER_BUF_GPU; } -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ scb->frag_size = frag_size; nbytes_this = min(chunk_size, nbytes_left); @@ -1799,7 +1799,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) p_hdr->data[1].u32w0); proto->epaddr_stats.cts_rdma_recv++; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psmi_assert(p_hdr->data[1].u32w1 > min(psm3_gpu_thresh_rndv, mq->rndv_nic_thresh)); // msglen #else psmi_assert(p_hdr->data[1].u32w1 > mq->rndv_nic_thresh); // msglen @@ -1815,7 +1815,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) #ifdef PSM_HAVE_REG_MR if (! req->mr -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU && ! req->gpu_hostbuf_used #endif ) { @@ -1823,7 +1823,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) // or we failed to register memory previously. req->mr = psm3_verbs_reg_mr(proto->mr_cache, 0, req->req_data.buf, req->req_data.send_msglen, IBV_ACCESS_RDMA -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU | (req->is_buf_gpu_mem?IBV_ACCESS_IS_GPU_ADDR:0) #endif ); @@ -1870,7 +1870,7 @@ psm3_ips_proto_mq_handle_cts(struct ips_recvhdrq_event *rcv_ev) // for send DMA if req->mr != NULL. if (req->mr && (!psm3_verbs_user_space_mr(req->mr) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU || (req->is_buf_gpu_mem && req->req_data.send_msglen <= proto->iovec_gpu_thresh_eager) || (!req->is_buf_gpu_mem && req->req_data.send_msglen <= proto->iovec_thresh_eager) #else @@ -2000,7 +2000,7 @@ psm3_ips_proto_mq_handle_rts(struct ips_recvhdrq_event *rcv_ev) if (p_hdr->flags & IPS_SEND_FLAG_BLOCKING) req->type |= MQE_TYPE_WAITING_PEER; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (p_hdr->flags & IPS_SEND_FLAG_USER_BUF_GPU) req->is_sendbuf_gpu_mem = 1; else @@ -2256,7 +2256,7 @@ psm3_ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) */ if (req) { //u32w0 is offset - only cnt recv msgs on 1st pkt in msg -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int use_gdrcopy = 0; if (!req->is_buf_gpu_mem) { if (req->state == MQ_STATE_UNEXP) { @@ -2287,7 +2287,7 @@ psm3_ips_proto_mq_handle_eager(struct ips_recvhdrq_event *rcv_ev) } psm3_mq_handle_data(mq, req, p_hdr->data[1].u32w0, payload, paylen); -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ if (msgorder == IPS_MSG_ORDER_FUTURE_RECV) ret = IPS_RECVHDRQ_BREAK; @@ -2403,10 +2403,10 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) psm2_mq_req_t req; struct ips_flow *flow; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU int use_gdrcopy = 0; struct ips_proto *proto = rcv_ev->proto; -#endif // PSM_CUDA || PSM_ONEAPI +#endif /* PSM_HAVE_GPU */ psmi_copy_tiny_fn_t psmi_copy_tiny_fn = mq_copy_tiny; @@ -2426,7 +2426,7 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) paylen = ips_recvhdrq_event_paylen(rcv_ev); psmi_assert(paylen == 0 || payload); -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU // cpu stats already tracked when sent CTS if (req->is_buf_gpu_mem) { req->req_data.buf = req->user_gpu_buffer; @@ -2470,7 +2470,7 @@ psm3_ips_proto_mq_handle_data(struct ips_recvhdrq_event *rcv_ev) } psm3_mq_handle_data(mq, req, 
p_hdr->data[1].u32w0, payload, paylen -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU , use_gdrcopy, rcv_ev->proto->ep); #else ); diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_params.h b/prov/psm3/psm3/ptl_ips/ips_proto_params.h index f288d6c54a1..fce2435f259 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_params.h +++ b/prov/psm3/psm3/ptl_ips/ips_proto_params.h @@ -141,7 +141,7 @@ #define IPS_SEND_FLAG_PKTCKSUM 0x02 /* Has packet checksum */ #define IPS_SEND_FLAG_AMISTINY 0x04 /* AM is tiny, exclusive */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* This flag is used to indicate to the reciever when * the send is issued on a device buffer. This helps in * selecting TID path on the recieve side regardless of @@ -159,7 +159,7 @@ #define IPS_SEND_FLAG_PERSISTENT 0x0200 #define IPS_SEND_FLAG_NO_LMC 0x0400 -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* This flag is used to indicate if the send is on * a GPU buffer. This helps PIO/SDMA paths to detect * if payload is GPU buffer without having to call @@ -219,7 +219,7 @@ #define IPS_PROTO_FLAG_PPOLICY_STATIC 0x1c00 -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* Use RNDV (TID) for all message sizes */ //#define IPS_PROTO_FLAG_ALWAYS_RNDV 0x10000 // unused /* Use GPUDirect RDMA for SDMA */ @@ -246,6 +246,7 @@ #define IPS_PROTOEXP_FLAG_RDMA_KERNEL 0x01 /* kernel RV module RDMA */ #define IPS_PROTOEXP_FLAG_RDMA_USER 0x02 /* user RC QP for RDMA only */ #define IPS_PROTOEXP_FLAG_RDMA_USER_RC 0x03 /* user RC QP eager & RDMA */ +#define IPS_PROTOEXP_FLAG_RDMA_QP(flag) ((flag)&IPS_PROTOEXP_FLAG_RDMA_MASK) /* QP RDMA mode */ #define IPS_PROTOEXP_FLAG_USER_RC_QP(flag) ((flag)&0x02) /* either RC QP mode */ #define IPS_PROTOEXP_FLAG_KERNEL_QP(flag) \ (((flag)&IPS_PROTOEXP_FLAG_RDMA_MASK) == IPS_PROTOEXP_FLAG_RDMA_KERNEL) diff --git a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c index 2fbc0a0773b..4aa0fc476fa 100644 --- a/prov/psm3/psm3/ptl_ips/ips_proto_recv.c +++ b/prov/psm3/psm3/ptl_ips/ips_proto_recv.c @@ -315,6 +315,9 @@ psm3_ips_proto_process_ack(struct ips_recvhdrq_event *rcv_ev) ips_scb_t *scb; ack_seq_num.psn_num = p_hdr->ack_seq_num; +#ifdef USE_RC + ipsaddr->verbs.remote_recv_psn = ack_seq_num.psn_num; +#endif // check actual psn acked (ack_seq_num-1), we only want to process acks // for packets we never got an ack for if ((flowid = ips_proto_flowid(p_hdr)) < EP_NUM_FLOW_ENTRIES) { diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.c b/prov/psm3/psm3/ptl_ips/ips_scb.c index 05aead8cc33..a3149c2455a 100644 --- a/prov/psm3/psm3/ptl_ips/ips_scb.c +++ b/prov/psm3/psm3/ptl_ips/ips_scb.c @@ -276,7 +276,7 @@ ips_scb_t *MOCKABLE(psm3_ips_scbctrl_alloc)(struct ips_scbctrl *scbc, int scbnum scb->nfrag = 1; scb->frag_size = 0; scb->chunk_size = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU scb->mq_req = NULL; #endif #ifdef PSM_HAVE_REG_MR @@ -346,7 +346,7 @@ ips_scb_t *MOCKABLE(psm3_ips_scbctrl_alloc_tiny)(struct ips_scbctrl *scbc) scb->nfrag = 1; scb->frag_size = 0; scb->chunk_size = 0; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU scb->mq_req = NULL; #endif #ifdef PSM_HAVE_REG_MR diff --git a/prov/psm3/psm3/ptl_ips/ips_scb.h b/prov/psm3/psm3/ptl_ips/ips_scb.h index 97670116fdf..6345830b632 100644 --- a/prov/psm3/psm3/ptl_ips/ips_scb.h +++ b/prov/psm3/psm3/ptl_ips/ips_scb.h @@ -185,16 +185,16 @@ struct ips_scb { psm2_am_completion_fn_t completion_am; }; void *cb_param; -#if defined(PSM_CUDA) || 
defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU psm2_mq_req_t mq_req; /* back pointer to original request */ -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ struct { struct ips_message_header ips_lrh; } PSMI_CACHEALIGN; }; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU #define IS_TRANSFER_BUF_GPU_MEM(scb) (ips_scb_flags(scb) & IPS_SEND_FLAG_PAYLOAD_BUF_GPU) #endif diff --git a/prov/psm3/psm3/ptl_ips/ptl.c b/prov/psm3/psm3/ptl_ips/ptl.c index 9878713a37c..3f416231783 100644 --- a/prov/psm3/psm3/ptl_ips/ptl.c +++ b/prov/psm3/psm3/ptl_ips/ptl.c @@ -560,14 +560,15 @@ psm3_ips_ptl_disconnect(ptl_t *ptl_gen, int force, int numep, } /* Only symbol we expose out of here */ -struct ptl_ctl_init -psm3_ptl_ips = { - ips_ptl_sizeof, ips_ptl_init, ips_ptl_fini, ips_ptl_setopt, - ips_ptl_getopt +struct ptl_ctl_init psm3_ptl_ips = { + .sizeof_ptl = ips_ptl_sizeof, + .init = ips_ptl_init, + .fini = ips_ptl_fini, + .setopt = ips_ptl_setopt, + .getopt = ips_ptl_getopt, }; -struct ptl_ctl_rcvthread -psm3_ptl_ips_rcvthread = { - ips_ptl_rcvthread_is_enabled, - psm3_ips_ptl_rcvthread_transfer_ownership, +struct ptl_ctl_rcvthread psm3_ptl_ips_rcvthread = { + .is_enabled = ips_ptl_rcvthread_is_enabled, + .transfer_ownership = psm3_ips_ptl_rcvthread_transfer_ownership, }; diff --git a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c index 562721a0b37..cac70401242 100644 --- a/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c +++ b/prov/psm3/psm3/ptl_ips/ptl_rcvthread.c @@ -97,14 +97,6 @@ struct ptl_rcvthread { pthread_t psm3_rcv_threadid; #endif -#ifdef PSM_CUDA -/* This is a global cuda context (extern declaration in psm_user.h) - * stored to provide hints during a cuda failure - * due to a null cuda context. - */ -CUcontext cu_ctxt; -#endif - // for psm3_wait and psm3_wake static pthread_mutex_t wait_mutex; static pthread_cond_t wait_condvar; @@ -144,15 +136,16 @@ psm2_error_t psm3_ips_ptl_rcvthread_init(ptl_t *ptl_gen, struct ips_recvhdrq *re rcvc->ptl = ptl_gen; rcvc->t_start_cyc = get_cycles(); -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED) - PSMI_CUDA_CALL(cuCtxGetCurrent, &cu_ctxt); -#endif + PSM3_GPU_FETCH_CTXT(); if (psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RTS_RX_THREAD) && (!psmi_hal_has_sw_status(PSM_HAL_PSMI_RUNTIME_RX_THREAD_STARTED))){ - pthread_cond_init(&wait_condvar, NULL); + pthread_condattr_t attr; + pthread_condattr_init(&attr); + pthread_condattr_setclock(&attr, CLOCK_MONOTONIC); + pthread_cond_init(&wait_condvar, &attr); + pthread_mutex_init(&wait_mutex, NULL); wait_signalled = 0; @@ -375,12 +368,12 @@ psm2_error_t rcvthread_initsched(struct ptl_rcvthread *rcvc) // means migrating to a singleton model and properly fixing/removing the // transfer_ownership of rcvc when EPs are destroyed so can ensure // poll_type properly maintained on all affected EPs. -psm2_error_t psm3_wait(int timeout) +psm2_error_t psm3_wait(int timeout_ms) { psm2_ep_t ep; psm2_error_t ret = PSM2_OK; - _HFI_VDBG("Wait for event. timeout=%d\n", timeout); + _HFI_VDBG("Wait for event. timeout=%d ms\n", timeout_ms); // TBD - while psm3_wait is active, we would like a quick poll() timeout // because it is our only checking for PSM protocol timeouts. However // poll() has probably already started, so too late to change it now. 
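The hunk above binds wait_condvar to CLOCK_MONOTONIC via a condattr, and the hunk below converts the relative timeout_ms into an absolute deadline on that same clock (with nsec-to-sec carry) before calling pthread_cond_timedwait(). A minimal standalone sketch of that timed-wait pattern follows; it assumes a non-negative timeout_ms (the infinite-timeout case uses pthread_cond_wait() instead), and the demo_* names and DEMO_* constants are invented for illustration only and are not part of PSM3.

#include <pthread.h>
#include <time.h>

#define DEMO_MSEC_PER_SEC  1000
#define DEMO_NSEC_PER_MSEC 1000000L
#define DEMO_NSEC_PER_SEC  1000000000L

static pthread_mutex_t demo_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t demo_cond;
static int demo_signalled;

static void demo_wait_init(void)
{
        pthread_condattr_t attr;

        pthread_condattr_init(&attr);
        /* condvar and the deadline computed below must share one clock */
        pthread_condattr_setclock(&attr, CLOCK_MONOTONIC);
        pthread_cond_init(&demo_cond, &attr);
        pthread_condattr_destroy(&attr);
}

/* returns 0 if woken by demo_wake(), ETIMEDOUT if timeout_ms expired */
static int demo_wait(int timeout_ms)
{
        struct timespec deadline;       /* absolute timestamp */
        int rc = 0;

        clock_gettime(CLOCK_MONOTONIC, &deadline);      /* current time */
        deadline.tv_sec += timeout_ms / DEMO_MSEC_PER_SEC;
        deadline.tv_nsec += (timeout_ms % DEMO_MSEC_PER_SEC) * DEMO_NSEC_PER_MSEC;
        if (deadline.tv_nsec >= DEMO_NSEC_PER_SEC) {    /* carry nsec into sec */
                deadline.tv_sec++;
                deadline.tv_nsec -= DEMO_NSEC_PER_SEC;
        }

        pthread_mutex_lock(&demo_mutex);
        while (!demo_signalled && rc == 0)
                rc = pthread_cond_timedwait(&demo_cond, &demo_mutex, &deadline);
        if (demo_signalled) {           /* consume the wakeup */
                demo_signalled = 0;
                rc = 0;
        }
        pthread_mutex_unlock(&demo_mutex);
        return rc;
}

static void demo_wake(void)
{
        pthread_mutex_lock(&demo_mutex);
        demo_signalled = 1;
        pthread_cond_signal(&demo_cond);
        pthread_mutex_unlock(&demo_mutex);
}
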
@@ -460,20 +453,20 @@ psm2_error_t psm3_wait(int timeout) wait_signalled = 0; wait_nosleep_signalled_count++; _HFI_VDBG("found already signaled, no sleep\n"); - } else if (timeout < 0) { // infinite timeout + } else if (timeout_ms < 0) { // infinite timeout // Wait for condition variable to be signaled or broadcast. pthread_cond_wait(&wait_condvar, &wait_mutex); wait_signalled = 0; wait_sleep_til_signal_count++; _HFI_VDBG("slept, infinite timeout\n"); } else { - struct timespec wait_time; + struct timespec wait_time; // absolute timestamp clock_gettime(CLOCK_MONOTONIC, &wait_time); // current time - wait_time.tv_sec += timeout / 1000; - wait_time.tv_nsec += (timeout % 1000) * 1000; - if (wait_time.tv_nsec > 1000000000) { // handle carry from nsec to sec - wait_time.tv_sec++; - wait_time.tv_nsec -= 1000000000; + wait_time.tv_sec += timeout_ms / MSEC_PER_SEC; + wait_time.tv_nsec += (timeout_ms % MSEC_PER_SEC) * NSEC_PER_MSEC; + if (wait_time.tv_nsec >= NSEC_PER_SEC) { // handle carry from nsec to sec + wait_time.tv_sec++; + wait_time.tv_nsec -= NSEC_PER_SEC; } if (0 > pthread_cond_timedwait(&wait_condvar, &wait_mutex, &wait_time)) { _HFI_VDBG("slept, timeout\n"); @@ -486,7 +479,7 @@ psm2_error_t psm3_wait(int timeout) wait_sleep_signal_count++; } } - pthread_mutex_unlock( &wait_mutex ); + pthread_mutex_unlock(&wait_mutex); // TBD if ret == PSM2_OK we could use ipeek to see if any real progress // was made and loop back to start to wait again if not. For now we // leave that to our caller @@ -564,10 +557,7 @@ void *ips_ptl_pollintr(void *rcvthreadc) int next_timeout = rcvc->last_timeout; psm2_error_t err; -#ifdef PSM_CUDA - if (PSMI_IS_GPU_ENABLED && cu_ctxt != NULL) - PSMI_CUDA_CALL(cuCtxSetCurrent, cu_ctxt); -#endif + PSM3_GPU_REFRESH_CTXT(); PSM2_LOG_MSG("entering"); /* No reason to have many of these, keep this as a backup in case the diff --git a/prov/psm3/psm3/ptl_self/ptl.c b/prov/psm3/psm3/ptl_self/ptl.c index 19231015d9b..31eeb1a85ae 100644 --- a/prov/psm3/psm3/ptl_self/ptl.c +++ b/prov/psm3/psm3/ptl_self/ptl.c @@ -158,25 +158,16 @@ self_mq_isend(psm2_mq_t mq, psm2_epaddr_t epaddr, uint32_t flags_user, if_pf(send_req == NULL) return PSM2_NO_MEMORY; -#ifdef PSM_CUDA +#ifdef PSM_HAVE_GPU // we technically don't need to set is_buf_gpu_mem because psm3_mq_mtucpy // will be used to copy the data to the destination or a sysbuf and it will // check if the buffer is GPU memory. 
But we do need the sync_memops() - if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(ubuf)) { - psmi_cuda_set_attr_sync_memops(ubuf); + if (len && PSM3_IS_GPU_MEM(ubuf)) { + PSM3_GPU_MARK_BUF_SYNCHRONOUS(ubuf); send_req->is_buf_gpu_mem = 1; } else send_req->is_buf_gpu_mem = 0; #endif -#ifdef PSM_ONEAPI - // we don't need to set is_buf_gpu_mem because psm3_mq_mtucpy will be - // used to copy the data to the destination or a sysbuf and it will - // check if the buffer is a GPU memory - //if (len && PSMI_IS_GPU_ENABLED && PSMI_IS_GPU_MEM(ubuf)) { - // send_req->is_buf_gpu_mem = 1; - //} else - // send_req->is_buf_gpu_mem = 0; -#endif mq->stats.tx_num++; mq->stats.tx_rndv_num++; @@ -441,8 +432,10 @@ self_ptl_getopt(const void *component_obj, int optname, } /* Only symbol we expose out of here */ -struct ptl_ctl_init -psm3_ptl_self = { - self_ptl_sizeof, self_ptl_init, self_ptl_fini, self_ptl_setopt, - self_ptl_getopt +struct ptl_ctl_init psm3_ptl_self = { + .sizeof_ptl = self_ptl_sizeof, + .init = self_ptl_init, + .fini = self_ptl_fini, + .setopt = self_ptl_setopt, + .getopt = self_ptl_getopt, }; diff --git a/prov/psm3/psm3/utils/utils_debug.c b/prov/psm3/psm3/utils/utils_debug.c index e218f3bd12f..97fbd9de585 100644 --- a/prov/psm3/psm3/utils/utils_debug.c +++ b/prov/psm3/psm3/utils/utils_debug.c @@ -514,7 +514,7 @@ void psm3_dump_buf(uint8_t *buf, uint32_t len) } } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU void psm3_dump_gpu_buf(uint8_t *buf, uint32_t len) { int i, j, print_len; diff --git a/prov/psm3/psm3/utils/utils_dsa.c b/prov/psm3/psm3/utils/utils_dsa.c index a990babb208..9dfe3368200 100644 --- a/prov/psm3/psm3/utils/utils_dsa.c +++ b/prov/psm3/psm3/utils/utils_dsa.c @@ -293,8 +293,8 @@ void psm3_dsa_memcpy(void *dest, const void *src, uint32_t n, int rx, int copied_chunks = 0; uint32_t dsa_cp_len; -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (n && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(dest) || PSMI_IS_GPU_MEM((void *) src))) { +#ifdef PSM_HAVE_GPU + if (n && (PSM3_IS_GPU_MEM(dest) || PSM3_IS_GPU_MEM((void *) src))) { _HFI_VDBG("GPU copy from %p to %p for %u\n", src, dest, n); PSM3_GPU_MEMCPY(dest, src, n); return; diff --git a/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c b/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c index 6929bc200a6..818e6d9ea56 100644 --- a/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c +++ b/prov/psm3/psm3/utils/utils_dwordcpy-x86_64.c @@ -169,8 +169,8 @@ void psm3_qwordcpy(volatile uint64_t *dest, const uint64_t *src, uint32_t nqword void MOCKABLE(psm3_mq_mtucpy)(void *vdest, const void *vsrc, uint32_t nchars) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) - if (nchars && PSMI_IS_GPU_ENABLED && (PSMI_IS_GPU_MEM(vdest) || PSMI_IS_GPU_MEM((void *) vsrc))) { +#ifdef PSM_HAVE_GPU + if (nchars && (PSM3_IS_GPU_MEM(vdest) || PSM3_IS_GPU_MEM((void *) vsrc))) { PSM3_GPU_MEMCPY(vdest, vsrc, nchars); return; } diff --git a/prov/psm3/src/psm3_revision.c.in b/prov/psm3/src/psm3_revision.c.in index 6c02d64ec9e..936e7474a3d 100644 --- a/prov/psm3/src/psm3_revision.c.in +++ b/prov/psm3/src/psm3_revision.c.in @@ -17,10 +17,6 @@ #define PSMX3_GIT_CHECKSUM "@PSM3_GIT_HASH@" #endif -#ifndef PSM3_MARCH -#define PSM3_MARCH "@PSM3_MARCH@" -#endif - char psm3_IEFS_version[] = PSMX3_IEFS_VERSION; char psm3_build_timestamp[] = PSMX3_BUILD_TIMESTAMP; char psm3_sources_checksum[] = PSMX3_SRC_CHECKSUM; diff --git a/prov/psm3/src/psmx3.h b/prov/psm3/src/psmx3.h index 6edfa308338..7a126099b71 100644 --- a/prov/psm3/src/psmx3.h +++ b/prov/psm3/src/psmx3.h @@ 
-853,6 +853,7 @@ struct psmx3_env { char *tag_layout; #endif int yield_mode; + int wait_enable; }; #define PSMX3_MAX_UNITS PSMI_MAX_RAILS /* from psm_config.h */ diff --git a/prov/psm3/src/psmx3_atomic.c b/prov/psm3/src/psmx3_atomic.c index b05416c5e07..b518f2a81f5 100644 --- a/prov/psm3/src/psmx3_atomic.c +++ b/prov/psm3/src/psmx3_atomic.c @@ -182,7 +182,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, #define PSMX3_BXOR(dst,src) (dst) ^= (src) #define PSMX3_COPY(dst,src) (dst) = (src) -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* res is always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ do { \ @@ -195,7 +195,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, psm3_memcpy(r, d, sizeof(TYPE)*cnt); \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_READ(dst,res,cnt,TYPE) \ do { \ int i; \ @@ -206,9 +206,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, r[i] = d[i]; \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src is always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ do { \ @@ -228,7 +228,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) \ do { \ int i; \ @@ -239,9 +239,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, OP(d[i],s[i]); \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src is always CPU address, dst could be GPU address */ // optimized to avoid unnecessary read and compare, OP==PSMX3_COPY and not used #define PSMX3_ATOMIC_WRITE_COPY(dst,src,cnt,OP,TYPE) \ @@ -255,12 +255,12 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, psm3_memcpy(d, s, sizeof(TYPE)*cnt); \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_WRITE_COPY(dst,src,cnt,OP,TYPE) \ PSMX3_ATOMIC_WRITE(dst,src,cnt,OP,TYPE) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src, res are always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_READWRITE(dst,src,res,cnt,OP,TYPE) \ do { \ @@ -281,7 +281,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_READWRITE(dst,src,res,cnt,OP,TYPE) \ do { \ int i; \ @@ -295,9 +295,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src, cmp, res are always CPU address, dst 
could be GPU address */ #define PSMX3_ATOMIC_CSWAP(dst,src,cmp,res,cnt,CMP_OP,TYPE) \ do { \ @@ -320,7 +320,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_CSWAP(dst,src,cmp,res,cnt,CMP_OP,TYPE) \ do { \ int i; \ @@ -336,9 +336,9 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* src, cmp, res are always CPU address, dst could be GPU address */ #define PSMX3_ATOMIC_MSWAP(dst,src,cmp,res,cnt,TYPE) \ do { \ @@ -359,7 +359,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#else /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#else /* PSM_HAVE_GPU */ #define PSMX3_ATOMIC_MSWAP(dst,src,cmp,res,cnt,TYPE) \ do { \ int i; \ @@ -374,7 +374,7 @@ static inline size_t psmx3_ioc_size(const struct fi_ioc *ioc, size_t count, } \ psmx3_unlock(&psmx3_atomic_lock, 1); \ } while (0) -#endif /* defined(PSM_CUDA) || defined(PSM_ONEAPI) */ +#endif /* PSM_HAVE_GPU */ static int psmx3_atomic_do_write(void *dest, void *src, int datatype, int op, int count) diff --git a/prov/psm3/src/psmx3_attr.c b/prov/psm3/src/psmx3_attr.c index 402253fba59..34f9b671071 100644 --- a/prov/psm3/src/psmx3_attr.c +++ b/prov/psm3/src/psmx3_attr.c @@ -263,7 +263,7 @@ static struct fi_info *psmx3_dupinfo(const struct fi_info *info) #endif /* HAVE_PSM3_DL */ static uint64_t psmx3_check_fi_hmem_cap(void) { -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU /* if parses as empty or invalid use default of 0 */ /* psm3 below us will provide warning as needed when it parses it */ int gpu = 0; @@ -278,7 +278,7 @@ static uint64_t psmx3_check_fi_hmem_cap(void) { 0, UINT_MAX); if ((gpu || gpudirect) && !ofi_hmem_p2p_disabled()) return FI_HMEM; -#endif /* PSM_CUDA || PSM_ONEAPI */ +#endif /* PSM_HAVE_GPU */ return 0; } @@ -319,28 +319,30 @@ static uint64_t get_max_inject_size(void) { thresh_rv = temp; } -#if defined(PSM_CUDA) || defined(PSM_ONEAPI) +#ifdef PSM_HAVE_GPU if (psmx3_prov_info.caps & FI_HMEM) { if (have_nic) { // GPU ips rendezvous threshold - // sockets HAL avoids rendezvous, so this may be overly restrictive - temp = PSM3_GPU_THRESH_RNDV; - // PSM3_CUDA_THRESH_RNDV depricated, use PSM3_GPU_THRESH_RNDV if set - psm3_parse_str_uint(psm3_env_get("PSM3_CUDA_THRESH_RNDV"), &temp, - 0, UINT_MAX); - psm3_parse_str_uint(psm3_env_get("PSM3_GPU_THRESH_RNDV"), &temp, - 0, UINT_MAX); - if (thresh_rv > temp) - thresh_rv = temp; + uint32_t out; + if (psm3_info_query(PSM2_INFO_QUERY_GPU_THRESH_RNDV, &out, 0, NULL)) { + PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, + "Unable to get PSM3_GPU_THRESH_RNDV.\n"); + } else if (thresh_rv > out) { + thresh_rv = out; + } } if (have_shm) { // GPU shm rendezvous threshold - temp = PSM3_MQ_RNDV_SHM_GPU_THRESH; - psm3_parse_str_uint(psm3_env_get("PSM3_MQ_RNDV_SHM_GPU_THRESH"), &temp, - 0, UINT_MAX); - if (thresh_rv > temp) - thresh_rv = temp; + // we only have default, real value may be overriden at MQ init + // when open PSM3 endpoint + uint32_t out; + if (psm3_info_query(PSM2_INFO_QUERY_MQ_RNDV_SHM_GPU_THRESH_DEFAULT, &out, 0, NULL)) { + PSMX3_WARN(&psmx3_prov, FI_LOG_CORE, + "Unable to get 
PSM3_MQ_RNDV_SHM_GPU_THRESH default.\n"); + } else if (thresh_rv > out) { + thresh_rv = out; + } } } #endif diff --git a/prov/psm3/src/psmx3_ep.c b/prov/psm3/src/psmx3_ep.c index 62042023c65..47400d93298 100644 --- a/prov/psm3/src/psmx3_ep.c +++ b/prov/psm3/src/psmx3_ep.c @@ -147,6 +147,46 @@ STATIC ssize_t psmx3_ep_cancel(fid_t fid, void *context) return psmx3_errno(err); } +STATIC int psmx3_ep_getopt_cuda_api_permitted( + struct psmx3_fid_ep *ep, bool *value) +{ + // invariant: if both rx and tx are set, then they are expected to be + // the same internal PSM endpoint + assert(!ep->tx || !ep->rx || ep->tx->psm2_ep == ep->rx->psm2_ep); + + uint64_t size = (uint64_t)sizeof(*value); + + psm2_error_t err = psm3_getopt( + PSM2_COMPONENT_CORE, + ep->tx ? ep->tx->psm2_ep : ep->rx->psm2_ep, + PSM2_CORE_OPT_EP_CUDA_PERMITTED, + value, + &size); + if (err) + return -FI_EINVAL; + + return 0; +} + +STATIC int psmx3_ep_setopt_cuda_api_permitted( + struct psmx3_fid_ep *ep, const bool *value) +{ + // invariant: if both rx and tx are set, then they are expected to be + // the same internal PSM endpoint + assert(!ep->tx || !ep->rx || ep->tx->psm2_ep == ep->rx->psm2_ep); + + psm2_error_t err = psm3_setopt( + PSM2_COMPONENT_CORE, + ep->tx ? ep->tx->psm2_ep : ep->rx->psm2_ep, + PSM2_CORE_OPT_EP_CUDA_PERMITTED, + value, + sizeof(*value)); + if (err) + return -FI_EINVAL; + + return 0; +} + DIRECT_FN STATIC int psmx3_ep_getopt(fid_t fid, int level, int optname, void *optval, size_t *optlen) @@ -164,6 +204,11 @@ STATIC int psmx3_ep_getopt(fid_t fid, int level, int optname, *optlen = sizeof(size_t); break; + case FI_OPT_CUDA_API_PERMITTED: + if (!optlen || *optlen != sizeof(bool)) + return -FI_EINVAL; + return psmx3_ep_getopt_cuda_api_permitted(ep, (bool *)optval); + default: return -FI_ENOPROTOOPT; } @@ -187,6 +232,11 @@ STATIC int psmx3_ep_setopt(fid_t fid, int level, int optname, ep->min_multi_recv = *(size_t *)optval; break; + case FI_OPT_CUDA_API_PERMITTED: + if (optlen != sizeof(bool)) + return -FI_EINVAL; + return psmx3_ep_setopt_cuda_api_permitted(ep, (const bool *)optval); + default: return -FI_ENOPROTOOPT; } diff --git a/prov/psm3/src/psmx3_init.c b/prov/psm3/src/psmx3_init.c index 29359d3ea34..3408cf2ec41 100644 --- a/prov/psm3/src/psmx3_init.c +++ b/prov/psm3/src/psmx3_init.c @@ -45,7 +45,7 @@ static const char* FI_PSM3_NAME_SERVER_HELP = "Whether to turn on the name server or not (default: yes)"; static const char* FI_PSM3_TAGGED_RMA_HELP = "Whether to use tagged messages for large size RMA or not " \ - "(default: yes)"; + "(default: no)"; static const char* FI_PSM3_UUID_HELP = "Unique Job ID required by the fabric"; static const char* FI_PSM3_DELAY_HELP = @@ -81,6 +81,8 @@ static const char* FI_PSM3_TAG_LAYOUT_HELP = #endif static const char* FI_PSM3_YIELD_MODE_HELP = "Enabled interrupt driven operation with fi_wait. (default: no)."; +static const char* FI_PSM3_WAIT_ENABLE_HELP = + "Enabled use of wait semantics outside of yield mode. 
(default: no)."; #define FI_PSM3_PREFIX "FI_PSM3_" #define FI_PSM3_PREFIX_LEN strlen(FI_PSM3_PREFIX) @@ -132,7 +134,7 @@ int psmx3_param_get_str(struct fi_provider *provider, const char *env_var_name, struct psmx3_env psmx3_env = { .name_server = 1, - .tagged_rma = 1, + .tagged_rma = 0, .uuid = PSMX3_DEFAULT_UUID, .uuid_override = 0, .delay = 0, @@ -149,6 +151,7 @@ struct psmx3_env psmx3_env = { .tag_layout = "auto", #endif .yield_mode = 0, + .wait_enable = 0, }; #if (PSMX3_TAG_LAYOUT == PSMX3_TAG_LAYOUT_RUNTIME) @@ -253,6 +256,8 @@ static void psmx3_init_env(void) //fi_param_get_bool(&psmx3_prov, "yield_mode", &psmx3_env.yield_mode); psmx3_param_get_bool(&psmx3_prov, "FI_PSM3_YIELD_MODE", FI_PSM3_YIELD_MODE_HELP, 0, &psmx3_env.yield_mode); + psmx3_param_get_bool(&psmx3_prov, "FI_PSM3_WAIT_ENABLE", + FI_PSM3_WAIT_ENABLE_HELP, 0, &psmx3_env.wait_enable); } void psmx3_init_tag_layout(struct fi_info *info) @@ -680,18 +685,6 @@ static int psmx3_getinfo(uint32_t api_version, const char *node, PSMX3_INFO(&psmx3_prov, FI_LOG_CORE,"\n"); - __builtin_cpu_init(); - if (!__builtin_cpu_supports(PSM3_MARCH)) { - PSMX3_INFO(&psmx3_prov, FI_LOG_CORE, - "CPU does not support '%s'.\n", PSM3_MARCH); - OFI_INFO_STR(&psmx3_prov, - (__builtin_cpu_supports("avx2") ? "AVX2" : - (__builtin_cpu_supports("avx") ? "AVX" : - (__builtin_cpu_supports("sse4.2") ? "SSE4.2" : "unknown"))), - PSM3_MARCH, "CPU Supports", "PSM3 Built With"); - goto err_out; - } - if (psmx3_init_prov_info(hints, &prov_info)) goto err_out; @@ -946,6 +939,8 @@ PROVIDER_INI #endif fi_param_define(&psmx3_prov, "yield_mode", FI_PARAM_BOOL, FI_PSM3_YIELD_MODE_HELP); + fi_param_define(&psmx3_prov, "wait_enable", FI_PARAM_BOOL, + FI_PSM3_WAIT_ENABLE_HELP); psmx3_init_env(); diff --git a/prov/psm3/src/psmx3_rma.c b/prov/psm3/src/psmx3_rma.c index e76491c9878..f8c88f628f2 100644 --- a/prov/psm3/src/psmx3_rma.c +++ b/prov/psm3/src/psmx3_rma.c @@ -64,31 +64,33 @@ static inline void psmx3_iov_copy(struct iovec *iov, size_t count, /* RMA protocol: * * Write REQ: - * args[0].u32w0 cmd, flag - * args[0].u32w1 len - * args[1].u64 req - * args[2].u64 addr - * args[3].u64 key - * args[4].u64 data (optional) + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : req len + * args[1].u64 : req + * args[2].u64 : target base address + * args[3].u64 : target mr key + * args[4].u64 : cq data (optional) + * args[5].u32w0 : target base offset (optional; unused for long protocol) + * args[5].u32w1 : reserved * * Write REP: - * args[0].u32w0 cmd, flag - * args[0].u32w1 error - * args[1].u64 req + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : error + * args[1].u64 : req * * Read REQ: - * args[0].u32w0 cmd, flag - * args[0].u32w1 len - * args[1].u64 req - * args[2].u64 addr - * args[3].u64 key - * args[4].u64 offset / unused for long protocol + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : len + * args[1].u64 : req + * args[2].u64 : addr + * args[3].u64 : key + * args[4].u64 : offset / unused for long protocol * * Read REP: - * args[0].u32w0 cmd, flag - * args[0].u32w1 error - * args[1].u64 req - * args[2].u64 offset + * args[0].u32w0 : cmd, flag + * args[0].u32w1 : error + * args[1].u64 : req + * args[2].u64 : offset */ int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, @@ -98,6 +100,8 @@ int psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, psm2_amarg_t rep_args[8]; uint8_t *rma_addr; ssize_t rma_len; + size_t rma_offset; + uint32_t cq_data; uint64_t key; int err = 0; int op_error = 0; @@ -123,23 +127,23 @@ int 
psmx3_am_rma_handler(psm2_am_token_t token, psm2_amarg_t *args, rma_len = args[0].u32w1; rma_addr = (uint8_t *)(uintptr_t)args[2].u64; key = args[3].u64; + cq_data = args[4].u64; + rma_offset = args[5].u32w0; mr = psmx3_mr_get(rx->domain, key); op_error = mr ? - psmx3_mr_validate(mr, (uint64_t)rma_addr, len, FI_REMOTE_WRITE) : + psmx3_mr_validate(mr, (uint64_t)rma_addr + rma_offset, len, FI_REMOTE_WRITE) : -FI_EINVAL; if (!op_error) { - rma_addr += mr->offset; - psm3_memcpy(rma_addr, src, len); + psm3_memcpy(rma_addr + mr->offset + rma_offset, src, len); if (eom) { if (rx->ep->recv_cq && has_data) { - /* TODO: report the addr/len of the whole write */ event = psmx3_cq_create_event( rx->ep->recv_cq, 0, /* context */ rma_addr, FI_REMOTE_WRITE | FI_RMA | FI_REMOTE_CQ_DATA, - rma_len, - args[4].u64, + rma_offset + rma_len, + cq_data, 0, /* tag */ 0, /* olen */ 0); @@ -409,6 +413,7 @@ static ssize_t psmx3_rma_self(int am_cmd, struct psmx3_fid_cntr *cntr = NULL; struct psmx3_fid_cntr *mr_cntr = NULL; struct psmx3_fid_cq *cq = NULL; + psm2_ep_t psm_ep = ep->tx ? ep->tx->psm2_ep : ep->rx->psm2_ep; int no_event; int err = 0; int op_error = 0; @@ -458,7 +463,7 @@ static ssize_t psmx3_rma_self(int am_cmd, cq = ep->recv_cq; if (mr->cntr != cntr) mr_cntr = mr->cntr; - psm3_memcpy((void *)addr, buf, len); + psm3_ep_memcpy(psm_ep, (void *)addr, buf, len); break; case PSMX3_AM_REQ_WRITEV: @@ -470,14 +475,14 @@ static ssize_t psmx3_rma_self(int am_cmd, dst = (void *)addr; for (i=0; iremote_read_cntr; - psm3_memcpy(buf, (void *)addr, len); + psm3_ep_memcpy(psm_ep, buf, (void *)addr, len); break; case PSMX3_AM_REQ_READV: @@ -485,7 +490,7 @@ static ssize_t psmx3_rma_self(int am_cmd, src = (void *)addr; for (i=0; i chunk_size) { args[0].u32w1 = chunk_size; args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = offset; err = psm3_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, args, nargs, (void *)buf, chunk_size, am_flags, NULL, NULL); @@ -1068,8 +1076,8 @@ ssize_t psmx3_write_generic(struct fid_ep *ep, const void *buf, size_t len, } psmx3_am_poll(ep_priv->tx); buf = (const uint8_t *)buf + chunk_size; - addr += chunk_size; len -= chunk_size; + offset += chunk_size; req_refcnt++; } @@ -1077,10 +1085,10 @@ ssize_t psmx3_write_generic(struct fid_ep *ep, const void *buf, size_t len, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = offset; if (flags & FI_REMOTE_CQ_DATA) { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); args[4].u64 = data; - nargs++; } else { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); } @@ -1184,19 +1192,20 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, PSMX3_CTXT_USER(&req->fi_context) = context; PSMX3_CTXT_EP(&req->fi_context) = ep_priv; + nargs = 6; args[0].u32w0 = 0; PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); args[0].u32w1 = len; args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; - nargs = 4; + args[5].u32w0 = 0; if (flags & FI_REMOTE_CQ_DATA) { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); args[4].u64 = data; - nargs++; } else { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); + args[4].u64 = 0; } err = psm3_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, args, nargs, (void *)buf, len, @@ -1285,7 +1294,8 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, /* Case 2.2: use short protocol all other segments */ PSMX3_AM_SET_OP(args[0].u32w0, PSMX3_AM_REQ_WRITE); - nargs = 4; 
+ nargs = 6; + args[4].u64 = 0; /* cq_data always zero when !EOM */ buf = iov[i].iov_base; len = iov[i].iov_len; while (len > chunk_size) { @@ -1293,6 +1303,7 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = len_sent; err = psm3_am_request_short(psm2_epaddr, PSMX3_AM_RMA_HANDLER, args, nargs, (void *)buf, chunk_size, am_flags, @@ -1304,7 +1315,6 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, } psmx3_am_poll(ep_priv->tx); buf += chunk_size; - addr += chunk_size; len -= chunk_size; len_sent += chunk_size; req_refcnt++; @@ -1314,11 +1324,11 @@ ssize_t psmx3_writev_generic(struct fid_ep *ep, const struct iovec *iov, args[1].u64 = (uint64_t)(uintptr_t)req; args[2].u64 = addr; args[3].u64 = key; + args[5].u32w0 = len_sent; if (len_sent + len == total_len) { if (flags & FI_REMOTE_CQ_DATA) { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_DATA | PSMX3_AM_EOM); args[4].u64 = data; - nargs++; } else { PSMX3_AM_SET_FLAG(args[0].u32w0, PSMX3_AM_EOM); } diff --git a/prov/psm3/src/psmx3_wait.c b/prov/psm3/src/psmx3_wait.c index 7f798a60c66..3ffc118e663 100644 --- a/prov/psm3/src/psmx3_wait.c +++ b/prov/psm3/src/psmx3_wait.c @@ -195,7 +195,7 @@ STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) struct util_wait *wait_priv; struct psmx3_fid_fabric *fabric; int err; - + wait_priv = container_of(wait, struct util_wait, wait_fid); fabric = container_of(wait_priv->fabric, struct psmx3_fid_fabric, util_fabric); @@ -213,8 +213,6 @@ STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) // don't seem extensible and they have no way to determine if they are empty // so we depend on FI_PSM3_YIELD_MODE=1 to disable normal waitset handling // and allow this simplified use to meet Intel MPI needs. - //if (wait_priv->pollset is empty && wait_priv->wait_obj == FI_WAIT_YIELD) - // psm3_wait(); if (psmx3_env.yield_mode) { switch (psm3_wait(timeout)) { case PSM2_OK: @@ -226,6 +224,16 @@ STATIC int psmx3_wait_wait(struct fid_wait *wait, int timeout) } } + /* outside of YIELD_MODE, one must explicitly enable support for + * fi_wait(). user beware... this is not supported by PSM3 proper, + * but instead only plumbed within the PSMX3 provider shim. + */ + if (!psmx3_env.wait_enable) { + PSMX3_WARN(fabric->util_fabric.prov, FI_LOG_FABRIC, + "fi_wait() not enabled (see FI_PSM3_WAIT_ENABLE)\n"); + return -FI_ENOSYS; + } + psmx3_wait_start_progress(fabric); err = psmx3_wait_wait_wait(wait, timeout); @@ -239,20 +247,61 @@ DIRECT_FN int psmx3_wait_open(struct fid_fabric *fabric, struct fi_wait_attr *attr, struct fid_wait **waitset) { + struct util_fabric *util_fabric = container_of(fabric, struct util_fabric, fabric_fid); struct fid_wait *wait; int err; - if (psmx3_env.yield_mode && attr->wait_obj == FI_WAIT_YIELD) { - // CQ and CNTR won't be allowed to be added to waitset, so - // we simply create an UNSPEC fd waitset for simplicity here - // It should not actually be used - struct fi_wait_attr tmp = *attr; - tmp.wait_obj = FI_WAIT_UNSPEC; - err = ofi_wait_fd_open(fabric, &tmp, &wait); - } else { + + switch (attr->wait_obj) { + case FI_WAIT_UNSPEC: + case FI_WAIT_FD: + case FI_WAIT_POLLFD: err = ofi_wait_fd_open(fabric, attr, &wait); + if (err) + return err; + + break; + + case FI_WAIT_YIELD: + // NOTE: we use the YIELD type only as a special indicator for + // the Intel MPI yield mode global wait set. 
it is otherwise + // unsupported + if (!psmx3_env.yield_mode) { + PSMX3_WARN(util_fabric->prov, FI_LOG_FABRIC, + "wait object %u not supported outside of yield mode\n", + attr->wait_obj); + return -FI_ENOSYS; + } + + // if not in YIELD_MODE, create a yield wait object. + // + // if in YIELD_MODE, we want callers to only ever wait by + // invoking a top-level fi_wait(), since YIELD_MODE will turn + // all waits into a global wait. can also use a yield wait + // object for this, justified by: + // + // - it is not valid to call fi_control(...GETWAIT...) on a + // YIELD object, which is what we want to force callers to + // wait via fi_wait(). + // - CQ and CNTR won't be allowed to be added to waitset by + // an explicit yield mode check in their open() calls, so the + // object type shouldn't matter. + // - in yield mode, fi_wait() will never interact with the wait + // set directly, but instead just delegate to psm3_wait(), so + // the underlying wait set type doesn't matter. + // + err = ofi_wait_yield_open(fabric, attr, &wait); + if (err) + return err; + + break; + + case FI_WAIT_MUTEX_COND: + default: + PSMX3_WARN(util_fabric->prov, FI_LOG_FABRIC, + "wait object %u not supported\n", + attr->wait_obj); + return -FI_ENOSYS; } - if (err) - return err; psmx3_wait_ops_save = wait->ops; psmx3_wait_ops = *psmx3_wait_ops_save;
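
The FI_OPT_CUDA_API_PERMITTED plumbing added to psmx3_ep_getopt()/psmx3_ep_setopt() earlier in this patch is driven from the application side through the generic fi_setopt()/fi_getopt() calls at FI_OPT_ENDPOINT level. The sketch below is a hedged illustration of that usage, not code from the patch: the endpoint ep is assumed to already exist, and error handling is abbreviated.

#include <stdbool.h>
#include <stdio.h>
#include <rdma/fabric.h>
#include <rdma/fi_endpoint.h>

/* Forbid CUDA API use on one endpoint, then read the setting back.
 * With this patch, PSMX3 forwards the option to the underlying PSM3
 * endpoint as PSM2_CORE_OPT_EP_CUDA_PERMITTED. */
static int restrict_cuda_api(struct fid_ep *ep)
{
        bool permitted = false;
        size_t len = sizeof(permitted);
        int ret;

        ret = fi_setopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED,
                        &permitted, sizeof(permitted));
        if (ret)
                return ret;             /* e.g. -FI_EINVAL if rejected */

        ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_CUDA_API_PERMITTED,
                        &permitted, &len);
        if (!ret)
                printf("CUDA API permitted on this EP: %s\n",
                       permitted ? "yes" : "no");
        return ret;
}

Both handlers insist on an option length of sizeof(bool), so any other size is rejected with -FI_EINVAL. Blocking in fi_wait() on a PSMX3 waitset is a separate knob: with this patch it additionally requires FI_PSM3_WAIT_ENABLE=1 (or yield mode), as enforced in psmx3_wait_wait().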