From 73ead8989f78477e6d9c9f8315c42abe14e43962 Mon Sep 17 00:00:00 2001
From: Ben Wibking
Date: Wed, 29 Jan 2025 00:29:33 -0500
Subject: [PATCH 1/4] Enable GPU-aware MPI by default

This turns on GPU-aware MPI by default. On all current machines,
simulations run faster with GPU-aware MPI enabled.

Two technical issues that prevented this are now resolved: AMReX now
has the communication arena, which does not use managed memory, and
SLURM no longer uses cgroup isolation for GPU bindings by default.

Closes https://github.com/AMReX-Codes/amrex/issues/2967.
---
 Src/Base/AMReX_ParallelDescriptor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp
index f6ac26e7984..67848505f2e 100644
--- a/Src/Base/AMReX_ParallelDescriptor.cpp
+++ b/Src/Base/AMReX_ParallelDescriptor.cpp
@@ -57,7 +57,7 @@ namespace amrex::ParallelDescriptor {
 #endif
 
 #ifdef AMREX_USE_GPU
-    bool use_gpu_aware_mpi = false;
+    bool use_gpu_aware_mpi = true;
 #else
     bool use_gpu_aware_mpi = false;
 #endif

From cc4ea66086ad4a7362902b33b8b91e671645cb08 Mon Sep 17 00:00:00 2001
From: Ben Wibking
Date: Thu, 30 Jan 2025 10:03:28 +1100
Subject: [PATCH 2/4] check for GPU-aware support

---
 Src/Base/AMReX_ParallelDescriptor.cpp | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp
index 67848505f2e..83d55bb273a 100644
--- a/Src/Base/AMReX_ParallelDescriptor.cpp
+++ b/Src/Base/AMReX_ParallelDescriptor.cpp
@@ -10,6 +10,9 @@
 
 #ifdef BL_USE_MPI
 #include <mpi.h>
+#if __has_include(<mpi-ext.h>) && defined(OPEN_MPI)
+#  include <mpi-ext.h>
+#endif
 #endif
 
 #ifdef AMREX_PMI
@@ -57,7 +60,7 @@ namespace amrex::ParallelDescriptor {
 #endif
 
 #ifdef AMREX_USE_GPU
-    bool use_gpu_aware_mpi = true;
+    bool use_gpu_aware_mpi = false;
 #else
     bool use_gpu_aware_mpi = false;
 #endif
@@ -1510,6 +1513,28 @@ ReadAndBcastFile (const std::string& filename, Vector<char>& charBuf,
 void
 Initialize ()
 {
+#if defined(AMREX_USE_CUDA)
+
+#if (defined(OMPI_HAVE_MPI_EXT_CUDA) && OMPI_HAVE_MPI_EXT_CUDA) || (defined(MPICH) && defined(MPIX_GPU_SUPPORT_CUDA))
+    use_gpu_aware_mpi = (bool) MPIX_Query_cuda_support();
+#endif
+
+#elif defined(AMREX_USE_HIP)
+
+#if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
+    use_gpu_aware_mpi = (bool) MPIX_Query_rocm_support();
+#elif defined(MPICH) && defined(MPIX_GPU_SUPPORT_HIP)
+    use_gpu_aware_mpi = (bool) MPIX_Query_hip_support();
+#endif
+
+#elif defined(AMREX_USE_SYCL)
+
+#if defined(MPICH) && defined(MPIX_GPU_SUPPORT_ZE)
+    use_gpu_aware_mpi = (bool) MPIX_Query_ze_support();
+#endif
+
+#endif
+
 #ifndef BL_AMRPROF
     ParmParse pp("amrex");
     pp.queryAdd("use_gpu_aware_mpi", use_gpu_aware_mpi);

From 269c8a324e886306e0c9b31ea605ce59b772fa1e Mon Sep 17 00:00:00 2001
From: Weiqun Zhang
Date: Sun, 2 Feb 2025 15:00:56 -0800
Subject: [PATCH 3/4] Apply suggestions from code review

---
 Src/Base/AMReX_ParallelDescriptor.cpp | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/Src/Base/AMReX_ParallelDescriptor.cpp b/Src/Base/AMReX_ParallelDescriptor.cpp
index 83d55bb273a..88e11c3b324 100644
--- a/Src/Base/AMReX_ParallelDescriptor.cpp
+++ b/Src/Base/AMReX_ParallelDescriptor.cpp
@@ -1524,13 +1524,19 @@ Initialize ()
 #if defined(OMPI_HAVE_MPI_EXT_ROCM) && OMPI_HAVE_MPI_EXT_ROCM
     use_gpu_aware_mpi = (bool) MPIX_Query_rocm_support();
 #elif defined(MPICH) && defined(MPIX_GPU_SUPPORT_HIP)
-    use_gpu_aware_mpi = (bool) MPIX_Query_hip_support();
+    int is_supported = 0;
+    if (MPIX_GPU_query_support(MPIX_GPU_SUPPORT_HIP, &is_supported) == MPI_SUCCESS) {
+        use_gpu_aware_mpi = (bool) is_supported;
+    }
 #endif
 
 #elif defined(AMREX_USE_SYCL)
 
 #if defined(MPICH) && defined(MPIX_GPU_SUPPORT_ZE)
-    use_gpu_aware_mpi = (bool) MPIX_Query_ze_support();
+    int is_supported = 0;
+    if (MPIX_GPU_query_support(MPIX_GPU_SUPPORT_ZE, &is_supported) == MPI_SUCCESS) {
+        use_gpu_aware_mpi = (bool) is_supported;
+    }
 #endif
 
 #endif

From c6e3c923cce3549ead4372b09147f9d86aac7b36 Mon Sep 17 00:00:00 2001
From: Ben Wibking
Date: Mon, 3 Feb 2025 16:22:07 +1100
Subject: [PATCH 4/4] update docs

---
 Docs/sphinx_documentation/source/GPU.rst | 31 ++++++++++++------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
index b4b7aba9b0f..9e370b8ee20 100644
--- a/Docs/sphinx_documentation/source/GPU.rst
+++ b/Docs/sphinx_documentation/source/GPU.rst
@@ -1643,7 +1643,7 @@ Finally, the parallel communication of particle data has been ported and optimiz
 platforms. This includes :cpp:`Redistribute()`, which moves particles back to the proper grids after their
 positions have changed, as well as :cpp:`fillNeighbors()` and :cpp:`updateNeighbors()`, which are used to
 exchange halo particles. As with :cpp:`MultiFab` data, these have been designed to minimize host / device traffic as much as possible, and can
-take advantage of the Cuda-aware MPI implementations available on platforms such as ORNL's Summit.
+take advantage of the GPU-aware MPI implementations available on platforms such as ORNL's Frontier.
 
 
 Profiling with GPUs
@@ -1742,17 +1742,18 @@ Inputs Parameters
 
 The following inputs parameters control the behavior of amrex when running on GPUs. They should be prefaced by "amrex" in your :cpp:`inputs` file.
 
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-|                            | Description                                                           | Type        | Default  |
-+============================+=======================================================================+=============+==========+
-| use_gpu_aware_mpi          | Whether to use GPU memory for communication buffers during MPI calls. | Bool        | 0        |
-|                            | If true, the buffers will use device memory. If false (i.e., 0), they |             |          |
-|                            | will use pinned memory. In practice, we find it is not always worth   |             |          |
-|                            | it to use GPU aware MPI.                                              |             |          |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a      | Bool        | 0        |
-|                            | requested allocation, AMReX will call AMReX::Abort() with an error    |             |          |
-|                            | describing how much free memory there is and what was requested.      |             |          |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
-| the_arena_is_managed       | Whether :cpp:`The_Arena()` allocates managed memory.                  | Bool        | 0        |
-+----------------------------+-----------------------------------------------------------------------+-------------+----------+
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+|                            | Description                                                           | Type        | Default        |
++============================+=======================================================================+=============+================+
+| use_gpu_aware_mpi          | Whether to use GPU memory for communication buffers during MPI calls. | Bool        | MPI-dependent  |
+|                            | If true, the buffers will use device memory. If false (i.e., 0), they |             |                |
+|                            | will use pinned memory. By default, it is enabled if AMReX detects    |             |                |
+|                            | that GPU-aware MPI is supported by the MPI library (MPICH, OpenMPI,   |             |                |
+|                            | and derived implementations).                                         |             |                |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| abort_on_out_of_gpu_memory | If the size of free memory on the GPU is less than the size of a      | Bool        | 0              |
+|                            | requested allocation, AMReX will call AMReX::Abort() with an error    |             |                |
+|                            | describing how much free memory there is and what was requested.      |             |                |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
+| the_arena_is_managed       | Whether :cpp:`The_Arena()` allocates managed memory.                  | Bool        | 0              |
++----------------------------+-----------------------------------------------------------------------+-------------+----------------+
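
A note on using the new default: the effective setting can be read back through ParmParse after
initialization, because ParallelDescriptor::Initialize() registers it via the pp.queryAdd() call shown
in PATCH 2/4. The sketch below is illustrative only and is not part of the series; it assumes the
standard AMReX public headers, and the detected default can still be overridden from an inputs file or
the command line (for example, by appending amrex.use_gpu_aware_mpi=0).

    // Illustrative sketch (not part of the patches above). Assumes the
    // standard AMReX headers; "amrex.use_gpu_aware_mpi" is the key that
    // ParallelDescriptor::Initialize() registers via pp.queryAdd().
    #include <AMReX.H>
    #include <AMReX_ParmParse.H>
    #include <AMReX_Print.H>

    int main (int argc, char* argv[])
    {
        amrex::Initialize(argc, argv);  // runs the GPU-aware MPI detection added in this series
        {
            bool gpu_aware = false;
            amrex::ParmParse pp("amrex");
            pp.query("use_gpu_aware_mpi", gpu_aware);  // detected default or user override
            amrex::Print() << "MPI communication buffers: "
                           << (gpu_aware ? "device memory" : "pinned host memory") << "\n";
        }
        amrex::Finalize();
        return 0;
    }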