From ad4e4961b110ef43eadc999de2e5f29003f8d207 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <weiqunzhang@lbl.gov>
Date: Tue, 17 Dec 2024 10:57:34 -0800
Subject: [PATCH] GMRESMLMG: Support Multi-level composite solve

---
 Src/Base/AMReX_FabArrayUtility.H              | 203 ++++++++++++++
 Src/LinearSolvers/AMReX_GMRES_MLMG.H          | 255 ++++++++++++------
 Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H    |  35 +++
 Src/LinearSolvers/MLMG/AMReX_MLLinOp.H        |  25 +-
 Src/LinearSolvers/MLMG/AMReX_MLMG.H           |   2 +-
 .../LinearSolvers/ABecLaplacian_C/MyTest.cpp  | 110 ++++++--
 6 files changed, 523 insertions(+), 107 deletions(-)
diff --git a/Src/Base/AMReX_FabArrayUtility.H b/Src/Base/AMReX_FabArrayUtility.H
index bfac76c471e..1d6be626612 100644
--- a/Src/Base/AMReX_FabArrayUtility.H
+++ b/Src/Base/AMReX_FabArrayUtility.H
@@ -1602,6 +1602,209 @@ Dot (FabArray<FAB> const& x, int xcomp, FabArray<FAB> const& y, int ycomp, int n
     return sm;
 }
 
+/**
+ * \brief Compute dot product of FabArray with itself
+ *
+ * \param x      first FabArray
+ * \param xcomp  starting component of x
+ * \param y      second FabArray
+ * \param ycomp  starting component of y
+ * \param ncomp  number of components
+ * \param nghost number of ghost cells
+ * \param local  If true, MPI communication is skipped.
+ */
+template <typename FAB, std::enable_if_t<IsBaseFab<FAB>::value,int> FOO = 0>
+typename FAB::value_type
+Dot (FabArray<FAB> const& x, int xcomp, int ncomp, IntVect const& nghost, bool local = false)
+{
+    BL_ASSERT(x.nGrowVect().allGE(nghost));
+
+    BL_PROFILE("amrex::Dot()");
+
+    using T = typename FAB::value_type;
+    auto sm = T(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& xma = x.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
+        {
+            auto t = T(0.0);
+            auto const& xfab = xma[box_no];
+            for (int n = 0; n < ncomp; ++n) {
+                auto v = xfab(i,j,k,xcomp+n);
+                t += v*v;
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            auto const& xfab = x.const_array(mfi);
+            AMREX_LOOP_4D(bx, ncomp, i, j, k, n,
+            {
+                auto v = xfab(i,j,k,xcomp+n);
+                sm += v*v;
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
+/**
+ * \brief Compute dot product of two FabArrays in region that mask is true
+ *
+ * \param mask   mask
+ * \param x      first FabArray
+ * \param xcomp  starting component of x
+ * \param y      second FabArray
+ * \param ycomp  starting component of y
+ * \param ncomp  number of components
+ * \param nghost number of ghost cells
+ * \param local  If true, MPI communication is skipped.
+ */
+template <typename IFAB, typename FAB,
+          std::enable_if_t<IsBaseFab<FAB>::value && IsBaseFab<IFAB>::value,int> FOO = 0>
+typename FAB::value_type
+Dot (FabArray<IFAB> const& mask, FabArray<FAB> const& x, int xcomp,
+     FabArray<FAB> const& y, int ycomp, int ncomp, IntVect const& nghost,
+     bool local = false)
+{
+    BL_ASSERT(x.boxArray() == y.boxArray() && x.boxArray() == mask.boxArray());
+    BL_ASSERT(x.DistributionMap() == y.DistributionMap() && x.DistributionMap() == mask.DistributionMap());
+    BL_ASSERT(x.nGrowVect().allGE(nghost) && y.nGrowVect().allGE(nghost) &&
+              mask.nGrowVect().allGE(nghost));
+
+    BL_PROFILE("amrex::Dot()");
+
+    using T = typename FAB::value_type;
+    auto sm = T(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& mma = mask.const_arrays();
+        auto const& xma = x.const_arrays();
+        auto const& yma = y.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
+        {
+            auto t = T(0.0);
+            if (mma[box_no](i,j,k)) {
+                auto const& xfab = xma[box_no];
+                auto const& yfab = yma[box_no];
+                for (int n = 0; n < ncomp; ++n) {
+                    t += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n);
+                }
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            auto const& mfab = mask.const_array(mfi);
+            auto const& xfab = x.const_array(mfi);
+            auto const& yfab = y.const_array(mfi);
+            AMREX_LOOP_4D(bx, ncomp, i, j, k, n,
+            {
+                if (mfab(i,j,k)) {
+                    sm += xfab(i,j,k,xcomp+n) * yfab(i,j,k,ycomp+n);
+                }
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
+/**
+ * \brief Compute dot product of FabArray with itself in region that mask is true
+ *
+ * \param mask   mask
+ * \param x      FabArray
+ * \param xcomp  starting component of x
+ * \param ncomp  number of components
+ * \param nghost number of ghost cells
+ * \param local  If true, MPI communication is skipped.
+ */
+template <typename IFAB, typename FAB,
+          std::enable_if_t<IsBaseFab<FAB>::value && IsBaseFab<IFAB>::value,int> FOO = 0>
+typename FAB::value_type
+Dot (FabArray<IFAB> const& mask, FabArray<FAB> const& x, int xcomp, int ncomp,
+     IntVect const& nghost, bool local = false)
+{
+    BL_ASSERT(x.boxArray() == mask.boxArray());
+    BL_ASSERT(x.DistributionMap() == mask.DistributionMap());
+    BL_ASSERT(x.nGrowVect().allGE(nghost) && mask.nGrowVect().allGE(nghost));
+
+    BL_PROFILE("amrex::Dot()");
+
+    using T = typename FAB::value_type;
+    auto sm = T(0.0);
+#ifdef AMREX_USE_GPU
+    if (Gpu::inLaunchRegion()) {
+        auto const& mma = mask.const_arrays();
+        auto const& xma = x.const_arrays();
+        sm = ParReduce(TypeList<ReduceOpSum>{}, TypeList<T>{}, x, nghost,
+        [=] AMREX_GPU_DEVICE (int box_no, int i, int j, int k) noexcept -> GpuTuple<T>
+        {
+            auto t = T(0.0);
+            if (mma[box_no](i,j,k)) {
+                auto const& xfab = xma[box_no];
+                for (int n = 0; n < ncomp; ++n) {
+                    auto v = xfab(i,j,k,xcomp+n);
+                    t += v*v;
+                }
+            }
+            return t;
+        });
+    } else
+#endif
+    {
+#ifdef AMREX_USE_OMP
+#pragma omp parallel if (!system::regtest_reduction) reduction(+:sm)
+#endif
+        for (MFIter mfi(x,true); mfi.isValid(); ++mfi)
+        {
+            Box const& bx = mfi.growntilebox(nghost);
+            auto const& mfab = mask.const_array(mfi);
+            auto const& xfab = x.const_array(mfi);
+            AMREX_LOOP_4D(bx, ncomp, i, j, k, n,
+            {
+                if (mfab(i,j,k)) {
+                    auto v = xfab(i,j,k,xcomp+n);
+                    sm += v*v;
+                }
+            });
+        }
+    }
+
+    if (!local) {
+        ParallelAllReduce::Sum(sm, ParallelContext::CommunicatorSub());
+    }
+
+    return sm;
+}
+
 //! dst = val
 template <class MF, std::enable_if_t<IsMultiFabLike_v<MF>,int> = 0>
 void setVal (MF& dst, typename MF::value_type val)
diff --git a/Src/LinearSolvers/AMReX_GMRES_MLMG.H b/Src/LinearSolvers/AMReX_GMRES_MLMG.H
index 56ee477250e..e9fbc41abc2 100644
--- a/Src/LinearSolvers/AMReX_GMRES_MLMG.H
+++ b/Src/LinearSolvers/AMReX_GMRES_MLMG.H
@@ -4,6 +4,7 @@
 
 #include <AMReX_GMRES.H>
 #include <AMReX_MLMG.H>
+#include <AMReX_TypeTraits.H>
 #include <utility>
 
 namespace amrex {
@@ -15,13 +16,14 @@ namespace amrex {
  * as the preconditioner.
  *
  */
-template <typename MF>
+template <typename VEC, bool Composite = false>
 class GMRESMLMGT
 {
 public:
+    using MF = std::conditional_t<Composite, typename VEC::value_type, VEC>;
     using MG = MLMGT<MF>;
     using RT = typename MG::RT; // double or float
-    using GM = GMRES<MF,GMRESMLMGT<MF>>;
+    using GM = GMRES<VEC,GMRESMLMGT<VEC,Composite>>;
 
     explicit GMRESMLMGT (MG& mlmg);
 
@@ -33,7 +35,11 @@ public:
      * \param a_tol_rel relative tolerance.
      * \param a_tol_abs absolute tolerance.
      */
-    void solve (MF& a_sol, MF const& a_rhs, RT a_tol_rel, RT a_tol_abs);
+    template <bool C=Composite, std::enable_if_t<!C,int> = 0>
+    void solve (VEC& a_sol, VEC const& a_rhs, RT a_tol_rel, RT a_tol_abs);
+
+    template <bool C=Composite, std::enable_if_t<C,int> = 0>
+    void solve (Vector<MF*> const& a_sol, Vector<MF const*> const& a_rhs, RT a_tol_rel, RT a_tol_abs);
 
     //! Sets verbosity.
     void setVerbose (int v) { m_gmres.setVerbose(v); }
@@ -63,33 +69,33 @@ public:
     void setPropertyOfZero (bool b) { m_prop_zero = b; }
 
     //! Make MultiFab without ghost cells
-    MF makeVecRHS () const;
+    VEC makeVecRHS () const;
 
     //! Make MultiFab with ghost cells and set ghost cells to zero
-    MF makeVecLHS () const;
+    VEC makeVecLHS () const;
 
-    RT norm2 (MF const& mf) const;
+    RT norm2 (VEC const& mf) const;
 
-    static void scale (MF& mf, RT scale_factor);
+    static void scale (VEC& mf, RT scale_factor);
 
-    RT dotProduct (MF const& mf1, MF const& mf2) const;
+    RT dotProduct (VEC const& mf1, VEC const& mf2) const;
 
     //! lhs = 0
-    static void setToZero (MF& lhs);
+    static void setToZero (VEC& lhs);
 
     //! lhs = rhs
-    static void assign (MF& lhs, MF const& rhs);
+    static void assign (VEC& lhs, VEC const& rhs);
 
     //! lhs += a*rhs
-    static void increment (MF& lhs, MF const& rhs, RT a);
+    static void increment (VEC& lhs, VEC const& rhs, RT a);
 
     //! lhs = a*rhs_a + b*rhs_b
-    static void linComb (MF& lhs, RT a, MF const& rhs_a, RT b, MF const& rhs_b);
+    static void linComb (VEC& lhs, RT a, VEC const& rhs_a, RT b, VEC const& rhs_b);
 
     //! lhs = L(rhs)
-    void apply (MF& lhs, MF const& rhs) const;
+    void apply (VEC& lhs, VEC const& rhs) const;
 
-    void precond (MF& lhs, MF const& rhs) const;
+    void precond (VEC& lhs, VEC const& rhs) const;
 
     //! Control whether or not to use MLMG as preconditioner.
     bool usePrecond (bool new_flag) { return std::exchange(m_use_precond, new_flag); }
@@ -101,117 +107,190 @@ private:
     GM m_gmres;
     MG* m_mlmg;
     MLLinOpT<MF>* m_linop;
+    int m_nlevels = 0;
     bool m_use_precond = true;
     bool m_prop_zero = false;
     int m_precond_niters = 1;
 };
 
-template <typename MF>
-GMRESMLMGT<MF>::GMRESMLMGT (MG& mlmg)
-    : m_mlmg(&mlmg), m_linop(&mlmg.getLinOp())
+template <typename VEC, bool Composite>
+GMRESMLMGT<VEC,Composite>::GMRESMLMGT (MG& mlmg)
+    : m_mlmg(&mlmg), m_linop(&mlmg.getLinOp()), m_nlevels(m_linop->NAMRLevels())
 {
-    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(m_linop->NAMRLevels() == 1,
-                                     "Only support single level solve");
     m_mlmg->setVerbose(0);
     m_mlmg->setBottomVerbose(0);
     m_mlmg->prepareForGMRES();
     m_gmres.define(*this);
 }
 
-template <typename MF>
-auto GMRESMLMGT<MF>::makeVecRHS () const -> MF
+template <typename VEC, bool Composite>
+auto GMRESMLMGT<VEC,Composite>::makeVecRHS () const -> VEC
 {
-    return m_linop->make(0, 0, IntVect(0));
+    if constexpr (!Composite) {
+        return m_linop->make(0, 0, IntVect(0));
+    } else {
+        VEC vmf(m_nlevels);
+        for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+            vmf[ilev] = m_linop->make(ilev, 0, IntVect(0));
+        }
+        return vmf;
+    }
 }
 
-template <typename MF>
-auto GMRESMLMGT<MF>::makeVecLHS () const -> MF
+template <typename VEC, bool Composite>
+auto GMRESMLMGT<VEC,Composite>::makeVecLHS () const -> VEC
 {
-    auto mf = m_linop->make(0, 0, IntVect(1));
-    setBndry(mf, RT(0), 0, nComp(mf));
-    return mf;
+    if constexpr (!Composite) {
+        auto mf = m_linop->make(0, 0, IntVect(1));
+        setBndry(mf, RT(0), 0, nComp(mf));
+        return mf;
+    } else {
+        VEC vmf(m_nlevels);
+        for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+            vmf[ilev] = m_linop->make(ilev, 0, IntVect(1));
+            setBndry(vmf[ilev], RT(0), 0, nComp(vmf[ilev]));
+        }
+        return vmf;
+    }
 }
 
-template <typename MF>
-auto GMRESMLMGT<MF>::norm2 (MF const& mf) const -> RT
+template <typename VEC, bool Composite>
+auto GMRESMLMGT<VEC,Composite>::norm2 (VEC const& mf) const -> RT
 {
-    auto r = m_linop->xdoty(0, 0, mf, mf, false);
-    return std::sqrt(r);
+    if constexpr (!Composite) {
+        return m_linop->norm2ForGMRES({&mf});
+    } else {
+        return m_linop->norm2ForGMRES(GetVecOfConstPtrs(mf));
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::scale (MF& mf, RT scale_factor)
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::scale (VEC& mf, RT scale_factor)
 {
-    Scale(mf, scale_factor, 0, nComp(mf), 0);
+    if constexpr (!Composite) {
+        Scale(mf, scale_factor, 0, nComp(mf), 0);
+    } else {
+        for (auto& xmf : mf) {
+            Scale(xmf, scale_factor, 0, nComp(xmf), 0);
+        }
+    }
 }
 
-template <typename MF>
-auto GMRESMLMGT<MF>::dotProduct (MF const& mf1, MF const& mf2) const -> RT
+template <typename VEC, bool Composite>
+auto GMRESMLMGT<VEC,Composite>::dotProduct (VEC const& mf1, VEC const& mf2) const -> RT
 {
-    return m_linop->xdoty(0, 0, mf1, mf2, false);
+    if constexpr (!Composite) {
+        return m_linop->dotProductForGMRES({&mf1}, {&mf2});
+    } else {
+        return m_linop->dotProductForGMRES(GetVecOfConstPtrs(mf1), GetVecOfConstPtrs(mf2));
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::setToZero (MF& lhs)
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::setToZero (VEC& lhs)
 {
-    setVal(lhs, RT(0.0));
+    if constexpr (!Composite) {
+        setVal(lhs, RT(0.0));
+    } else {
+        for (auto& xmf : lhs) {
+            setVal(xmf, RT(0.0));
+        }
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::assign (MF& lhs, MF const& rhs)
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::assign (VEC& lhs, VEC const& rhs)
 {
-    LocalCopy(lhs, rhs, 0, 0, nComp(lhs), IntVect(0));
+    if constexpr (!Composite) {
+        LocalCopy(lhs, rhs, 0, 0, nComp(lhs), IntVect(0));
+    } else {
+        auto nlevels = int(lhs.size());
+        for (int ilev = 0; ilev < nlevels; ++ilev) {
+            LocalCopy(lhs[ilev], rhs[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0));
+        }
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::increment (MF& lhs, MF const& rhs, RT a)
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::increment (VEC& lhs, VEC const& rhs, RT a)
 {
-    Saxpy(lhs, a, rhs, 0, 0, nComp(lhs), IntVect(0));
+    if constexpr (!Composite) {
+        Saxpy(lhs, a, rhs, 0, 0, nComp(lhs), IntVect(0));
+    } else {
+        auto nlevels = int(lhs.size());
+        for (int ilev = 0; ilev < nlevels; ++ilev) {
+            Saxpy(lhs[ilev], a, rhs[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0));
+        }
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::linComb (MF& lhs, RT a, MF const& rhs_a, RT b, MF const& rhs_b)
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::linComb (VEC& lhs, RT a, VEC const& rhs_a, RT b, VEC const& rhs_b)
 {
-    LinComb(lhs, a, rhs_a, 0, b, rhs_b, 0, 0, nComp(lhs), IntVect(0));
+    if constexpr (!Composite) {
+        LinComb(lhs, a, rhs_a, 0, b, rhs_b, 0, 0, nComp(lhs), IntVect(0));
+    } else {
+        auto nlevels = int(lhs.size());
+        for (int ilev = 0; ilev < nlevels; ++ilev) {
+            LinComb(lhs[ilev], a, rhs_a[ilev], 0, b, rhs_b[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0));
+        }
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::apply (MF& lhs, MF const& rhs) const
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::apply (VEC& lhs, VEC const& rhs) const
 {
-    m_linop->apply(0, 0, lhs, const_cast<MF&>(rhs),
-                   MLLinOpT<MF>::BCMode::Homogeneous,
-                   MLLinOpT<MF>::StateMode::Correction);
+    if constexpr(!Composite) {
+        m_linop->apply(0, 0, lhs, const_cast<VEC&>(rhs),
+                       MLLinOpT<VEC>::BCMode::Homogeneous,
+                       MLLinOpT<VEC>::StateMode::Correction);
+    } else {
+        m_mlmg->apply(GetVecOfPtrs(lhs), GetVecOfPtrs(const_cast<VEC&>(rhs)));
+    }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::precond (MF& lhs, MF const& rhs) const
+template <typename VEC, bool Composite>
+void GMRESMLMGT<VEC,Composite>::precond (VEC& lhs, VEC const& rhs) const
 {
-    if (m_use_precond) {
-        m_mlmg->prepareMGcycle();
-
-        for (int icycle = 0; icycle < m_precond_niters; ++icycle) {
-            if (icycle == 0) {
-                LocalCopy(m_mlmg->res[0][0], rhs, 0, 0, nComp(rhs), IntVect(0));
-            } else {
-                m_mlmg->computeResOfCorrection(0,0);
-                LocalCopy(m_mlmg->res[0][0], m_mlmg->rescor[0][0], 0, 0, nComp(rhs), IntVect(0));
-            }
-
-            m_mlmg->mgVcycle(0,0);
-
-            if (icycle == 0) {
-                LocalCopy(lhs, m_mlmg->cor[0][0], 0, 0, nComp(rhs), IntVect(0));
-            } else {
-                increment(lhs, m_mlmg->cor[0][0], RT(1));
+    if constexpr (!Composite) {
+        if (m_use_precond) {
+            m_mlmg->prepareMGcycle();
+
+            for (int icycle = 0; icycle < m_precond_niters; ++icycle) {
+                if (icycle == 0) {
+                    LocalCopy(m_mlmg->res[0][0], rhs, 0, 0, nComp(rhs), IntVect(0));
+                } else {
+                    m_mlmg->computeResOfCorrection(0,0);
+                    LocalCopy(m_mlmg->res[0][0], m_mlmg->rescor[0][0], 0, 0, nComp(rhs), IntVect(0));
+                }
+
+                m_mlmg->mgVcycle(0,0);
+
+                if (icycle == 0) {
+                    LocalCopy(lhs, m_mlmg->cor[0][0], 0, 0, nComp(rhs), IntVect(0));
+                } else {
+                    increment(lhs, m_mlmg->cor[0][0], RT(1));
+                }
             }
+        } else {
+            LocalCopy(lhs, rhs, 0, 0, nComp(lhs), IntVect(0));
         }
     } else {
-        LocalCopy(lhs, rhs, 0, 0, nComp(lhs), IntVect(0));
+        if (m_use_precond) {
+            m_mlmg->setFixedIter(m_precond_niters);
+            setToZero(lhs);
+            m_mlmg->solve(GetVecOfPtrs(lhs), GetVecOfConstPtrs(rhs), 0, 0);
+        } else {
+            for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+                LocalCopy(lhs[ilev], rhs[ilev], 0, 0, nComp(lhs[ilev]), IntVect(0));
+            }
+        }
     }
 }
 
-template <typename MF>
-void GMRESMLMGT<MF>::solve (MF& a_sol, MF const& a_rhs, RT a_tol_rel, RT a_tol_abs)
+template <typename VEC, bool Composite>
+template <bool C, std::enable_if_t<!C,int>>
+void GMRESMLMGT<VEC,Composite>::solve (VEC& a_sol, VEC const& a_rhs, RT a_tol_rel, RT a_tol_abs)
 {
     if (m_prop_zero) {
         auto rhs = makeVecRHS();
@@ -229,7 +308,29 @@ void GMRESMLMGT<MF>::solve (MF& a_sol, MF const& a_rhs, RT a_tol_rel, RT a_tol_a
     }
 }
 
-using GMRESMLMG = GMRESMLMGT<MultiFab>;
+template <typename VEC, bool Composite>
+template <bool C, std::enable_if_t<C,int>>
+void GMRESMLMGT<VEC,Composite>::solve (Vector<MF*> const& a_sol, Vector<MF const*> const& a_rhs, RT a_tol_rel, RT a_tol_abs)
+{
+    auto res = makeVecRHS();
+    m_mlmg->apply(GetVecOfPtrs(res), a_sol); // res = L(sol)
+    // res = L(sol) - rhs
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        Saxpy(res[ilev], RT(-1), *a_rhs[ilev], 0, 0, nComp(res[ilev]), IntVect(0));
+    }
+    auto cor = makeVecLHS();
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        m_linop->setDirichletNodesToZero(ilev,0,res[ilev]);
+        m_linop->setLevelBC(ilev, nullptr);
+    }
+    m_gmres.solve(cor, res, a_tol_rel, a_tol_abs); // L(cor) = res
+    // sol = sol - cor
+    for (int ilev = 0; ilev < m_nlevels; ++ilev) {
+        Saxpy(*a_sol[ilev], RT(-1), cor[ilev], 0, 0, nComp(*a_sol[ilev]), IntVect(0));
+    }
+}
+
+using GMRESMLMG = GMRESMLMGT<MultiFab,false>;
 
 }
 
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
index b318b318eb2..b6d76b4c57a 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLCellLinOp.H
@@ -113,6 +113,10 @@ public:
 
     RT xdoty (int amrlev, int mglev, const MF& x, const MF& y, bool local) const final;
 
+    RT dotProductForGMRES (Vector<MF const*> const& x, Vector<MF const*> const& y) const final;
+
+    RT norm2ForGMRES (Vector<MF const*> const& x) const final;
+
     virtual void Fapply (int amrlev, int mglev, MF& out, const MF& in) const = 0;
     virtual void Fsmooth (int amrlev, int mglev, MF& sol, const MF& rhs, int redblack) const = 0;
     virtual void FFlux (int amrlev, const MFIter& mfi,
@@ -1939,6 +1943,37 @@ MLCellLinOpT<MF>::xdoty (int /*amrlev*/, int /*mglev*/, const MF& x, const MF& y
     return result;
 }
 
+template <typename MF>
+auto
+MLCellLinOpT<MF>::dotProductForGMRES (Vector<MF const*> const& x, Vector<MF const*> const& y) const -> RT
+{
+    const int ncomp = this->getNComp();
+    const IntVect nghost(0);
+    RT result = 0;
+    for (int ilev = 0; ilev < this->NAMRLevels()-1; ++ilev) {
+        result += amrex::Dot(*m_norm_fine_mask[ilev], *x[ilev], 0, *y[ilev], 0, ncomp, nghost, true);
+    }
+    result += amrex::Dot(*x[this->NAMRLevels()-1], 0,
+                         *y[this->NAMRLevels()-1], 0, ncomp, nghost, true);
+    ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub());
+    return result;
+}
+
+template <typename MF>
+auto
+MLCellLinOpT<MF>::norm2ForGMRES (Vector<MF const*> const& x) const -> RT
+{
+    const int ncomp = this->getNComp();
+    const IntVect nghost(0);
+    RT result = 0;
+    for (int ilev = 0; ilev < this->NAMRLevels()-1; ++ilev) {
+        result += amrex::Dot(*m_norm_fine_mask[ilev], *x[ilev], 0, ncomp, nghost, true);
+    }
+    result += amrex::Dot(*x[this->NAMRLevels()-1], 0, ncomp, nghost, true);
+    ParallelAllReduce::Sum(result, ParallelContext::CommunicatorSub());
+    return std::sqrt(result);
+}
+
 template <typename MF>
 void
 MLCellLinOpT<MF>::computeVolInv () const
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
index ce6a8b53335..acc99b254eb 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLLinOp.H
@@ -91,7 +91,7 @@ template <typename T> class MLMGT;
 template <typename T> class MLCGSolverT;
 template <typename T> class MLPoissonT;
 template <typename T> class MLABecLaplacianT;
-template <typename T> class GMRESMLMGT;
+template <typename T, bool Composite> class GMRESMLMGT;
 
 template <typename MF>
 class MLLinOpT
@@ -102,7 +102,7 @@ public:
     template <typename T> friend class MLCGSolverT;
     template <typename T> friend class MLPoissonT;
     template <typename T> friend class MLABecLaplacianT;
-    template <typename T> friend class GMRESMLMGT;
+    template <typename T, bool Composite> friend class GMRESMLMGT;
 
     using MFType = MF;
     using FAB = typename FabDataType<MF>::fab_type;
@@ -485,6 +485,10 @@ public:
     //! x dot y, used by the bottom solver
     virtual RT xdoty (int amrlev, int mglev, const MF& x, const MF& y, bool local) const = 0;
 
+    virtual RT dotProductForGMRES (Vector<MF const*> const& x, Vector<MF const*> const& y) const;
+
+    virtual RT norm2ForGMRES (Vector<MF const*> const& x) const;
+
     virtual std::unique_ptr<MLLinOpT<MF>> makeNLinOp (int /*grid_size*/) const
     {
         amrex::Abort("MLLinOp::makeNLinOp: N-Solve not supported");
@@ -1643,6 +1647,23 @@ MLLinOpT<MF>::setLevelBC (int amrlev, const AMF* levelbcdata,
                      robin_b_raii[amrlev].get(), robin_f_raii[amrlev].get());
 }
 
+template <typename MF>
+auto
+MLLinOpT<MF>::dotProductForGMRES (Vector<MF const*> const& x, Vector<MF const*> const& y) const -> RT
+{
+    AMREX_ALWAYS_ASSERT(NAMRLevels() == 1);
+    return xdoty(0,0,*x[0],*y[0],false);
+}
+
+template <typename MF>
+auto
+MLLinOpT<MF>::norm2ForGMRES (Vector<MF const*> const& x) const -> RT
+{
+    AMREX_ALWAYS_ASSERT(NAMRLevels() == 1);
+    auto r = xdoty(0,0,*x[0],*x[0],false);
+    return std::sqrt(r);
+}
+
 extern template class MLLinOpT<MultiFab>;
 
 using MLLinOp = MLLinOpT<MultiFab>;
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLMG.H b/Src/LinearSolvers/MLMG/AMReX_MLMG.H
index 78b2ffdd3df..f0b453338dc 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLMG.H
+++ b/Src/LinearSolvers/MLMG/AMReX_MLMG.H
@@ -20,7 +20,7 @@ public:
     };
 
     template <typename T> friend class MLCGSolverT;
-    template <typename M> friend class GMRESMLMGT;
+    template <typename M, bool Composite> friend class GMRESMLMGT;
 
     using MFType = MF;
     using FAB = typename MLLinOpT<MF>::FAB;
diff --git a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
index 7a7647ce93a..408fa0b6c08 100644
--- a/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
+++ b/Tests/LinearSolvers/ABecLaplacian_C/MyTest.cpp
@@ -477,12 +477,9 @@ MyTest::solveABecLaplacianGMRES ()
 
     const auto nlevels = static_cast<int>(geom.size());
 
-    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(composite_solve == false || nlevels == 1,
-       "solveABecLaplacianGMRES does not support composite solve");
-
-    for (int ilev = 0; ilev < nlevels; ++ilev)
+    if (composite_solve)
     {
-        MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info);
+        MLABecLaplacian mlabec(geom, grids, dmap, info);
 
         mlabec.setMaxOrder(linop_maxorder);
 
@@ -493,40 +490,99 @@ MyTest::solveABecLaplacianGMRES ()
                                          LinOpBCType::Dirichlet,
                                          LinOpBCType::Neumann)});
 
-        if (ilev > 0) {
-            mlabec.setCoarseFineBC(&solution[ilev-1], ref_ratio);
+        for (int ilev = 0; ilev < nlevels; ++ilev)
+        {
+            mlabec.setLevelBC(ilev, &solution[ilev]);
         }
 
-        // for problem with pure homogeneous Neumann BC, we could pass a nullptr
-        mlabec.setLevelBC(0, &solution[ilev]);
-
         mlabec.setScalars(ascalar, bscalar);
 
-        mlabec.setACoeffs(0, acoef[ilev]);
-
-        Array<MultiFab,AMREX_SPACEDIM> face_bcoef;
-        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+        for (int ilev = 0; ilev < nlevels; ++ilev)
         {
-            const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(),
-                                                IntVect::TheDimensionVector(idim));
-            face_bcoef[idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0);
+            mlabec.setACoeffs(ilev, acoef[ilev]);
+
+            Array<MultiFab,AMREX_SPACEDIM> face_bcoef;
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+            {
+                const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(),
+                                                    IntVect::TheDimensionVector(idim));
+                face_bcoef[idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0);
+            }
+            amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef),
+                                              bcoef[ilev], geom[ilev]);
+            mlabec.setBCoeffs(ilev, amrex::GetArrOfConstPtrs(face_bcoef));
         }
-        amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef),
-                                          bcoef[ilev], geom[ilev]);
-        mlabec.setBCoeffs(0, amrex::GetArrOfConstPtrs(face_bcoef));
 
         MLMG mlmg(mlabec);
-        GMRESMLMG gmsolver(mlmg);
+        GMRESMLMGT<Vector<MultiFab>,true> gmsolver(mlmg);
         gmsolver.usePrecond(true);
         gmsolver.setVerbose(verbose);
-        gmsolver.solve(solution[ilev], rhs[ilev], tol_rel, tol_abs);
+        gmsolver.solve(GetVecOfPtrs(solution), GetVecOfConstPtrs(rhs), tol_rel, tol_abs);
 
         if (verbose) {
-            MultiFab res(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
-            mlmg.apply({&res}, {&solution[ilev]}); // res = L(sol)
-            MultiFab::Subtract(res, rhs[ilev], 0, 0, 1, 0); // now res = L(sol) - rhs
-            amrex::Print() << "Final residual = " << res.norminf(0)
-                           << " " << res.norm1(0) << " " << res.norm2(0) << '\n';
+            Vector<MultiFab> res(nlevels);
+            for (int ilev = 0; ilev < nlevels; ++ilev) {
+                res[ilev].define(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
+            }
+            mlmg.apply(GetVecOfPtrs(res), GetVecOfPtrs(solution)); // res = L(sol)
+            for (int ilev = 0; ilev < nlevels; ++ilev) {
+                MultiFab::Subtract(res[ilev], rhs[ilev], 0, 0, 1, 0);
+                amrex::Print() << "Final residual at level " << ilev << " = "
+                               << res[ilev].norminf(0) << " " << res[ilev].norm1(0) << " "
+                               << res[ilev].norm2(0) << '\n';
+            }
+        }
+    }
+    else
+    {
+        for (int ilev = 0; ilev < nlevels; ++ilev)
+        {
+            MLABecLaplacian mlabec({geom[ilev]}, {grids[ilev]}, {dmap[ilev]}, info);
+
+            mlabec.setMaxOrder(linop_maxorder);
+
+            mlabec.setDomainBC({AMREX_D_DECL(LinOpBCType::Dirichlet,
+                                             LinOpBCType::Neumann,
+                                             LinOpBCType::Neumann)},
+                               {AMREX_D_DECL(LinOpBCType::Neumann,
+                                             LinOpBCType::Dirichlet,
+                                             LinOpBCType::Neumann)});
+
+            if (ilev > 0) {
+                mlabec.setCoarseFineBC(&solution[ilev-1], ref_ratio);
+            }
+
+            // for problem with pure homogeneous Neumann BC, we could pass a nullptr
+            mlabec.setLevelBC(0, &solution[ilev]);
+
+            mlabec.setScalars(ascalar, bscalar);
+
+            mlabec.setACoeffs(0, acoef[ilev]);
+
+            Array<MultiFab,AMREX_SPACEDIM> face_bcoef;
+            for (int idim = 0; idim < AMREX_SPACEDIM; ++idim)
+            {
+                const BoxArray& ba = amrex::convert(bcoef[ilev].boxArray(),
+                                                    IntVect::TheDimensionVector(idim));
+                face_bcoef[idim].define(ba, bcoef[ilev].DistributionMap(), 1, 0);
+            }
+            amrex::average_cellcenter_to_face(GetArrOfPtrs(face_bcoef),
+                                              bcoef[ilev], geom[ilev]);
+            mlabec.setBCoeffs(0, amrex::GetArrOfConstPtrs(face_bcoef));
+
+            MLMG mlmg(mlabec);
+            GMRESMLMG gmsolver(mlmg);
+            gmsolver.usePrecond(true);
+            gmsolver.setVerbose(verbose);
+            gmsolver.solve(solution[ilev], rhs[ilev], tol_rel, tol_abs);
+
+            if (verbose) {
+                MultiFab res(rhs[ilev].boxArray(), rhs[ilev].DistributionMap(), 1, 0);
+                mlmg.apply({&res}, {&solution[ilev]}); // res = L(sol)
+                MultiFab::Subtract(res, rhs[ilev], 0, 0, 1, 0); // now res = L(sol) - rhs
+                amrex::Print() << "Final residual = " << res.norminf(0)
+                               << " " << res.norm1(0) << " " << res.norm2(0) << '\n';
+            }
         }
     }
 }