#include <AMReX_Config.H>

#include <AMReX_AlgPartition.H>
#include <AMReX_AlgVector.H>
#include <AMReX_Gpu.H>
#include <AMReX_INT.H>
#include <AMReX_Scan.H>

#include <limits>
@@ -57,10 +58,19 @@ public:
57
58
template <typename F>
58
59
void setVal (F const & f);
59
60
61
+ template <typename U> friend void SpMV (AlgVector<U>& y, SpMatrix<U> const & A, AlgVector<U> const & x);
62
+
60
63
private:
61
64
62
65
void define_doit (int nnz);
63
66
67
+ void startComm (AlgVector<T> const & x) const ;
68
+ void finishComm (AlgVector<T>& y) const ;
69
+
70
+ #ifdef AMREX_USE_MPI
71
+ void prepare_for_comm ();
72
+ #endif
73
+
64
74
struct CSR {
65
75
Vec<T> mat;
66
76
Vec<Long> col_index;
@@ -71,7 +81,12 @@ private:
71
81
AlgPartition m_partition;
72
82
Long m_row_begin = 0 ;
73
83
Long m_row_end = 0 ;
74
- CSR m_data; // We might need two CSRs, one for local data, the other for remote data
84
+ CSR m_data;
85
+
86
+ #ifdef AMREX_USE_MPI
87
+ CSR m_data_remote;
88
+ bool m_comm_prepared = false ;
89
+ #endif
75
90
};
76
91
77
92
template <typename T, template <typename > class Allocator >
@@ -149,6 +164,119 @@ void SpMatrix<T,Allocator>::setVal (F const& f)
149
164
});
150
165
}
151
166
167
+ template <typename T, template <typename > class Allocator >
168
+ void SpMatrix<T,Allocator>::startComm (AlgVector<T> const & x) const
169
+ {
170
+ if (this ->numLocalRows () == 0 ) { return ; }
171
+
172
+ #ifndef AMREX_USE_MPI
173
+ amrex::ignore_unused (x);
174
+ #else
175
+ if (this ->numLocalRows () == this ->numGlobalRows ()) { return ; }
176
+
177
+ const_cast <SpMatrix<T,Allocator>*>(this )->prepare_for_comm ();
178
+
179
+ #endif
180
+ }
181
+
182
+ template <typename T, template <typename > class Allocator >
183
+ void SpMatrix<T,Allocator>::finishComm (AlgVector<T>& y) const
184
+ {
185
+ if (this ->numLocalRows () == 0 ) { return ; }
186
+
187
+ #ifndef AMREX_USE_MPI
188
+ amrex::ignore_unused (y);
189
+ #else
190
+ if (this ->numLocalRows () == this ->numGlobalRows ()) { return ; }
191
+ #endif
192
+ }
193
+
194
#ifdef AMREX_USE_MPI

//! One-time setup for distributed SpMV: split the matrix into a square
//! local part (columns in [m_row_begin, m_row_end)) and a remote part
//! (all other columns), so the local product can overlap communication.
//!
//! On exit: m_data holds only local entries with column indices shifted to
//! local numbering, m_data_remote holds the off-rank entries with global
//! column indices, and m_data.row_offset has been remapped to the compacted
//! local storage.  Idempotent via m_comm_prepared.
template <typename T, template <typename> class Allocator>
void SpMatrix<T,Allocator>::prepare_for_comm ()
{
    if (m_comm_prepared) { return; }

    // First, we need to split the matrix into two parts, a square matrix
    // for pure local operations and another part for remote operations.

    Long all_nnz = m_data.nnz;
    Long local_nnz;
    // pfsum[i] = number of local entries strictly before i (exclusive scan
    // of the "is local" predicate); doubles as the scatter index below.
    Gpu::DeviceVector<Long> pfsum(all_nnz);
    auto* p_pfsum = pfsum.data();
    auto row_begin = m_row_begin;
    auto row_end = m_row_end;
    if (m_data.nnz < Long(std::numeric_limits<int>::max())) {
        // Common case: nnz fits in int, use the cheaper 32-bit-index scan.
        auto const* pcol = m_data.col_index.data();
        local_nnz = Scan::PrefixSum<Long>(int(all_nnz),
            [=] AMREX_GPU_DEVICE (int i) -> Long {
                return (pcol[i] >= row_begin &&
                        pcol[i] <  row_end); },
            [=] AMREX_GPU_DEVICE (int i, Long const& x) {
                p_pfsum[i] = x; },
            Scan::Type::exclusive, Scan::retSum);
    } else {
        auto const* pcol = m_data.col_index.data();
        local_nnz = Scan::PrefixSum<Long>(all_nnz,
            [=] AMREX_GPU_DEVICE (Long i) -> Long {
                return (pcol[i] >= row_begin &&
                        pcol[i] <  row_end); },
            [=] AMREX_GPU_DEVICE (Long i, Long const& x) {
                p_pfsum[i] = x; },
            Scan::Type::exclusive, Scan::retSum);
    }

    m_data.nnz = local_nnz;
    Long remote_nnz = all_nnz - local_nnz;
    m_data_remote.nnz = remote_nnz;

    if (local_nnz != all_nnz) {
        // Stable partition: local entries keep their relative order in the
        // compacted arrays (index pfsum[i]); remote entries go to i-pfsum[i].
        m_data_remote.mat.resize(remote_nnz);
        m_data_remote.col_index.resize(remote_nnz);
        Vec<T> new_mat(local_nnz);
        Vec<Long> new_col(local_nnz);
        auto const* pmat = m_data.mat.data();
        auto const* pcol = m_data.col_index.data();
        auto* pmat_l = new_mat.data();
        auto* pcol_l = new_col.data();
        auto* pmat_r = m_data_remote.mat.data();
        auto* pcol_r = m_data_remote.col_index.data();
        ParallelFor(all_nnz, [=] AMREX_GPU_DEVICE (Long i)
        {
            auto ps = p_pfsum[i];
            auto local = (pcol[i] >= row_begin &&
                          pcol[i] <  row_end);
            if (local) {
                pmat_l[ps] = pmat[i];
                pcol_l[ps] = pcol[i] - row_begin; // shift the column index to local
            } else {
                pmat_r[i-ps] = pmat[i];
                pcol_r[i-ps] = pcol[i]; // keep global index; remapped later
            }
        });
        // Remap row offsets into the compacted local storage.  The last
        // offset equals all_nnz, which is one past the end of pfsum, so it
        // is set explicitly to local_nnz instead of being looked up.
        auto noffset = Long(m_data.row_offset.size());
        auto* pro = m_data.row_offset.data();
        ParallelFor(noffset, [=] AMREX_GPU_DEVICE (Long i)
        {
            if (i < noffset-1) {
                pro[i] = p_pfsum[pro[i]];
            } else {
                pro[i] = local_nnz;
            }
        });
        // Kernels above read pmat/pcol, which are freed by the swaps below;
        // synchronize before the old storage goes away.
        Gpu::streamSynchronize();
        m_data.mat.swap(new_mat);
        m_data.col_index.swap(new_col);

        // xxxxx TODO: still need to work on m_data_remote
    }

    m_comm_prepared = true;
}

#endif
279
+
152
280
}
153
281
154
282
#endif
0 commit comments