diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H index ca51cd592e..58620193af 100644 --- a/Src/Base/AMReX_TagParallelFor.H +++ b/Src/Base/AMReX_TagParallelFor.H @@ -121,6 +121,18 @@ namespace detail { return tag.size(); } + template + constexpr + std::enable_if_t().box())>, Box>::value, + bool> + is_box_tag (T const&) { return true; } + + template + constexpr + std::enable_if_t().size())> >::value, + bool> + is_box_tag (T const&) { return false; } + } template @@ -166,6 +178,7 @@ struct TagVector { return; } +#ifdef AMREX_USE_GPU Long l_ntotwarps = 0; ntotwarps = 0; Vector nwarps; @@ -203,6 +216,16 @@ struct TagVector { amrex::ignore_unused(l_ntotwarps); AMREX_ALWAYS_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits::max())); +#else + std::size_t sizeof_tags = ntags*sizeof(TagType); + h_buffer = (char*)The_Pinned_Arena()->alloc(sizeof_tags); + + std::memcpy(h_buffer, tags.data(), sizeof_tags); + + d_tags = reinterpret_cast(h_buffer); + + defined = true; +#endif } void undefine () { @@ -222,10 +245,10 @@ struct TagVector { } }; -#ifdef AMREX_USE_GPU - namespace detail { +#ifdef AMREX_USE_GPU + template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::enable_if_t().box())>, Box>::value, void> @@ -276,10 +299,10 @@ ParallelFor_doit (TagVector const& tv, F && f) if (tv.ntags == 0) { return; } - auto d_tags = tv.d_tags; - auto d_nwarps = tv.d_nwarps; - auto ntags = tv.ntags; - auto ntotwarps = tv.ntotwarps; + const auto d_tags = tv.d_tags; + const auto d_nwarps = tv.d_nwarps; + const auto ntags = tv.ntags; + const auto ntotwarps = tv.ntotwarps; constexpr auto nthreads = tv.nthreads; amrex::launch(tv.nblocks, nthreads, Gpu::gpuStream(), @@ -317,6 +340,51 @@ ParallelFor_doit (TagVector const& tv, F && f) }); } +#else // ifdef AMREX_USE_GPU + +template +void +ParallelFor_doit (TagVector const& tv, F &&) +{ + AMREX_ALWAYS_ASSERT(tv.is_defined()); + + if (tv.ntags == 0) { return; } + + const auto d_tags = tv.d_tags; + const auto ntags = tv.ntags; + +#ifdef AMREX_USE_OMP +#pragma omp parallel +#endif + for (int itag = 0; itag < ntags; ++itag) { + + const auto& t = d_tags[itag]; + + if constexpr (is_box_tag(t)) { + const auto lo = amrex::lbound(t.box()); + const auto hi = amrex::ubound(t.box()); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + f(0, 1, i, j, k, t); + } + } + } + } else { + const auto size = t.size(); + + AMREX_PRAGMA_SIMD + for (int i = 0; i < size; ++i) { + f(i, size, t); + } + } + } +} + +#endif + } template @@ -379,20 +447,7 @@ std::enable_if_t().box( ParallelFor (Vector const& tags, int ncomp, F && f) { TagVector tv{tags}; - - detail::ParallelFor_doit(tv, - [=] AMREX_GPU_DEVICE ( -#ifdef AMREX_USE_SYCL - sycl::nd_item<1> const& /*item*/, -#endif - int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept - { - if (icell < ncells) { - for (int n = 0; n < ncomp; ++n) { - f(i,j,k,n,tag); - } - } - }); + ParallelFor(tv, ncomp, f); } template @@ -400,18 +455,7 @@ std::enable_if_t().box( ParallelFor (Vector const& tags, F && f) { TagVector tv{tags}; - - detail::ParallelFor_doit(tv, - [=] AMREX_GPU_DEVICE ( -#ifdef AMREX_USE_SYCL - sycl::nd_item<1> const& /*item*/, -#endif - int icell, int ncells, int i, int j, int k, TagType const& tag) noexcept - { - if (icell < ncells) { - f(i,j,k,tag); - } - }); + ParallelFor(tv, f); } template @@ -419,22 +463,9 @@ std::enable_if_t(). ParallelFor (Vector const& tags, F && f) { TagVector tv{tags}; - - detail::ParallelFor_doit(tv, - [=] AMREX_GPU_DEVICE ( -#ifdef AMREX_USE_SYCL - sycl::nd_item<1> const& /*item*/, -#endif - int icell, int ncells, TagType const& tag) noexcept - { - if (icell < ncells) { - f(icell,tag); - } - }); + ParallelFor(tv, f); } -#endif - } #endif