From 21293049f7352c0611de578186bc149d49096233 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Wed, 26 Feb 2025 19:54:55 +0100 Subject: [PATCH] Add a way to reuse TagParallelFor Tags --- Src/Base/AMReX_FBI.H | 8 +- Src/Base/AMReX_TagParallelFor.H | 297 ++++++++++++++++++++++++-------- 2 files changed, 232 insertions(+), 73 deletions(-) diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index e20f8bb4b79..2b5b4274766 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -53,7 +53,9 @@ void fab_to_fab (Vector > const& copy_tags, int scomp, int dcomp, int ncomp, F && f) { - detail::ParallelFor_doit(copy_tags, + TagVector> tv{copy_tags}; + + detail::ParallelFor_doit(tv, [=] AMREX_GPU_DEVICE ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& /*item*/, @@ -85,7 +87,9 @@ fab_to_fab (Vector > const& copy_tags, int scomp, int dcom amrex::Abort("xxxxx TODO This function still has a bug. Even if we fix the bug, it should still be avoided because it is slow due to the lack of atomic operations for this type."); - detail::ParallelFor_doit(tags, + TagVector tv{tags}; + + detail::ParallelFor_doit(tv, [=] AMREX_GPU_DEVICE ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& item, diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H index 63bc127bba4..a4a4faadc52 100644 --- a/Src/Base/AMReX_TagParallelFor.H +++ b/Src/Base/AMReX_TagParallelFor.H @@ -101,31 +101,153 @@ struct VectorTag { Long size () const noexcept { return m_size; } }; -#ifdef AMREX_USE_GPU - namespace detail { -template -std::enable_if_t().box())>, Box>::value, - Long> -get_tag_size (T const& tag) noexcept -{ - AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits::max())); - return static_cast(tag.box().numPts()); -} + template + std::enable_if_t().box())>, Box>, Long> + get_tag_size (T const& tag) noexcept + { + AMREX_ASSERT(tag.box().numPts() < Long(std::numeric_limits::max())); + return static_cast(tag.box().numPts()); + } + + template + std::enable_if_t().size())> >, Long> + get_tag_size (T const& tag) noexcept + { + AMREX_ASSERT(tag.size() < Long(std::numeric_limits::max())); + return tag.size(); + } + + template + constexpr + std::enable_if_t().box())>, Box>, bool> + is_box_tag (T const&) { return true; } + + template + constexpr + std::enable_if_t().size())> >, bool> + is_box_tag (T const&) { return false; } -template -std::enable_if_t().size())> >::value, - Long> -get_tag_size (T const& tag) noexcept -{ - AMREX_ASSERT(tag.size() < Long(std::numeric_limits::max())); - return tag.size(); } +template +struct TagVector { + + char* h_buffer = nullptr; + char* d_buffer = nullptr; + TagType* d_tags = nullptr; + int* d_nwarps = nullptr; + int ntags = 0; + int ntotwarps = 0; + int nblocks = 0; + bool defined = false; + static constexpr int nthreads = 256; + + TagVector () = default; + + TagVector (Vector const& tags) { + define(tags); + } + + ~TagVector () { + if (defined) { + undefine(); + } + } + + TagVector (const TagVector& other) = delete; + TagVector (TagVector&& other) = default; + TagVector& operator= (const TagVector& other) = delete; + TagVector& operator= (TagVector&& other) = default; + + [[nodiscard]] bool is_defined () const { return defined; } + + void define (Vector const& tags) { + if (defined) { + undefine(); + } + + ntags = tags.size(); + if (ntags == 0) { + defined = true; + return; + } + +#ifdef AMREX_USE_GPU + Long l_ntotwarps = 0; + ntotwarps = 0; + Vector nwarps; + nwarps.reserve(ntags+1); + for (int i = 0; i < ntags; ++i) + { + auto& tag = tags[i]; + nwarps.push_back(ntotwarps); + auto nw = (detail::get_tag_size(tag) + Gpu::Device::warp_size-1) / + Gpu::Device::warp_size; + l_ntotwarps += nw; + ntotwarps += static_cast(nw); + } + nwarps.push_back(ntotwarps); + + std::size_t sizeof_tags = ntags*sizeof(TagType); + std::size_t offset_nwarps = Arena::align(sizeof_tags); + std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); + std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + + h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); + d_buffer = (char*)The_Arena()->alloc(total_buf_size); + + std::memcpy(h_buffer, tags.data(), sizeof_tags); + std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); + Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); + + d_tags = reinterpret_cast(d_buffer); + d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); + + constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; + nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; + + defined = true; + + amrex::ignore_unused(l_ntotwarps); + AMREX_ALWAYS_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits::max())); +#else + std::size_t sizeof_tags = ntags*sizeof(TagType); + h_buffer = (char*)The_Pinned_Arena()->alloc(sizeof_tags); + + std::memcpy(h_buffer, tags.data(), sizeof_tags); + + d_tags = reinterpret_cast(h_buffer); + + defined = true; +#endif + } + + void undefine () { + if (defined) { + Gpu::streamSynchronize(); + The_Pinned_Arena()->free(h_buffer); + The_Arena()->free(d_buffer); + h_buffer = nullptr; + d_buffer = nullptr; + d_tags = nullptr; + d_nwarps = nullptr; + ntags = 0; + ntotwarps = 0; + nblocks = 0; + defined = false; + } + } +}; + +namespace detail { + +#ifdef AMREX_USE_GPU + template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -std::enable_if_t().box())>, Box>::value, void> +std::enable_if_t().box())>, Box>, void> tagparfor_call_f ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& item, @@ -150,7 +272,7 @@ tagparfor_call_f ( template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -std::enable_if_t().size())> >::value, void> +std::enable_if_t().size())> >, void> tagparfor_call_f ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& item, @@ -167,48 +289,19 @@ tagparfor_call_f ( template void -ParallelFor_doit (Vector const& tags, F && f) +ParallelFor_doit (TagVector const& tv, F const& f) { - const int ntags = tags.size(); - if (ntags == 0) { return; } - - Long l_ntotwarps = 0; - int ntotwarps = 0; - Vector nwarps; - nwarps.reserve(ntags+1); - for (int i = 0; i < ntags; ++i) - { - auto& tag = tags[i]; - nwarps.push_back(ntotwarps); - auto nw = (get_tag_size(tag) + Gpu::Device::warp_size-1) / Gpu::Device::warp_size; - l_ntotwarps += nw; - ntotwarps += static_cast(nw); - } - nwarps.push_back(ntotwarps); + AMREX_ALWAYS_ASSERT(tv.is_defined()); - std::size_t sizeof_tags = ntags*sizeof(TagType); - std::size_t offset_nwarps = Arena::align(sizeof_tags); - std::size_t sizeof_nwarps = (ntags+1)*sizeof(int); - std::size_t total_buf_size = offset_nwarps + sizeof_nwarps; + if (tv.ntags == 0) { return; } - char* h_buffer = (char*)The_Pinned_Arena()->alloc(total_buf_size); - char* d_buffer = (char*)The_Arena()->alloc(total_buf_size); + const auto d_tags = tv.d_tags; + const auto d_nwarps = tv.d_nwarps; + const auto ntags = tv.ntags; + const auto ntotwarps = tv.ntotwarps; + constexpr auto nthreads = TagVector::nthreads; - std::memcpy(h_buffer, tags.data(), sizeof_tags); - std::memcpy(h_buffer+offset_nwarps, nwarps.data(), sizeof_nwarps); - Gpu::htod_memcpy_async(d_buffer, h_buffer, total_buf_size); - - auto d_tags = reinterpret_cast(d_buffer); - auto d_nwarps = reinterpret_cast(d_buffer+offset_nwarps); - - constexpr int nthreads = 256; - constexpr int nwarps_per_block = nthreads/Gpu::Device::warp_size; - int nblocks = (ntotwarps + nwarps_per_block-1) / nwarps_per_block; - - amrex::ignore_unused(l_ntotwarps); - AMREX_ASSERT(l_ntotwarps+nwarps_per_block-1 < Long(std::numeric_limits::max())); - - amrex::launch(nblocks, nthreads, Gpu::gpuStream(), + amrex::launch(tv.nblocks, nthreads, Gpu::gpuStream(), #ifdef AMREX_USE_SYCL [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept [[sycl::reqd_work_group_size(nthreads)]] @@ -241,20 +334,60 @@ ParallelFor_doit (Vector const& tags, F && f) tagparfor_call_f( icell, d_tags[tag_id], f); #endif }); +} + +#else // ifdef AMREX_USE_GPU - Gpu::streamSynchronize(); - The_Pinned_Arena()->free(h_buffer); - The_Arena()->free(d_buffer); +template +void +ParallelFor_doit (TagVector const& tv, F const& f) +{ + AMREX_ALWAYS_ASSERT(tv.is_defined()); + + if (tv.ntags == 0) { return; } + + const auto d_tags = tv.d_tags; + const auto ntags = tv.ntags; + +#ifdef AMREX_USE_OMP +#pragma omp parallel +#endif + for (int itag = 0; itag < ntags; ++itag) { + + const auto& t = d_tags[itag]; + + if constexpr (is_box_tag(t)) { + const auto lo = amrex::lbound(t.box()); + const auto hi = amrex::ubound(t.box()); + + for (int k = lo.z; k <= hi.z; ++k) { + for (int j = lo.y; j <= hi.y; ++j) { + AMREX_PRAGMA_SIMD + for (int i = lo.x; i <= hi.x; ++i) { + f(0, 1, i, j, k, t); + } + } + } + } else { + const auto size = t.size(); + + AMREX_PRAGMA_SIMD + for (int i = 0; i < size; ++i) { + f(i, size, t); + } + } + } } +#endif + } template -std::enable_if_t().box())>, - Box>::value> -ParallelFor (Vector const& tags, int ncomp, F && f) +std::enable_if_t().box())>, Box>> +ParallelFor (TagVector const& tv, int ncomp, F const& f) { - detail::ParallelFor_doit(tags, + detail::ParallelFor_doit(tv, [=] AMREX_GPU_DEVICE ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& /*item*/, @@ -270,10 +403,10 @@ ParallelFor (Vector const& tags, int ncomp, F && f) } template -std::enable_if_t().box())>, Box>::value, void> -ParallelFor (Vector const& tags, F && f) +std::enable_if_t().box())>, Box>, void> +ParallelFor (TagVector const& tv, F const& f) { - detail::ParallelFor_doit(tags, + detail::ParallelFor_doit(tv, [=] AMREX_GPU_DEVICE ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& /*item*/, @@ -287,10 +420,10 @@ ParallelFor (Vector const& tags, F && f) } template -std::enable_if_t().size())> >::value, void> -ParallelFor (Vector const& tags, F && f) +std::enable_if_t().size())> >, void> +ParallelFor (TagVector const& tv, F const& f) { - detail::ParallelFor_doit(tags, + detail::ParallelFor_doit(tv, [=] AMREX_GPU_DEVICE ( #ifdef AMREX_USE_SYCL sycl::nd_item<1> const& /*item*/, @@ -303,7 +436,29 @@ ParallelFor (Vector const& tags, F && f) }); } -#endif +template +std::enable_if_t().box())>, Box>> +ParallelFor (Vector const& tags, int ncomp, F && f) +{ + TagVector tv{tags}; + ParallelFor(tv, ncomp, std::forward(f)); +} + +template +std::enable_if_t().box())>, Box>, void> +ParallelFor (Vector const& tags, F && f) +{ + TagVector tv{tags}; + ParallelFor(tv, std::forward(f)); +} + +template +std::enable_if_t().size())> >, void> +ParallelFor (Vector const& tags, F && f) +{ + TagVector tv{tags}; + ParallelFor(tv, std::forward(f)); +} }