Skip to content

Commit

Permalink
Make MFParallelFor safe from int overflow
Browse files Browse the repository at this point in the history
This is continuation of the changes in AMReX-Codes#3742 making AMReX ready for big
kernels.

We also store the number of points in BoxIndexer now because we always need
that number in GPU kernels.
  • Loading branch information
WeiqunZhang committed Feb 15, 2024
1 parent f692e78 commit 7e37eb9
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 32 deletions.
14 changes: 11 additions & 3 deletions Src/Base/AMReX_Box.H
Original file line number Diff line number Diff line change
Expand Up @@ -1842,13 +1842,16 @@ Box makeSingleCellBox (int i, int j, int k, IndexType typ = IndexType::TheCellTy

struct BoxIndexer
{
std::uint64_t npts;

#if (AMREX_SPACEDIM == 3)
Math::FastDivmodU64 fdxy;
Math::FastDivmodU64 fdx;
IntVect lo;

BoxIndexer (Box const& box)
: fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
: npts(box.numPts()),
fdxy(std::uint64_t(box.length(0))*std::uint64_t(box.length(1))),
fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}
Expand Down Expand Up @@ -1877,7 +1880,8 @@ struct BoxIndexer
IntVect lo;

BoxIndexer (Box const& box)
: fdx (std::uint64_t(box.length(0))),
: npts(box.numPts()),
fdx (std::uint64_t(box.length(0))),
lo (box.smallEnd())
{}

Expand All @@ -1902,7 +1906,8 @@ struct BoxIndexer
int lo;

BoxIndexer (Box const& box)
: lo(box.smallEnd(0))
: npts(box.numPts()),
lo(box.smallEnd(0))
{}

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
Expand All @@ -1918,6 +1923,9 @@ struct BoxIndexer
}

#endif

[[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
std::uint64_t numPts () const { return npts; }
};

}
Expand Down
4 changes: 2 additions & 2 deletions Src/Base/AMReX_FabArrayBase.H
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ public:
~ParForInfo ();

std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
Box const* getBoxes () const { return m_boxes; }
BoxIndexer const* getBoxes () const { return m_boxes; }

ParForInfo () = delete;
ParForInfo (ParForInfo const&) = delete;
Expand All @@ -663,7 +663,7 @@ public:
IntVect m_ng;
int m_nthreads;
std::pair<int*,int*> m_nblocks_x;
Box* m_boxes = nullptr;
BoxIndexer* m_boxes = nullptr;
char* m_hp = nullptr;
char* m_dp = nullptr;
};
Expand Down
46 changes: 19 additions & 27 deletions Src/Base/AMReX_MFParallelForG.H
Original file line number Diff line number Diff line change
Expand Up @@ -12,36 +12,33 @@ namespace amrex {
namespace detail {

inline
void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, Box*& pboxes,
void build_par_for_nblocks (char*& a_hp, char*& a_dp, std::pair<int*,int*>& blocks_x, BoxIndexer*& pboxes,
Vector<Box> const& boxes, Vector<Long> const& ncells, int nthreads)
{
if (!ncells.empty()) {
const int nboxes = ncells.size();
const std::size_t nbytes_boxes = amrex::aligned_size(16, (nboxes+1) * sizeof(int));
const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(Box);
const std::size_t nbytes_boxes = amrex::aligned_size(alignof(BoxIndexer), (nboxes+1) * sizeof(int));
const std::size_t nbytes = nbytes_boxes + nboxes*sizeof(BoxIndexer);
a_hp = (char*)The_Pinned_Arena()->alloc(nbytes);
int* hp_blks = (int*)a_hp;
Box* hp_boxes = (Box*)(a_hp + nbytes_boxes);
auto* hp_boxes = (BoxIndexer*)(a_hp + nbytes_boxes);
hp_blks[0] = 0;
Long ntot = 0;
bool same_size = true;
for (int i = 0; i < nboxes; ++i) {
Long nblocks = (ncells[i] + nthreads-1) / nthreads;
AMREX_ASSERT((hp_blks[i]+nblocks) <= Long(std::numeric_limits<int>::max()));
hp_blks[i+1] = hp_blks[i] + static_cast<int>(nblocks);
ntot += nblocks;
same_size = same_size && (ncells[i] == ncells[0]);

new (hp_boxes+i) Box(boxes[i]);
new (hp_boxes+i) BoxIndexer(boxes[i]);
}
amrex::ignore_unused(ntot);
AMREX_ASSERT(static_cast<Long>(hp_blks[nboxes]) == ntot); // no overflow

a_dp = (char*) The_Arena()->alloc(nbytes);
Gpu::htod_memcpy_async(a_dp, a_hp, nbytes);

blocks_x.first = hp_blks;
blocks_x.second = (same_size) ? nullptr : (int*)a_dp;
pboxes = (Box*)(a_dp + nbytes_boxes);
pboxes = (BoxIndexer*)(a_dp + nbytes_boxes);
}
}

Expand Down Expand Up @@ -94,49 +91,44 @@ ParallelFor (MF const& mf, IntVect const& nghost, int ncomp, IntVect const&, boo
const int nblocks = par_for_blocks.first[nboxes];
const int block_0_size = par_for_blocks.first[1];
const int* dp_nblocks = par_for_blocks.second;
const Box* dp_boxes = parforinfo.getBoxes();
const BoxIndexer* dp_boxes = parforinfo.getBoxes();

#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)

amrex::launch_global<MT>
<<<nblocks, MT, 0, Gpu::gpuStream()>>>
([=] AMREX_GPU_DEVICE () noexcept
{
int ibox, icell;
int ibox;
std::uint64_t icell;
if (dp_nblocks) {
ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdx.x));
icell = (blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
icell = std::uint64_t(blockIdx.x-dp_nblocks[ibox])*MT + threadIdx.x;
} else {
ibox = blockIdx.x / block_0_size;
icell = (blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
icell = std::uint64_t(blockIdx.x-ibox*block_0_size)*MT + threadIdx.x;
}

#elif defined(AMREX_USE_SYCL)

amrex::launch<MT>(nblocks, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
{
int ibox, icell;
int ibox;
std::uint64_t icell;
int blockIdxx = item.get_group_linear_id();
int threadIdxx = item.get_local_linear_id();
if (dp_nblocks) {
ibox = amrex::bisect(dp_nblocks, 0, nboxes, static_cast<int>(blockIdxx));
icell = (blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
icell = std::uint64_t(blockIdxx-dp_nblocks[ibox])*MT + threadIdxx;
} else {
ibox = blockIdxx / block_0_size;
icell = (blockIdxx-ibox*block_0_size)*MT + threadIdxx;
icell = std::uint64_t(blockIdxx-ibox*block_0_size)*MT + threadIdxx;
}
#endif
Box const& b = dp_boxes[ibox];
int ncells = b.numPts();
if (icell < ncells) {
const auto len = amrex::length(b);
int k = icell / (len.x*len.y);
int j = (icell - k*(len.x*len.y)) / len.x;
int i = (icell - k*(len.x*len.y)) - j*len.x;
AMREX_D_TERM(i += b.smallEnd(0);,
j += b.smallEnd(1);,
k += b.smallEnd(2);)
BoxIndexer const& b = dp_boxes[ibox];
if (icell < b.numPts()) {
auto [i, j, k] = indexer(icell);
for (int n = 0; n < ncomp; ++n) {
parfor_mf_detail::call_f(f, ibox, i, j, k, n);
}
Expand Down

0 comments on commit 7e37eb9

Please sign in to comment.