From f692e78ac94a39bb5a5e40c5019dac08d702f6cc Mon Sep 17 00:00:00 2001 From: AlexanderSinn <64009254+AlexanderSinn@users.noreply.github.com> Date: Thu, 15 Feb 2024 01:06:28 +0100 Subject: [PATCH] Only do a htod memcpy in getParticleTileData when necessary (#3760) ## Summary Previously a htod memcpy was done every time `getParticleTileData()` or `getConstParticleTileData()` was called if runtime components were allocated. Now it will only be done if the runtime component pointers have changed, for example after resize or after ReorderParticles. ## Additional background ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --- Src/Particle/AMReX_ParticleTile.H | 110 +++++++++++++++++------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/Src/Particle/AMReX_ParticleTile.H b/Src/Particle/AMReX_ParticleTile.H index c072680e61d..142133c2756 100644 --- a/Src/Particle/AMReX_ParticleTile.H +++ b/Src/Particle/AMReX_ParticleTile.H @@ -1109,35 +1109,41 @@ struct ParticleTile ParticleTileDataType getParticleTileData () { - int index = NArrayReal; + m_runtime_r_ptrs.resize(m_soa_tile.NumRealComps() - NArrayReal); + m_runtime_i_ptrs.resize(m_soa_tile.NumIntComps() - NArrayInt); #ifdef AMREX_USE_GPU - Gpu::HostVector h_runtime_r_ptrs(m_runtime_r_ptrs.size()); - for (auto& r_ptr : h_runtime_r_ptrs) { - r_ptr = m_soa_tile.GetRealData(index++).dataPtr(); - } - if (h_runtime_r_ptrs.size() > 0) { - Gpu::htod_memcpy_async(m_runtime_r_ptrs.data(), h_runtime_r_ptrs.data(), - h_runtime_r_ptrs.size()*sizeof(ParticleReal*)); + bool copy_real = false; + m_h_runtime_r_ptrs.resize(m_soa_tile.NumRealComps() - NArrayReal); + for (std::size_t i = 0; i < m_h_runtime_r_ptrs.size(); ++i) { + if 
(m_h_runtime_r_ptrs[i] != m_soa_tile.GetRealData(i + NArrayReal).dataPtr()) { + m_h_runtime_r_ptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr(); + copy_real = true; + } } -#else - for (auto& r_ptr : m_runtime_r_ptrs) { - r_ptr = m_soa_tile.GetRealData(index++).dataPtr(); + if (copy_real) { + Gpu::htod_memcpy_async(m_runtime_r_ptrs.data(), m_h_runtime_r_ptrs.data(), + m_h_runtime_r_ptrs.size()*sizeof(ParticleReal*)); } -#endif - index = NArrayInt; -#ifdef AMREX_USE_GPU - Gpu::HostVector h_runtime_i_ptrs(m_runtime_i_ptrs.size()); - for (auto& i_ptr : h_runtime_i_ptrs) { - i_ptr = m_soa_tile.GetIntData(index++).dataPtr(); + bool copy_int = false; + m_h_runtime_i_ptrs.resize(m_soa_tile.NumIntComps() - NArrayInt); + for (std::size_t i = 0; i < m_h_runtime_i_ptrs.size(); ++i) { + if (m_h_runtime_i_ptrs[i] != m_soa_tile.GetIntData(i + NArrayInt).dataPtr()) { + m_h_runtime_i_ptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr(); + copy_int = true; + } } - if (h_runtime_i_ptrs.size() > 0) { - Gpu::htod_memcpy_async(m_runtime_i_ptrs.data(), h_runtime_i_ptrs.data(), - h_runtime_i_ptrs.size()*sizeof(int*)); + if (copy_int) { + Gpu::htod_memcpy_async(m_runtime_i_ptrs.data(), m_h_runtime_i_ptrs.data(), + m_h_runtime_i_ptrs.size()*sizeof(int*)); } #else - for (auto& i_ptr : m_runtime_i_ptrs) { - i_ptr = m_soa_tile.GetIntData(index++).dataPtr(); + for (std::size_t i = 0; i < m_runtime_r_ptrs.size(); ++i) { + m_runtime_r_ptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr(); + } + + for (std::size_t i = 0; i < m_runtime_i_ptrs.size(); ++i) { + m_runtime_i_ptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr(); } #endif @@ -1169,7 +1175,7 @@ struct ParticleTile ptd.m_runtime_idata = m_runtime_i_ptrs.dataPtr(); #ifdef AMREX_USE_GPU - if ((h_runtime_r_ptrs.size() > 0) || (h_runtime_i_ptrs.size() > 0)) { + if (copy_real || copy_int) { Gpu::streamSynchronize(); } #endif @@ -1179,35 +1185,41 @@ struct ParticleTile ConstParticleTileDataType getConstParticleTileData () 
const { - int index = NArrayReal; + m_runtime_r_cptrs.resize(m_soa_tile.NumRealComps() - NArrayReal); + m_runtime_i_cptrs.resize(m_soa_tile.NumIntComps() - NArrayInt); #ifdef AMREX_USE_GPU - Gpu::HostVector h_runtime_r_cptrs(m_runtime_r_cptrs.size()); - for (auto& r_ptr : h_runtime_r_cptrs) { - r_ptr = m_soa_tile.GetRealData(index++).dataPtr(); - } - if (h_runtime_r_cptrs.size() > 0) { - Gpu::htod_memcpy_async(m_runtime_r_cptrs.data(), h_runtime_r_cptrs.data(), - h_runtime_r_cptrs.size()*sizeof(ParticleReal const*)); + bool copy_real = false; + m_h_runtime_r_cptrs.resize(m_soa_tile.NumRealComps() - NArrayReal); + for (std::size_t i = 0; i < m_h_runtime_r_cptrs.size(); ++i) { + if (m_h_runtime_r_cptrs[i] != m_soa_tile.GetRealData(i + NArrayReal).dataPtr()) { + m_h_runtime_r_cptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr(); + copy_real = true; + } } -#else - for (auto& r_ptr : m_runtime_r_cptrs) { - r_ptr = m_soa_tile.GetRealData(index++).dataPtr(); + if (copy_real) { + Gpu::htod_memcpy_async(m_runtime_r_cptrs.data(), m_h_runtime_r_cptrs.data(), + m_h_runtime_r_cptrs.size()*sizeof(ParticleReal*)); } -#endif - index = NArrayInt; -#ifdef AMREX_USE_GPU - Gpu::HostVector h_runtime_i_cptrs(m_runtime_i_cptrs.size()); - for (auto& i_ptr : h_runtime_i_cptrs) { - i_ptr = m_soa_tile.GetIntData(index++).dataPtr(); + bool copy_int = false; + m_h_runtime_i_cptrs.resize(m_soa_tile.NumIntComps() - NArrayInt); + for (std::size_t i = 0; i < m_h_runtime_i_cptrs.size(); ++i) { + if (m_h_runtime_i_cptrs[i] != m_soa_tile.GetIntData(i + NArrayInt).dataPtr()) { + m_h_runtime_i_cptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr(); + copy_int = true; + } } - if (h_runtime_i_cptrs.size() > 0) { - Gpu::htod_memcpy_async(m_runtime_i_cptrs.data(), h_runtime_i_cptrs.data(), - h_runtime_i_cptrs.size()*sizeof(int const*)); + if (copy_int) { + Gpu::htod_memcpy_async(m_runtime_i_cptrs.data(), m_h_runtime_i_cptrs.data(), + m_h_runtime_i_cptrs.size()*sizeof(int*)); } #else - for (auto& 
i_ptr : m_runtime_i_cptrs) { - i_ptr = m_soa_tile.GetIntData(index++).dataPtr(); + for (std::size_t i = 0; i < m_runtime_r_cptrs.size(); ++i) { + m_runtime_r_cptrs[i] = m_soa_tile.GetRealData(i + NArrayReal).dataPtr(); + } + + for (std::size_t i = 0; i < m_runtime_i_cptrs.size(); ++i) { + m_runtime_i_cptrs[i] = m_soa_tile.GetIntData(i + NArrayInt).dataPtr(); } #endif @@ -1239,7 +1251,7 @@ struct ParticleTile ptd.m_runtime_idata = m_runtime_i_cptrs.dataPtr(); #ifdef AMREX_USE_GPU - if ((h_runtime_r_cptrs.size() > 0) || (h_runtime_i_cptrs.size() > 0)) { + if (copy_real || copy_int) { Gpu::streamSynchronize(); } #endif @@ -1259,6 +1271,12 @@ private: mutable amrex::PODVector > m_runtime_r_cptrs; mutable amrex::PODVector >m_runtime_i_cptrs; + + amrex::Gpu::HostVector m_h_runtime_r_ptrs; + amrex::Gpu::HostVector m_h_runtime_i_ptrs; + + mutable amrex::Gpu::HostVector m_h_runtime_r_cptrs; + mutable amrex::Gpu::HostVector m_h_runtime_i_cptrs; }; } // namespace amrex