diff options
author | 2022-12-21 14:09:26 -0800 | |
---|---|---|
committer | 2022-12-21 23:09:26 +0100 | |
commit | 9cb604ec271dcc37a3609bbad967a27b10982f14 (patch) | |
tree | 92729c826442367c00ea715ee9d3b8d1d490c9ae | |
parent | dc23e049ddfead001bd78004da929b192c11b012 (diff) | |
download | WarpX-9cb604ec271dcc37a3609bbad967a27b10982f14.tar.gz WarpX-9cb604ec271dcc37a3609bbad967a27b10982f14.tar.zst WarpX-9cb604ec271dcc37a3609bbad967a27b10982f14.zip |
Avoid touching device memory on host in FieldProbe (#3579)
The per-particle packing kernel added here could be optimized by using shared
memory. However, instead of doing that here, I am going to add a helper function
to AMReX. Also removed the unnecessary `Gpu::synchronize()` after the particle
iterator loop, because the destructor of the particle iterator already synchronizes.
-rw-r--r-- | Source/Diagnostics/ReducedDiags/FieldProbe.cpp | 39 |
1 file changed, 23 insertions, 16 deletions
diff --git a/Source/Diagnostics/ReducedDiags/FieldProbe.cpp b/Source/Diagnostics/ReducedDiags/FieldProbe.cpp index d6621f789..073116627 100644 --- a/Source/Diagnostics/ReducedDiags/FieldProbe.cpp +++ b/Source/Diagnostics/ReducedDiags/FieldProbe.cpp @@ -570,31 +570,38 @@ void FieldProbe::ComputeDiags (int step) });// ParallelFor Close // this check is here because for m_field_probe_integrate == True, we always compute // but we only write when we truly are in an output interval step - if (m_intervals.contains(step+1)) + if (m_intervals.contains(step+1) && np > 0) { - for (auto ip=0; ip < np; ip++) + // This could be optimized by using shared memory. + amrex::Gpu::DeviceVector<amrex::Real> dv(np*noutputs); + amrex::Real* dvp = dv.data(); + amrex::ParallelFor(np, [=] AMREX_GPU_DEVICE (long ip) { amrex::ParticleReal xp, yp, zp; getPosition(ip, xp, yp, zp); - - // push to output vector - m_data.push_back(xp); - m_data.push_back(yp); - m_data.push_back(zp); - m_data.push_back(part_Ex[ip]); - m_data.push_back(part_Ey[ip]); - m_data.push_back(part_Ez[ip]); - m_data.push_back(part_Bx[ip]); - m_data.push_back(part_By[ip]); - m_data.push_back(part_Bz[ip]); - m_data.push_back(part_S[ip]); - } + long idx = ip*noutputs; + dvp[idx++] = xp; + dvp[idx++] = yp; + dvp[idx++] = zp; + dvp[idx++] = part_Ex[ip]; + dvp[idx++] = part_Ey[ip]; + dvp[idx++] = part_Ez[ip]; + dvp[idx++] = part_Bx[ip]; + dvp[idx++] = part_By[ip]; + dvp[idx++] = part_Bz[ip]; + dvp[idx++] = part_S[ip]; + }); + auto oldsize = m_data.size(); + m_data.resize(oldsize + dv.size()); + amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost, + dv.begin(), dv.end(), &m_data[oldsize]); + Gpu::streamSynchronize(); /* m_data now contains up-to-date values for: * [x, y, z, Ex, Ey, Ez, Bx, By, Bz, and S] */ } } } // end particle iterator loop - Gpu::synchronize(); + if (m_intervals.contains(step+1)) { // returns total number of mpi notes into mpisize |