about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Weiqun Zhang <WeiqunZhang@lbl.gov> 2022-12-21 14:09:26 -0800
committer: GitHub <noreply@github.com> 2022-12-21 23:09:26 +0100
commit: 9cb604ec271dcc37a3609bbad967a27b10982f14 (patch)
tree: 92729c826442367c00ea715ee9d3b8d1d490c9ae
parent: dc23e049ddfead001bd78004da929b192c11b012 (diff)
download: WarpX-9cb604ec271dcc37a3609bbad967a27b10982f14.tar.gz
WarpX-9cb604ec271dcc37a3609bbad967a27b10982f14.tar.zst
WarpX-9cb604ec271dcc37a3609bbad967a27b10982f14.zip
Avoid touching device memory on host in FieldProbe (#3579)
The implementation here can be optimized by using shared memory. However, instead of doing it here, I am going to add a helper function to AMReX. Also removed an unnecessary synchronization, because there is a synchronization in the destructor of the particle iterator.
-rw-r--r-- Source/Diagnostics/ReducedDiags/FieldProbe.cpp 39
1 file changed, 23 insertions, 16 deletions
diff --git a/Source/Diagnostics/ReducedDiags/FieldProbe.cpp b/Source/Diagnostics/ReducedDiags/FieldProbe.cpp
index d6621f789..073116627 100644
--- a/Source/Diagnostics/ReducedDiags/FieldProbe.cpp
+++ b/Source/Diagnostics/ReducedDiags/FieldProbe.cpp
@@ -570,31 +570,38 @@ void FieldProbe::ComputeDiags (int step)
});// ParallelFor Close
// this check is here because for m_field_probe_integrate == True, we always compute
// but we only write when we truly are in an output interval step
- if (m_intervals.contains(step+1))
+ if (m_intervals.contains(step+1) && np > 0)
{
- for (auto ip=0; ip < np; ip++)
+ // This could be optimized by using shared memory.
+ amrex::Gpu::DeviceVector<amrex::Real> dv(np*noutputs);
+ amrex::Real* dvp = dv.data();
+ amrex::ParallelFor(np, [=] AMREX_GPU_DEVICE (long ip)
{
amrex::ParticleReal xp, yp, zp;
getPosition(ip, xp, yp, zp);
-
- // push to output vector
- m_data.push_back(xp);
- m_data.push_back(yp);
- m_data.push_back(zp);
- m_data.push_back(part_Ex[ip]);
- m_data.push_back(part_Ey[ip]);
- m_data.push_back(part_Ez[ip]);
- m_data.push_back(part_Bx[ip]);
- m_data.push_back(part_By[ip]);
- m_data.push_back(part_Bz[ip]);
- m_data.push_back(part_S[ip]);
- }
+ long idx = ip*noutputs;
+ dvp[idx++] = xp;
+ dvp[idx++] = yp;
+ dvp[idx++] = zp;
+ dvp[idx++] = part_Ex[ip];
+ dvp[idx++] = part_Ey[ip];
+ dvp[idx++] = part_Ez[ip];
+ dvp[idx++] = part_Bx[ip];
+ dvp[idx++] = part_By[ip];
+ dvp[idx++] = part_Bz[ip];
+ dvp[idx++] = part_S[ip];
+ });
+ auto oldsize = m_data.size();
+ m_data.resize(oldsize + dv.size());
+ amrex::Gpu::copyAsync(amrex::Gpu::deviceToHost,
+ dv.begin(), dv.end(), &m_data[oldsize]);
+ Gpu::streamSynchronize();
/* m_data now contains up-to-date values for:
* [x, y, z, Ex, Ey, Ez, Bx, By, Bz, and S] */
}
}
} // end particle iterator loop
- Gpu::synchronize();
+
if (m_intervals.contains(step+1))
{
// returns total number of mpi notes into mpisize