aboutsummaryrefslogtreecommitdiff
path: root/Source/Particles
diff options
context:
space:
mode:
authorGravatar Weiqun Zhang <WeiqunZhang@lbl.gov> 2022-11-18 10:00:12 -0800
committerGravatar GitHub <noreply@github.com> 2022-11-18 18:00:12 +0000
commit2775ac17fc78b3433d313da46a1c81f932e3912e (patch)
treeaab7dca2a5cebbfbcf3e8075eb0704d6f6729f8c /Source/Particles
parent2c00044641882f35c70528b913a8d9efbb0a5336 (diff)
downloadWarpX-2775ac17fc78b3433d313da46a1c81f932e3912e.tar.gz
WarpX-2775ac17fc78b3433d313da46a1c81f932e3912e.tar.zst
WarpX-2775ac17fc78b3433d313da46a1c81f932e3912e.zip
PushPX: GPU kernel optimization (#3402)
* PushPX: GPU kernel optimization The GatherAndPush kernel in the PushPX function has a very low occupancy due to register pressure. There are a number of reasons. By default, we compile with QED module on, even if we do not use it at run time. Another culprit is the GetExternalEB functor that contains 7 Parsers. Again, we have to pay a high runtime cost, even if we do not use it. In this PR, we move some runtime logic out of the GPU kernel to eliminate the unnecessary cost if QED and GetExternalEB are not used at run time. Here are some performance results before this PR. | QED | GetExternalEB | Time | |-----+---------------+------| | On | On | 2.17 | | Off | On | 1.79 | | Off | Commented out | 1.34 | Note that in the tests neither QED nor GetExternalEB is actually used at run time. But the extra cost is very high. With this PR, the kernel time is the same as that when both QED and GetExternalEB are disabled at compile time, even though both options are disabled at run time. More information on the kernels compiled for MI250X. The most expensive variant with both QED and GetExternalEB on has NumSgprs: 108 NumVgprs: 256 NumAgprs: 40 TotalNumVgprs: 296 ScratchSize: 264 Occupancy: 1 The cheapest variant with both QED and GetExternalEB disabled has NumSgprs: 104 NumVgprs: 249 NumAgprs: 0 TotalNumVgprs: 249 ScratchSize: 144 Occupancy: 2 * Fix Comments Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja>
Diffstat (limited to 'Source/Particles')
-rw-r--r--Source/Particles/Gather/GetExternalFields.H6
-rw-r--r--Source/Particles/PhysicalParticleContainer.cpp73
-rw-r--r--Source/Particles/Pusher/PushSelector.H47
3 files changed, 80 insertions, 46 deletions
diff --git a/Source/Particles/Gather/GetExternalFields.H b/Source/Particles/Gather/GetExternalFields.H
index 92f1a21bc..ff107dce5 100644
--- a/Source/Particles/Gather/GetExternalFields.H
+++ b/Source/Particles/Gather/GetExternalFields.H
@@ -13,13 +13,12 @@
#include <AMReX_Parser.H>
#include <AMReX_REAL.H>
-enum ExternalFieldInitType { None, Constant, Parser, RepeatedPlasmaLens, Unknown };
-
/** \brief Functor class that assigns external
* field values (E and B) to particles.
*/
struct GetExternalEBField
{
+ enum ExternalFieldInitType { None, Constant, Parser, RepeatedPlasmaLens, Unknown };
GetExternalEBField () = default;
@@ -56,6 +55,9 @@ struct GetExternalEBField
const amrex::ParticleReal* AMREX_RESTRICT m_uz = nullptr;
AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+ bool isNoOp () const { return (m_Etype == None && m_Btype == None); }
+
+ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
void operator () (long i,
amrex::ParticleReal& field_Ex,
amrex::ParticleReal& field_Ey,
diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp
index f37ea5ea3..62d4df594 100644
--- a/Source/Particles/PhysicalParticleContainer.cpp
+++ b/Source/Particles/PhysicalParticleContainer.cpp
@@ -2624,7 +2624,24 @@ PhysicalParticleContainer::PushPX (WarpXParIter& pti,
const auto t_do_not_gather = do_not_gather;
- amrex::ParallelFor( np_to_push, [=] AMREX_GPU_DEVICE (long ip)
+ enum exteb_flags : int { no_exteb, has_exteb };
+ enum qed_flags : int { no_qed, has_qed };
+
+ int exteb_runtime_flag = getExternalEB.isNoOp() ? no_exteb : has_exteb;
+#ifdef WARPX_QED
+ int qed_runtime_flag = (local_has_quantum_sync || do_sync) ? has_qed : no_qed;
+#else
+ int qed_runtime_flag = no_qed;
+#endif
+
+ // Using this version of ParallelFor with compile time options
+ // improves performance when qed or external EB are not used by reducing
+ // register pressure.
+ amrex::ParallelFor(TypeList<CompileTimeOptions<no_exteb,has_exteb>,
+ CompileTimeOptions<no_qed ,has_qed>>{},
+ {exteb_runtime_flag, qed_runtime_flag},
+ np_to_push, [=] AMREX_GPU_DEVICE (long ip, auto exteb_control,
+ [[maybe_unused]] auto qed_control)
{
amrex::ParticleReal xp, yp, zp;
getPosition(ip, xp, yp, zp);
@@ -2650,30 +2667,54 @@ PhysicalParticleContainer::PushPX (WarpXParIter& pti,
dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes,
nox, galerkin_interpolation);
}
- // Externally applied E and B-field in Cartesian co-ordinates
- getExternalEB(ip, Exp, Eyp, Ezp, Bxp, Byp, Bzp);
+
+ auto const& externeb_fn = getExternalEB; // Have to do this for nvcc
+ if constexpr (exteb_control == has_exteb) {
+ externeb_fn(ip, Exp, Eyp, Ezp, Bxp, Byp, Bzp);
+ }
scaleFields(xp, yp, zp, Exp, Eyp, Ezp, Bxp, Byp, Bzp);
- doParticlePush(getPosition, setPosition, copyAttribs, ip,
- ux[ip], uy[ip], uz[ip],
- Exp, Eyp, Ezp, Bxp, Byp, Bzp,
- ion_lev ? ion_lev[ip] : 0,
- m, q, pusher_algo, do_crr, do_copy,
#ifdef WARPX_QED
- do_sync,
- t_chi_max,
+ if (!do_sync)
#endif
- dt);
-
+ {
+ doParticlePush<0>(getPosition, setPosition, copyAttribs, ip,
+ ux[ip], uy[ip], uz[ip],
+ Exp, Eyp, Ezp, Bxp, Byp, Bzp,
+ ion_lev ? ion_lev[ip] : 0,
+ m, q, pusher_algo, do_crr, do_copy,
#ifdef WARPX_QED
- if (local_has_quantum_sync) {
- evolve_opt(ux[ip], uy[ip], uz[ip],
- Exp, Eyp, Ezp,Bxp, Byp, Bzp,
- dt, p_optical_depth_QSR[ip]);
+ t_chi_max,
+#endif
+ dt);
+ }
+#ifdef WARPX_QED
+ else {
+ if constexpr (qed_control == has_qed) {
+ doParticlePush<1>(getPosition, setPosition, copyAttribs, ip,
+ ux[ip], uy[ip], uz[ip],
+ Exp, Eyp, Ezp, Bxp, Byp, Bzp,
+ ion_lev ? ion_lev[ip] : 0,
+ m, q, pusher_algo, do_crr, do_copy,
+ t_chi_max,
+ dt);
+ }
}
#endif
+#ifdef WARPX_QED
+ auto foo_local_has_quantum_sync = local_has_quantum_sync;
+ auto foo_podq = p_optical_depth_QSR;
+ auto& evolve_opt_fn = evolve_opt; // have to do all these for nvcc
+ if constexpr (qed_control == has_qed) {
+ if (foo_local_has_quantum_sync) {
+ evolve_opt_fn(ux[ip], uy[ip], uz[ip],
+ Exp, Eyp, Ezp,Bxp, Byp, Bzp,
+ dt, foo_podq[ip]);
+ }
+ }
+#endif
});
}
diff --git a/Source/Particles/Pusher/PushSelector.H b/Source/Particles/Pusher/PushSelector.H
index ed439b4b3..a56dda2b9 100644
--- a/Source/Particles/Pusher/PushSelector.H
+++ b/Source/Particles/Pusher/PushSelector.H
@@ -23,6 +23,7 @@
/**
* \brief Push position and momentum for a single particle
*
+ * \tparam do_sync Whether to include quantum synchrotron radiation (QSR)
* \param GetPosition A functor for returning the particle position.
* \param SetPosition A functor for setting the particle position.
* \param copyAttribs A functor for storing the old u and x
@@ -36,10 +37,11 @@
* \param pusher_algo 0: Boris, 1: Vay, 2: HigueraCary
* \param do_crr Whether to do the classical radiation reaction
* \param do_copy Whether to copy the old x and u for the BTD
- * \param do_sync Whether to include quantum synchrotron radiation (QSR)
* \param t_chi_max Cutoff chi for QSR
* \param dt Time step size
*/
+
+template <int do_sync>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void doParticlePush(const GetParticlePosition& GetPosition,
const SetParticlePosition& SetPosition,
@@ -56,60 +58,53 @@ void doParticlePush(const GetParticlePosition& GetPosition,
const amrex::ParticleReal Bz,
const int ion_lev,
const amrex::ParticleReal m,
- const amrex::ParticleReal q,
+ const amrex::ParticleReal a_q,
const int pusher_algo,
const int do_crr,
const int do_copy,
#ifdef WARPX_QED
- const int do_sync,
const amrex::Real t_chi_max,
#endif
const amrex::Real dt)
{
+ amrex::ParticleReal qp = a_q;
+ if (ion_lev) { qp *= ion_lev; }
+
if (do_copy) copyAttribs(i);
if (do_crr) {
#ifdef WARPX_QED
- if (do_sync) {
+ amrex::ignore_unused(t_chi_max);
+ if constexpr (do_sync) {
auto chi = QedUtils::chi_ele_pos(m*ux, m*uy, m*uz,
Ex, Ey, Ez,
Bx, By, Bz);
if (chi < t_chi_max) {
UpdateMomentumBorisWithRadiationReaction(ux, uy, uz,
Ex, Ey, Ez, Bx,
- By, Bz, q, m, dt);
+ By, Bz, qp, m, dt);
}
else {
UpdateMomentumBoris( ux, uy, uz,
Ex, Ey, Ez, Bx,
- By, Bz, q, m, dt);
+ By, Bz, qp, m, dt);
}
amrex::ParticleReal x, y, z;
GetPosition(i, x, y, z);
UpdatePosition(x, y, z, ux, uy, uz, dt );
SetPosition(i, x, y, z);
- } else {
+ } else
+#endif
+ {
+
UpdateMomentumBorisWithRadiationReaction(ux, uy, uz,
Ex, Ey, Ez, Bx,
- By, Bz, q, m, dt);
+ By, Bz, qp, m, dt);
amrex::ParticleReal x, y, z;
GetPosition(i, x, y, z);
UpdatePosition(x, y, z, ux, uy, uz, dt );
SetPosition(i, x, y, z);
}
-#else
- amrex::ParticleReal qp = q;
- if (ion_lev) { qp *= ion_lev; }
- UpdateMomentumBorisWithRadiationReaction(ux, uy, uz,
- Ex, Ey, Ez, Bx,
- By, Bz, qp, m, dt);
- amrex::ParticleReal x, y, z;
- GetPosition(i, x, y, z);
- UpdatePosition(x, y, z, ux, uy, uz, dt );
- SetPosition(i, x, y, z);
-#endif
} else if (pusher_algo == ParticlePusherAlgo::Boris) {
- amrex::ParticleReal qp = q;
- if (ion_lev) { qp *= ion_lev; }
UpdateMomentumBoris( ux, uy, uz,
Ex, Ey, Ez, Bx,
By, Bz, qp, m, dt);
@@ -118,8 +113,6 @@ void doParticlePush(const GetParticlePosition& GetPosition,
UpdatePosition(x, y, z, ux, uy, uz, dt );
SetPosition(i, x, y, z);
} else if (pusher_algo == ParticlePusherAlgo::Vay) {
- amrex::ParticleReal qp = q;
- if (ion_lev){ qp *= ion_lev; }
UpdateMomentumVay( ux, uy, uz,
Ex, Ey, Ez, Bx,
By, Bz, qp, m, dt);
@@ -128,8 +121,6 @@ void doParticlePush(const GetParticlePosition& GetPosition,
UpdatePosition(x, y, z, ux, uy, uz, dt );
SetPosition(i, x, y, z);
} else if (pusher_algo == ParticlePusherAlgo::HigueraCary) {
- amrex::ParticleReal qp = q;
- if (ion_lev){ qp *= ion_lev; }
UpdateMomentumHigueraCary( ux, uy, uz,
Ex, Ey, Ez, Bx,
By, Bz, qp, m, dt);
@@ -137,9 +128,9 @@ void doParticlePush(const GetParticlePosition& GetPosition,
GetPosition(i, x, y, z);
UpdatePosition(x, y, z, ux, uy, uz, dt );
SetPosition(i, x, y, z);
- } else {
- amrex::Abort("Unknown particle pusher");
- }
+ } //else {
+// amrex::Abort("Unknown particle pusher");
+// }
}
#endif // WARPX_PARTICLES_PUSHER_SELECTOR_H_