diff options
author | 2022-11-18 10:00:12 -0800 | |
---|---|---|
committer | 2022-11-18 18:00:12 +0000 | |
commit | 2775ac17fc78b3433d313da46a1c81f932e3912e (patch) | |
tree | aab7dca2a5cebbfbcf3e8075eb0704d6f6729f8c /Source/Particles/PhysicalParticleContainer.cpp | |
parent | 2c00044641882f35c70528b913a8d9efbb0a5336 (diff) | |
download | WarpX-2775ac17fc78b3433d313da46a1c81f932e3912e.tar.gz WarpX-2775ac17fc78b3433d313da46a1c81f932e3912e.tar.zst WarpX-2775ac17fc78b3433d313da46a1c81f932e3912e.zip |
PushPX: GPU kernel optimization (#3402)
* PushPX: GPU kernel optimization
The GatherAndPush kernel in the PushPX function has a very low occupancy due
to register pressure. There are a number of reasons. By default, we
compile with the QED module on, even if we do not use it at run time. Another
culprit is the GetExternalEB functor that contains 7 Parsers. Again, we
have to pay a high runtime cost, even if we do not use it. In this PR, we
move some runtime logic out of the GPU kernel to eliminate the unnecessary
cost if QED and GetExternalEB are not used at run time.
Here are some performance results before this PR.
| QED | GetExternalEB | Time |
|-----+---------------+------|
| On | On | 2.17 |
| Off | On | 1.79 |
| Off | Commented out | 1.34 |
Note that in the tests neither QED nor GetExternalEB is actually used at run
time. But the extra cost is very high. With this PR, when both options are
disabled at run time, the kernel time is the same as when both QED and
GetExternalEB are disabled at compile time.
More information on the kernels compiled for MI250X. The most expensive
variant with both QED and GetExternalEB on has
NumSgprs: 108
NumVgprs: 256
NumAgprs: 40
TotalNumVgprs: 296
ScratchSize: 264
Occupancy: 1
The cheapest variant with both QED and GetExternalEB disabled has
NumSgprs: 104
NumVgprs: 249
NumAgprs: 0
TotalNumVgprs: 249
ScratchSize: 144
Occupancy: 2
* Fix Comments
Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja>
Diffstat (limited to 'Source/Particles/PhysicalParticleContainer.cpp')
-rw-r--r-- | Source/Particles/PhysicalParticleContainer.cpp | 73 |
1 files changed, 57 insertions, 16 deletions
diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp index f37ea5ea3..62d4df594 100644 --- a/Source/Particles/PhysicalParticleContainer.cpp +++ b/Source/Particles/PhysicalParticleContainer.cpp @@ -2624,7 +2624,24 @@ PhysicalParticleContainer::PushPX (WarpXParIter& pti, const auto t_do_not_gather = do_not_gather; - amrex::ParallelFor( np_to_push, [=] AMREX_GPU_DEVICE (long ip) + enum exteb_flags : int { no_exteb, has_exteb }; + enum qed_flags : int { no_qed, has_qed }; + + int exteb_runtime_flag = getExternalEB.isNoOp() ? no_exteb : has_exteb; +#ifdef WARPX_QED + int qed_runtime_flag = (local_has_quantum_sync || do_sync) ? has_qed : no_qed; +#else + int qed_runtime_flag = no_qed; +#endif + + // Using this version of ParallelFor with compile time options + // improves performance when qed or external EB are not used by reducing + // register pressure. + amrex::ParallelFor(TypeList<CompileTimeOptions<no_exteb,has_exteb>, + CompileTimeOptions<no_qed ,has_qed>>{}, + {exteb_runtime_flag, qed_runtime_flag}, + np_to_push, [=] AMREX_GPU_DEVICE (long ip, auto exteb_control, + [[maybe_unused]] auto qed_control) { amrex::ParticleReal xp, yp, zp; getPosition(ip, xp, yp, zp); @@ -2650,30 +2667,54 @@ PhysicalParticleContainer::PushPX (WarpXParIter& pti, dx_arr, xyzmin_arr, lo, n_rz_azimuthal_modes, nox, galerkin_interpolation); } - // Externally applied E and B-field in Cartesian co-ordinates - getExternalEB(ip, Exp, Eyp, Ezp, Bxp, Byp, Bzp); + + auto const& externeb_fn = getExternalEB; // Have to do this for nvcc + if constexpr (exteb_control == has_exteb) { + externeb_fn(ip, Exp, Eyp, Ezp, Bxp, Byp, Bzp); + } scaleFields(xp, yp, zp, Exp, Eyp, Ezp, Bxp, Byp, Bzp); - doParticlePush(getPosition, setPosition, copyAttribs, ip, - ux[ip], uy[ip], uz[ip], - Exp, Eyp, Ezp, Bxp, Byp, Bzp, - ion_lev ? 
ion_lev[ip] : 0, - m, q, pusher_algo, do_crr, do_copy, #ifdef WARPX_QED - do_sync, - t_chi_max, + if (!do_sync) #endif - dt); - + { + doParticlePush<0>(getPosition, setPosition, copyAttribs, ip, + ux[ip], uy[ip], uz[ip], + Exp, Eyp, Ezp, Bxp, Byp, Bzp, + ion_lev ? ion_lev[ip] : 0, + m, q, pusher_algo, do_crr, do_copy, #ifdef WARPX_QED - if (local_has_quantum_sync) { - evolve_opt(ux[ip], uy[ip], uz[ip], - Exp, Eyp, Ezp,Bxp, Byp, Bzp, - dt, p_optical_depth_QSR[ip]); + t_chi_max, +#endif + dt); + } +#ifdef WARPX_QED + else { + if constexpr (qed_control == has_qed) { + doParticlePush<1>(getPosition, setPosition, copyAttribs, ip, + ux[ip], uy[ip], uz[ip], + Exp, Eyp, Ezp, Bxp, Byp, Bzp, + ion_lev ? ion_lev[ip] : 0, + m, q, pusher_algo, do_crr, do_copy, + t_chi_max, + dt); + } } #endif +#ifdef WARPX_QED + auto foo_local_has_quantum_sync = local_has_quantum_sync; + auto foo_podq = p_optical_depth_QSR; + auto& evolve_opt_fn = evolve_opt; // have to do all these for nvcc + if constexpr (qed_control == has_qed) { + if (foo_local_has_quantum_sync) { + evolve_opt_fn(ux[ip], uy[ip], uz[ip], + Exp, Eyp, Ezp,Bxp, Byp, Bzp, + dt, foo_podq[ip]); + } + } +#endif }); } |