14 files changed, 466 insertions, 143 deletions
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 202b25d8e..e4f6bfeb3 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -193,3 +193,7 @@ created it). Reviewers will interact with you if they have comments/questions.
 ## Style and conventions
 - For indentation, WarpX uses four spaces (no tabs)
 - The number of characters per line should be <80
+- To define a function , for e.g., myfunction() use a space between the name of the function and the paranthesis - myfunction (). To call the function, the space is not required, i.e., just use myfunction(). The reason this is beneficial is that when we do a 'git grep ' to search for myfunction (), we can clearly see the locations where myfunction () is defined and where myfunction() is called. 
+- Also, using 'git grep "myfunction ()"' searches for files only in the git repo, which is more efficient compared to the 'grep "myfunction ()"' command that searches through all the files in a directory, including plotfiles for example. 
+- It is recommended that style changes are not included in the PR where new code is added. Some text editors may do this automatically and it is suggested that any automatic style changes in the text editor are removed. This is to avoid any errors that may be introduced in a PR just to do style change. 
+
diff --git a/Regression/WarpX-tests.ini b/Regression/WarpX-tests.ini
index 437edd4f0..a10b79504 100644
--- a/Regression/WarpX-tests.ini
+++ b/Regression/WarpX-tests.ini
@@ -238,7 +238,7 @@ numthreads = 1
 compileTest = 0
 doVis = 0
 compareParticles = 1
-runtime_params = psatd.fftw_plan_measure=0
+runtime_params = psatd.fftw_plan_measure=0 psatd.hybrid_mpi_decomposition=1
 particleTypes = electrons positrons
 analysisRoutine = Examples/Tests/Langmuir/langmuir_multi_analysis.py
 analysisOutputImage = langmuir_multi_analysis.png
@@ -256,7 +256,7 @@ numthreads = 1
 compileTest = 0
 doVis = 0
 compareParticles = 1
-runtime_params = psatd.fftw_plan_measure=0 warpx.do_dynamic_scheduling=0 warpx.do_nodal=1
+runtime_params = psatd.fftw_plan_measure=0 warpx.do_dynamic_scheduling=0 warpx.do_nodal=1 
 particleTypes = electrons positrons
 analysisRoutine = Examples/Tests/Langmuir/langmuir_multi_analysis.py
 analysisOutputImage = langmuir_multi_analysis.png
@@ -292,7 +292,7 @@ numthreads = 1
 compileTest = 0
 doVis = 0
 compareParticles = 1
-runtime_params = psatd.fftw_plan_measure=0
+runtime_params = psatd.fftw_plan_measure=0 psatd.hybrid_mpi_decomposition=1
 particleTypes = electrons positrons
 analysisRoutine = Examples/Tests/Langmuir/langmuir_multi_2d_analysis.py
 analysisOutputImage = langmuir_multi_2d_analysis.png
diff --git a/Source/FieldSolver/Make.package b/Source/FieldSolver/Make.package
index 7eea52ee7..25e21c146 100644
--- a/Source/FieldSolver/Make.package
+++ b/Source/FieldSolver/Make.package
@@ -2,9 +2,10 @@ CEXE_headers += WarpX_K.H
 CEXE_headers += WarpX_FDTD.H
 CEXE_sources += WarpXPushFieldsEM.cpp
 ifeq ($(USE_PSATD),TRUE)
-  CEXE_sources += WarpXFFT.cpp
-  F90EXE_sources += WarpX_fft.F90
   include $(WARPX_HOME)/Source/FieldSolver/SpectralSolver/Make.package
+  ifeq ($(USE_CUDA),FALSE)
+    include $(WARPX_HOME)/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package
+  endif
 endif
 ifeq ($(USE_OPENBC_POISSON),TRUE)
   F90EXE_sources += openbc_poisson_solver.F90
diff --git a/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package b/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package
new file mode 100644
index 000000000..e185525e6
--- /dev/null
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package
@@ -0,0 +1,6 @@
+CEXE_headers += PicsarHybridFFTData.H
+CEXE_sources += PicsarHybridSpectralSolver.cpp
+F90EXE_sources += picsar_hybrid_spectral.F90
+
+INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/FieldSolver/PicsarHybridSpectralSolver
+VPATH_LOCATIONS   += $(WARPX_HOME)/Source/FieldSolver/PicsarHybridSpectralSolver
+\ No newline at end of file
diff --git a/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridFFTData.H b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridFFTData.H
new file mode 100644
index 000000000..4e97ab675
--- /dev/null
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridFFTData.H
@@ -0,0 +1,32 @@
+#ifndef WARPX_PICSAR_HYBRID_FFTDATA_H_
+#define WARPX_PICSAR_HYBRID_FFTDATA_H_
+
+// FFTData is a stuct containing a 22 pointers to auxiliary arrays
+// 1-11: padded arrays in real space (required by FFTW); 12-22: arrays in spectral space
+struct FFTData
+{
+    static constexpr int N = 22;
+    void* data[N] = { nullptr };
+
+    ~FFTData () {
+        for (int i = 0; i < N; ++i) { // The memory is allocated with fftw_alloc.
+            fftw_free(data[i]);
+            data[i] = nullptr;
+        }
+    }
+
+    FFTData () = default;
+
+    FFTData (FFTData && rhs) noexcept {
+        for (int i = 0; i < N; ++i) {
+            data[i] = rhs.data[i];
+            rhs.data[i] = nullptr;
+        }
+    }
+
+    FFTData (FFTData const&) = delete;
+    void operator= (FFTData const&) = delete;
+    void operator= (FFTData&&) = delete;
+};
+
+#endif // WARPX_PICSAR_HYBRID_FFTDATA_H_
+\ No newline at end of file
diff --git a/Source/FieldSolver/WarpXFFT.cpp b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridSpectralSolver.cpp
index f9cd7570a..26c93086a 100644
--- a/Source/FieldSolver/WarpXFFT.cpp
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridSpectralSolver.cpp
@@ -5,10 +5,10 @@
 
 using namespace amrex;
 
-constexpr int WarpX::FFTData::N;
+constexpr int FFTData::N;
 
 namespace {
-static std::unique_ptr<WarpX::FFTData> nullfftdata; // This for process with nz_fft=0
+static std::unique_ptr<FFTData> nullfftdata; // This for process with nz_fft=0
 
 /** \brief Returns an "owner mask" which 1 for all cells, except
  *  for the duplicated (physical) cells of a nodal grid.
@@ -122,6 +122,8 @@ WarpX::AllocLevelDataFFT (int lev)
     static_assert(std::is_standard_layout<FFTData>::value, "FFTData must have standard layout");
     static_assert(sizeof(FFTData) == sizeof(void*)*FFTData::N, "sizeof FFTData is wrong");
 
+
+
     InitFFTComm(lev);
 
     BoxArray ba_fp_fft;
@@ -366,52 +368,8 @@ WarpX::FreeFFT (int lev)
 }
 
 void
-WarpX::PushPSATD (amrex::Real a_dt)
-{
-    for (int lev = 0; lev <= finest_level; ++lev) {
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(dt[lev] == a_dt, "dt must be consistent");
-        if (fft_hybrid_mpi_decomposition){
-            PushPSATD_hybridFFT(lev, a_dt);
-        } else {
-            PushPSATD_localFFT(lev, a_dt);
-        }
-    }
-}
-
-void WarpX::PushPSATD_localFFT (int lev, amrex::Real /* dt */)
-{
-    auto& solver = *spectral_solver_fp[lev];
-
-    // Perform forward Fourier transform
-    solver.ForwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
-    solver.ForwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
-    solver.ForwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
-    solver.ForwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
-    solver.ForwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
-    solver.ForwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
-    solver.ForwardTransform(*current_fp[lev][0], SpectralFieldIndex::Jx);
-    solver.ForwardTransform(*current_fp[lev][1], SpectralFieldIndex::Jy);
-    solver.ForwardTransform(*current_fp[lev][2], SpectralFieldIndex::Jz);
-    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_old, 0);
-    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_new, 1);
-
-    // Advance fields in spectral space
-    solver.pushSpectralFields();
-
-    // Perform backward Fourier Transform
-    solver.BackwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
-    solver.BackwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
-    solver.BackwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
-    solver.BackwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
-    solver.BackwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
-    solver.BackwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
-}
-
-void
 WarpX::PushPSATD_hybridFFT (int lev, amrex::Real /* dt */)
 {
-#ifndef AMREX_USE_CUDA // Running on CPU ; use PICSAR code for the hybrid FFT
-
     BL_PROFILE_VAR_NS("WarpXFFT::CopyDualGrid", blp_copy);
     BL_PROFILE_VAR_NS("PICSAR::FftPushEB", blp_push_eb);
 
@@ -486,8 +444,4 @@ WarpX::PushPSATD_hybridFFT (int lev, amrex::Real /* dt */)
     {
         amrex::Abort("WarpX::PushPSATD: TODO");
     }
-#else // AMREX_USE_CUDA is defined ; running on GPU
-        amrex::Abort("The option `psatd.fft_hybrid_mpi_decomposition` does not work on GPU.");
-#endif
-
 }
diff --git a/Source/FieldSolver/WarpX_fft.F90 b/Source/FieldSolver/PicsarHybridSpectralSolver/picsar_hybrid_spectral.F90
index 35357a63f..35357a63f 100644
--- a/Source/FieldSolver/WarpX_fft.F90
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/picsar_hybrid_spectral.F90
diff --git a/Source/FieldSolver/WarpXPushFieldsEM.cpp b/Source/FieldSolver/WarpXPushFieldsEM.cpp
index c53e13f8f..bea008598 100644
--- a/Source/FieldSolver/WarpXPushFieldsEM.cpp
+++ b/Source/FieldSolver/WarpXPushFieldsEM.cpp
@@ -16,6 +16,54 @@
 
 using namespace amrex;
 
+#ifdef WARPX_USE_PSATD
+
+void
+WarpX::PushPSATD (amrex::Real a_dt)
+{
+    for (int lev = 0; lev <= finest_level; ++lev) {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(dt[lev] == a_dt, "dt must be consistent");
+        if (fft_hybrid_mpi_decomposition){
+#ifndef AMREX_USE_CUDA // Only available on CPU
+            PushPSATD_hybridFFT(lev, a_dt);
+#endif
+        } else {
+            PushPSATD_localFFT(lev, a_dt);
+        }
+    }
+}
+
+void WarpX::PushPSATD_localFFT (int lev, amrex::Real /* dt */)
+{
+    auto& solver = *spectral_solver_fp[lev];
+
+    // Perform forward Fourier transform
+    solver.ForwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
+    solver.ForwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
+    solver.ForwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
+    solver.ForwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
+    solver.ForwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
+    solver.ForwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
+    solver.ForwardTransform(*current_fp[lev][0], SpectralFieldIndex::Jx);
+    solver.ForwardTransform(*current_fp[lev][1], SpectralFieldIndex::Jy);
+    solver.ForwardTransform(*current_fp[lev][2], SpectralFieldIndex::Jz);
+    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_old, 0);
+    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_new, 1);
+
+    // Advance fields in spectral space
+    solver.pushSpectralFields();
+
+    // Perform backward Fourier Transform
+    solver.BackwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
+    solver.BackwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
+    solver.BackwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
+    solver.BackwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
+    solver.BackwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
+    solver.BackwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
+}
+
+#endif
+
 void
 WarpX::EvolveB (Real a_dt)
 {
diff --git a/Source/Initialization/WarpXInitData.cpp b/Source/Initialization/WarpXInitData.cpp
index d583b2b0f..2442e0205 100644
--- a/Source/Initialization/WarpXInitData.cpp
+++ b/Source/Initialization/WarpXInitData.cpp
@@ -317,7 +317,7 @@ WarpX::InitLevelData (int lev, Real time)
     }
 }
 
-#ifdef WARPX_USE_PSATD
+#ifdef WARPX_USE_PSATD_HYBRID
 
 void
 WarpX::InitLevelDataFFT (int lev, Real time)
diff --git a/Source/Make.WarpX b/Source/Make.WarpX
index bc837c797..3060ae8f0 100644
--- a/Source/Make.WarpX
+++ b/Source/Make.WarpX
@@ -111,20 +111,23 @@ endif
    
 
 ifeq ($(USE_PSATD),TRUE)
-  FFTW_HOME ?= NOT_SET
-  ifneq ($(FFTW_HOME),NOT_SET)
-    VPATH_LOCATIONS += $(FFTW_HOME)/include
-    INCLUDE_LOCATIONS += $(FFTW_HOME)/include
-    LIBRARY_LOCATIONS += $(FFTW_HOME)/lib
-  endif
   USERSuffix := $(USERSuffix).PSATD
   DEFINES += -DWARPX_USE_PSATD
-  ifeq ($(USE_CUDA),TRUE)
-    DEFINES += -DFFTW -DCUDA_FFT=1 # PICSAR uses it
-    LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads -lcufft
+  ifeq ($(USE_CUDA),FALSE) # Running on CPU
+     # Use FFTW
+     LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads
+     FFTW_HOME ?= NOT_SET
+     ifneq ($(FFTW_HOME),NOT_SET)
+       VPATH_LOCATIONS += $(FFTW_HOME)/include
+       INCLUDE_LOCATIONS += $(FFTW_HOME)/include
+       LIBRARY_LOCATIONS += $(FFTW_HOME)/lib
+     endif
+     # Compile with PICSAR's Hybrid-MPI PSATD
+     DEFINES += -DWARPX_USE_PSATD_HYBRID
+     DEFINES += -DFFTW # PICSAR uses it
   else
-    DEFINES += -DFFTW # PICSAR uses it
-    LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads
+    # Use cuFFT
+    LIBRARIES += -lcufft
   endif
 endif
 
diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H
index f81b1ffa8..b4531d3b3 100644
--- a/Source/Particles/Deposition/CurrentDeposition.H
+++ b/Source/Particles/Deposition/CurrentDeposition.H
@@ -19,14 +19,18 @@
  * /param q            : species charge.
  */
 template <int depos_order>
-void doDepositionShapeN(const amrex::Real * const xp, const amrex::Real * const yp, const amrex::Real * const zp,
-                        const amrex::Real * const wp, const amrex::Real * const uxp,
-                        const amrex::Real * const uyp, const amrex::Real * const uzp,
-                        const amrex::Array4<amrex::Real>& jx_arr, 
-                        const amrex::Array4<amrex::Real>& jy_arr, 
+void doDepositionShapeN(const amrex::Real * const xp, 
+                        const amrex::Real * const yp, 
+                        const amrex::Real * const zp,
+                        const amrex::Real * const wp,
+                        const amrex::Real * const uxp,
+                        const amrex::Real * const uyp,
+                        const amrex::Real * const uzp,
+                        const amrex::Array4<amrex::Real>& jx_arr,
+                        const amrex::Array4<amrex::Real>& jy_arr,
                         const amrex::Array4<amrex::Real>& jz_arr,
-                        const long np_to_depose, 
-                        const amrex::Real dt, const std::array<amrex::Real,3>& dx,
+                        const long np_to_depose, const amrex::Real dt,
+                        const std::array<amrex::Real,3>& dx,
                         const std::array<Real, 3> xyzmin,
                         const amrex::Dim3 lo,
                         const amrex::Real stagger_shift, 
@@ -129,4 +133,280 @@ void doDepositionShapeN(const amrex::Real * const xp, const amrex::Real * const
         );
 }
 
+// Compute shape factor and return index of leftmost cell where
+// particle writes.
+// Specialized templates are defined below for orders 1, 2 and 3.
+template <int depos_order>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor (Real* const sx, const Real x_old, const int i_new);
+
+// Compute shape factor for order 1.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <1> (Real* const sx, const Real x_old, const int i_new){
+    const int i = (int) x_old;
+    const int i_shift = i - i_new;
+    const Real xint = x_old - i;
+    sx[1+i_shift] = 1.0 - xint;
+    sx[2+i_shift] = xint;
+    return i;
+}
+
+// Compute shape factor for order 2.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <2> (Real* const sx, const Real x_old, const int i_new){
+    const int i = (int) (x_old+0.5);
+    const int i_shift = i - (i_new + 1);
+    const Real xint = x_old - i;
+    sx[1+i_shift] = 0.5*(0.5-xint)*(0.5-xint);
+    sx[2+i_shift] = 0.75-xint*xint;
+    sx[3+i_shift] = 0.5*(0.5+xint)*(0.5+xint);
+    // index of the leftmost cell where particle deposits
+    return i-1;
+}
+
+// Compute shape factor for order 3.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <3> (Real* const sx, const Real x_old, const int i_new){
+    const int i = (int) x_old;
+    const int i_shift = i - (i_new + 1);
+    const Real xint = x_old - i;
+    sx[1+i_shift] = 1.0/6.0*(1.0-xint)*(1.0-xint)*(1.0-xint);
+    sx[2+i_shift] = 2.0/3.0-xint*xint*(1-xint/2.0);
+    sx[3+i_shift] = 2.0/3.0-(1-xint)*(1-xint)*(1.0-0.5*(1-xint));
+    sx[4+i_shift] = 1.0/6.0*xint*xint*xint;
+    // index of the leftmost cell where particle deposits
+    return i-1;
+}
+
+/* \brief Esirkepov Current Deposition for thread thread_num
+ * /param xp, yp, zp   : Pointer to arrays of particle positions.
+ * \param wp           : Pointer to array of particle weights.
+ * \param uxp uyp uzp  : Pointer to arrays of particle momentum.
+ * \param Jx_arr       : Array4 of current density, either full array or tile.
+ * \param Jy_arr       : Array4 of current density, either full array or tile.
+ * \param Jz_arr       : Array4 of current density, either full array or tile.
+ * \param np_to_depose : Number of particles for which current is deposited.
+ * \param dt           : Time step for particle level
+ * \param dx           : 3D cell size
+ * \param xyzmin       : Physical lower bounds of domain.
+ * \param lo           : Index lower bounds of domain.
+ * /param q            : species charge.
+ */
+template <int depos_order>
+void doEsirkepovDepositionShapeN (const Real * const xp, const Real * const yp, const Real * const zp,
+                                  const Real * const wp, const Real * const uxp,
+                                  const Real * const uyp, const Real * const uzp,
+                                  const amrex::Array4<amrex::Real>& Jx_arr,
+                                  const amrex::Array4<amrex::Real>& Jy_arr,
+                                  const amrex::Array4<amrex::Real>& Jz_arr,
+                                  const long np_to_depose,
+                                  const amrex::Real dt, const std::array<amrex::Real,3>& dx,
+                                  const std::array<Real, 3> xyzmin,
+                                  const Dim3 lo,
+                                  const amrex::Real q)
+{
+
+#if (AMREX_SPACEDIM == 3)
+
+    const Real dxi = 1.0/dx[0];
+    const Real dyi = 1.0/dx[1];
+    const Real dzi = 1.0/dx[2];
+    const Real dtsdx0 = dt*dxi;
+    const Real dtsdy0 = dt*dyi;
+    const Real dtsdz0 = dt*dzi;
+    const Real invdtdx = 1.0/(dt*dx[1]*dx[2]);
+    const Real invdtdy = 1.0/(dt*dx[0]*dx[2]);
+    const Real invdtdz = 1.0/(dt*dx[0]*dx[1]);
+
+    const Real xmin = xyzmin[0];
+    const Real ymin = xyzmin[1];
+    const Real zmin = xyzmin[2];
+    const Real clightsq = 1.0/PhysConst::c/PhysConst::c;
+
+    // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
+    ParallelFor( np_to_depose,
+                 [=] AMREX_GPU_DEVICE (long ip) {
+                     // --- Get particle quantities
+                     const Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
+                                                           + uyp[ip]*uyp[ip]*clightsq
+                                                           + uzp[ip]*uzp[ip]*clightsq);
+
+                     // wqx, wqy wqz are particle current in each direction
+                     const Real wq = q*wp[ip];
+                     const Real wqx = wq*invdtdx;
+                     const Real wqy = wq*invdtdy;
+                     const Real wqz = wq*invdtdz;
+
+                     // computes current position in grid units
+                     const Real x_new = (xp[ip] - xmin)*dxi;
+                     const Real y_new = (yp[ip] - ymin)*dyi;
+                     const Real z_new = (zp[ip] - zmin)*dzi;
+
+                     // computes old position in grid units
+                     const Real x_old = x_new - dtsdx0*uxp[ip]*gaminv;
+                     const Real y_old = y_new - dtsdy0*uyp[ip]*gaminv;
+                     const Real z_old = z_new - dtsdz0*uzp[ip]*gaminv;
+
+                     // Shape factor arrays
+                     // Note that there are extra values above and below
+                     // to possibly hold the factor for the old particle
+                     // which can be at a different grid location.
+                     Real AMREX_RESTRICT sx_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sy_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sz_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sx_old[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sy_old[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sz_old[depos_order + 3] = {0.};
+
+                     // --- Compute shape factors
+                     // Compute shape factors for position as they are now and at old positions
+                     // [ijk]_new: leftmost grid point that the particle touches
+                     const int i_new = compute_shape_factor<depos_order>(sx_new+1, x_new);
+                     const int j_new = compute_shape_factor<depos_order>(sy_new+1, y_new);
+                     const int k_new = compute_shape_factor<depos_order>(sz_new+1, z_new);
+
+                     const int i_old = compute_shifted_shape_factor<depos_order>(sx_old, x_old, i_new);
+                     const int j_old = compute_shifted_shape_factor<depos_order>(sy_old, y_old, j_new);
+                     const int k_old = compute_shifted_shape_factor<depos_order>(sz_old, z_old, k_new);
+
+                     // computes min/max positions of current contributions
+                     int dil = 1, diu = 1;
+                     if (i_old < i_new) dil = 0;
+                     if (i_old > i_new) diu = 0;
+                     int djl = 1, dju = 1;
+                     if (j_old < j_new) djl = 0;
+                     if (j_old > j_new) dju = 0;
+                     int dkl = 1, dku = 1;
+                     if (k_old < k_new) dkl = 0;
+                     if (k_old > k_new) dku = 0;
+
+                     for (int k=dkl; k<=depos_order+2-dku; k++) {
+                         for (int j=djl; j<=depos_order+2-dju; j++) {
+                             Real sdxi = 0.;
+                             for (int i=dil; i<=depos_order+1-diu; i++) {
+                                 sdxi += wqx*(sx_old[i] - sx_new[i])*((sy_new[j] + 0.5*(sy_old[j] - sy_new[j]))*sz_new[k] +
+                                         (0.5*sy_new[j] + 1./3.*(sy_old[j] - sy_new[j]))*(sz_old[k] - sz_new[k]));
+                                 amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
+                             }
+                         }
+                     }
+
+                     for (int k=dkl; k<=depos_order+2-dku; k++) {
+                         for (int i=dil; i<=depos_order+2-diu; i++) {
+                             Real sdyj = 0.;
+                             for (int j=djl; j<=depos_order+1-dju; j++) {
+                                 sdyj += wqy*(sy_old[j] - sy_new[j])*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
+                                         (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
+                                 amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
+                             }
+                         }
+                     }
+
+                     for (int j=djl; j<=depos_order+2-dju; j++) {
+                         for (int i=dil; i<=depos_order+2-diu; i++) {
+                             Real sdzk = 0.;
+                             for (int k=dkl; k<=depos_order+1-dku; k++) {
+                                 sdzk += wqz*(sz_old[k] - sz_new[k])*((sx_new[i] + 0.5*(sx_old[i] - sx_new[i]))*sy_new[j] +
+                                         (0.5*sx_new[i] + 1./3.*(sx_old[i] - sx_new[i]))*(sy_old[j] - sy_new[j]));
+                                 amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
+                             }
+                         }
+                     }
+                 }
+        );
+
+#elif (AMREX_SPACEDIM == 2)
+
+    const Real dxi = 1.0/dx[0];
+    const Real dzi = 1.0/dx[2];
+    const Real dtsdx0 = dt*dxi;
+    const Real dtsdz0 = dt*dzi;
+    const Real invdtdx = 1.0/(dt*dx[2]);
+    const Real invdtdz = 1.0/(dt*dx[0]);
+    const Real invvol = 1.0/(dx[0]*dx[2]);
+
+    const Real xmin = xyzmin[0];
+    const Real zmin = xyzmin[2];
+    const Real clightsq = 1.0/PhysConst::c/PhysConst::c;
+
+    // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
+    ParallelFor( np_to_depose,
+                 [=] AMREX_GPU_DEVICE (long ip) {
+                     // --- Get particle quantities
+                     const Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
+                                                           + uyp[ip]*uyp[ip]*clightsq
+                                                           + uzp[ip]*uzp[ip]*clightsq);
+
+                     // wqx, wqy wqz are particle current in each direction
+                     const Real wq = q*wp[ip];
+                     const Real wqx = wq*invdtdx;
+                     const Real wqz = wq*invdtdz;
+
+                     // computes current position in grid units
+                     const Real x_new = (xp[ip] - xmin)*dxi;
+                     const Real z_new = (zp[ip] - zmin)*dzi;
+
+                     // computes old position in grid units
+                     const Real x_old = x_new - dtsdx0*uxp[ip]*gaminv;
+                     const Real z_old = z_new - dtsdz0*uzp[ip]*gaminv;
+
+                     // Shape factor arrays
+                     // Note that there are extra values above and below
+                     // to possibly hold the factor for the old particle
+                     // which can be at a different grid location.
+                     Real AMREX_RESTRICT sx_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sz_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sx_old[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sz_old[depos_order + 3] = {0.};
+
+                     // --- Compute shape factors
+                     // Compute shape factors for position as they are now and at old positions
+                     // [ik]_new: leftmost grid point that the particle touches
+                     const int i_new = compute_shape_factor<depos_order>(sx_new+1, x_new);
+                     const int k_new = compute_shape_factor<depos_order>(sz_new+1, z_new);
+
+                     const int i_old = compute_shifted_shape_factor<depos_order>(sx_old, x_old, i_new);
+                     const int k_old = compute_shifted_shape_factor<depos_order>(sz_old, z_old, k_new);
+
+                     // computes min/max positions of current contributions
+                     int dil = 1, diu = 1;
+                     if (i_old < i_new) dil = 0;
+                     if (i_old > i_new) diu = 0;
+                     int dkl = 1, dku = 1;
+                     if (k_old < k_new) dkl = 0;
+                     if (k_old > k_new) dku = 0;
+
+                     for (int k=dkl; k<=depos_order+2-dku; k++) {
+                         Real sdxi = 0.;
+                         for (int i=dil; i<=depos_order+1-diu; i++) {
+                             sdxi += wqx*(sx_old[i] - sx_new[i])*(sz_new[k] + 0.5*(sz_old[k] - sz_new[k]));
+                             amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdxi);
+                         }
+                     }
+
+                     for (int k=dkl; k<=depos_order+2-dku; k++) {
+                         for (int i=dil; i<=depos_order+2-diu; i++) {
+                             const Real sdyj = wq*uyp[ip]*gaminv*invvol*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
+                                         (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
+                             amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdyj);
+                         }
+                     }
+
+                     for (int i=dil; i<=depos_order+2-diu; i++) {
+                         Real sdzk = 0.;
+                         for (int k=dkl; k<=depos_order+1-dku; k++) {
+                             sdzk += wqz*(sz_old[k] - sz_new[k])*(sx_new[i] + 0.5*(sx_old[i] - sx_new[i]));
+                             amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdzk);
+                         }
+                     }
+                 }
+        );
+
+#endif
+
+}
+
 #endif // CURRENTDEPOSITION_H_
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index 314e3f61f..89f233b2c 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -6,6 +6,7 @@
 #include <AMReX_AmrParGDB.H>
 #include <WarpX_f.H>
 #include <WarpX.H>
+#include <WarpXAlgorithmSelection.H>
 
 // Import low-level single-particle kernels
 #include <GetAndSetPosition.H>
@@ -510,21 +511,40 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
     // Better for memory? worth trying?    
     const Dim3 lo = lbound(tilebox);
 
-    if        (WarpX::nox == 1){
-        doDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                              uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                              jz_arr, np_to_depose, dt, dx,
-                              xyzmin, lo, stagger_shift, q);
-    } else if (WarpX::nox == 2){
-        doDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                              uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                              jz_arr, np_to_depose, dt, dx,
-                              xyzmin, lo, stagger_shift, q);
-    } else if (WarpX::nox == 3){
-        doDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                              uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                              jz_arr, np_to_depose, dt, dx,
-                              xyzmin, lo, stagger_shift, q);
+    if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov) {
+        if        (WarpX::nox == 1){
+            doEsirkepovDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                           jz_arr, np_to_depose, dt, dx,
+                                           xyzmin, lo, q);
+        } else if (WarpX::nox == 2){
+            doEsirkepovDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                           jz_arr, np_to_depose, dt, dx,
+                                           xyzmin, lo, q);
+        } else if (WarpX::nox == 3){
+            doEsirkepovDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                           jz_arr, np_to_depose, dt, dx,
+                                           xyzmin, lo, q);
+        }
+    } else {
+        if        (WarpX::nox == 1){
+            doDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                  jz_arr, np_to_depose, dt, dx,
+                                  xyzmin, lo, stagger_shift, q);
+        } else if (WarpX::nox == 2){
+            doDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                  jz_arr, np_to_depose, dt, dx,
+                                  xyzmin, lo, stagger_shift, q);
+        } else if (WarpX::nox == 3){
+            doDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                  jz_arr, np_to_depose, dt, dx,
+                                  xyzmin, lo, stagger_shift, q);
+        }
     }
 
 #ifndef AMREX_USE_GPU
diff --git a/Source/WarpX.H b/Source/WarpX.H
index ca7596589..35de5c88d 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -26,9 +26,11 @@
 #include <NCIGodfreyFilter.H>
 
 #ifdef WARPX_USE_PSATD
-#include <fftw3.h>
 #include <SpectralSolver.H>
 #endif
+#ifdef WARPX_USE_PSATD_HYBRID
+#include <PicsarHybridFFTData.H>
+#endif
 
 #if defined(BL_USE_SENSEI_INSITU)
 namespace amrex {
@@ -578,7 +580,7 @@ private:
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Efield_slice;
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Bfield_slice;
 
-#ifdef WARPX_USE_PSATD
+#ifdef WARPX_USE_PSATD_HYBRID
     // Store fields in real space on the dual grid (i.e. the grid for the FFT push of the fields)
     // This includes data for the FFT guard cells (between FFT groups)
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Efield_fp_fft;
@@ -590,44 +592,30 @@ private:
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Bfield_cp_fft;
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > current_cp_fft;
     amrex::Vector<            std::unique_ptr<amrex::MultiFab>      > rho_cp_fft;
+#endif
 
-public:
-    // FFTData is a stuct containing a 22 pointers to auxiliary arrays
-    // 1-11: padded arrays in real space (required by FFTW); 12-22: arrays in spectral space
-    struct FFTData
-    {
-        static constexpr int N = 22;
-        void* data[N] = { nullptr };
-
-        ~FFTData () {
-            for (int i = 0; i < N; ++i) { // The memory is allocated with fftw_alloc.
-                fftw_free(data[i]);
-                data[i] = nullptr;
-            }
-        }
-
-        FFTData () = default;
+#ifdef WARPX_USE_PSATD
+private:
+    void EvolvePSATD (int numsteps);
+    void PushPSATD (amrex::Real dt);
+    void PushPSATD_localFFT (int lev, amrex::Real dt);
 
-        FFTData (FFTData && rhs) noexcept {
-            for (int i = 0; i < N; ++i) {
-                data[i] = rhs.data[i];
-                rhs.data[i] = nullptr;
-            }
-        }
+    bool fft_hybrid_mpi_decomposition = false;
+    int ngroups_fft = 4;
+    int fftw_plan_measure = 1;
+    int nox_fft = 16;
+    int noy_fft = 16;
+    int noz_fft = 16;
 
-        FFTData (FFTData const&) = delete;
-        void operator= (FFTData const&) = delete;
-        void operator= (FFTData&&) = delete;
-    };
+    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_fp;
+    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_cp;
+#endif
 
+#ifdef WARPX_USE_PSATD_HYBRID
 private:
-
     amrex::Vector<std::unique_ptr<amrex::LayoutData<FFTData> > > dataptr_fp_fft;
     amrex::Vector<std::unique_ptr<amrex::LayoutData<FFTData> > > dataptr_cp_fft;
 
-    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_fp;
-    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_cp;
-
   // Domain decomposition containing the valid part of the dual grids (i.e. without FFT guard cells)
     amrex::Vector<amrex::BoxArray> ba_valid_fp_fft;
     amrex::Vector<amrex::BoxArray> ba_valid_cp_fft;
@@ -638,13 +626,6 @@ private:
     amrex::Vector<MPI_Comm> comm_fft;
     amrex::Vector<int>      color_fft;
 
-    bool fft_hybrid_mpi_decomposition = false;
-    int ngroups_fft = 4;
-    int fftw_plan_measure = 1;
-    int nox_fft = 16;
-    int noy_fft = 16;
-    int noz_fft = 16;
-
     void AllocLevelDataFFT (int lev);
     void InitLevelDataFFT (int lev, amrex::Real time);
 
@@ -654,12 +635,7 @@ private:
                                  const amrex::Box& domain);
     void InitFFTDataPlan (int lev);
     void FreeFFT (int lev);
-
-    void EvolvePSATD (int numsteps);
-    void PushPSATD (amrex::Real dt);
-    void PushPSATD_localFFT (int lev, amrex::Real dt);
     void PushPSATD_hybridFFT (int lev, amrex::Real dt);
-
 #endif
 
 #if defined(BL_USE_SENSEI_INSITU)
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index 08948227c..1c0d01ce5 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -202,6 +202,10 @@ WarpX::WarpX ()
     costs.resize(nlevs_max);
 
 #ifdef WARPX_USE_PSATD
+    spectral_solver_fp.resize(nlevs_max);
+    spectral_solver_cp.resize(nlevs_max);
+#endif
+#ifdef WARPX_USE_PSATD_HYBRID
     Efield_fp_fft.resize(nlevs_max);
     Bfield_fp_fft.resize(nlevs_max);
     current_fp_fft.resize(nlevs_max);
@@ -215,9 +219,6 @@ WarpX::WarpX ()
     dataptr_fp_fft.resize(nlevs_max);
     dataptr_cp_fft.resize(nlevs_max);
 
-    spectral_solver_fp.resize(nlevs_max);
-    spectral_solver_cp.resize(nlevs_max);
-
     ba_valid_fp_fft.resize(nlevs_max);
     ba_valid_cp_fft.resize(nlevs_max);
 
@@ -492,10 +493,6 @@ WarpX::ReadParameters ()
         pp.query("use_picsar_deposition", use_picsar_deposition);
 #endif
         current_deposition_algo = GetAlgorithmInteger(pp, "current_deposition");
-        if (!use_picsar_deposition){
-            AMREX_ALWAYS_ASSERT_WITH_MESSAGE( current_deposition_algo >= 2, 
-                "if not use_picsar_deposition, cannot use Esirkepov deposition.");
-        }
         charge_deposition_algo = GetAlgorithmInteger(pp, "charge_deposition");
         field_gathering_algo = GetAlgorithmInteger(pp, "field_gathering");
         particle_pusher_algo = GetAlgorithmInteger(pp, "particle_pusher");
@@ -511,8 +508,6 @@ WarpX::ReadParameters ()
         pp.query("nox", nox_fft);
         pp.query("noy", noy_fft);
         pp.query("noz", noz_fft);
-        // Override value
-        if (fft_hybrid_mpi_decomposition==false) ngroups_fft=ParallelDescriptor::NProcs();
     }
 #endif
 
@@ -568,8 +563,12 @@ WarpX::MakeNewLevelFromScratch (int lev, Real time, const BoxArray& new_grids,
 
 #ifdef WARPX_USE_PSATD
     if (fft_hybrid_mpi_decomposition){
+#ifdef WARPX_USE_PSATD_HYBRID
         AllocLevelDataFFT(lev);
         InitLevelDataFFT(lev, time);
+#else
+    amrex::Abort("The option `psatd.fft_hybrid_mpi_decomposition` does not work on GPU.");
+#endif
     }
 #endif
 }
@@ -614,7 +613,7 @@ WarpX::ClearLevel (int lev)
 
     costs[lev].reset();
 
-#ifdef WARPX_USE_PSATD
+#ifdef WARPX_USE_PSATD_HYBRID
     for (int i = 0; i < 3; ++i) {
         Efield_fp_fft[lev][i].reset();
         Bfield_fp_fft[lev][i].reset();