12 files changed, 403 insertions, 135 deletions
diff --git a/Source/FieldSolver/Make.package b/Source/FieldSolver/Make.package
index 7eea52ee7..25e21c146 100644
--- a/Source/FieldSolver/Make.package
+++ b/Source/FieldSolver/Make.package
@@ -2,9 +2,10 @@ CEXE_headers += WarpX_K.H
 CEXE_headers += WarpX_FDTD.H
 CEXE_sources += WarpXPushFieldsEM.cpp
 ifeq ($(USE_PSATD),TRUE)
-  CEXE_sources += WarpXFFT.cpp
-  F90EXE_sources += WarpX_fft.F90
   include $(WARPX_HOME)/Source/FieldSolver/SpectralSolver/Make.package
+  ifeq ($(USE_CUDA),FALSE)
+    include $(WARPX_HOME)/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package
+  endif
 endif
 ifeq ($(USE_OPENBC_POISSON),TRUE)
   F90EXE_sources += openbc_poisson_solver.F90
diff --git a/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package b/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package
new file mode 100644
index 000000000..e185525e6
--- /dev/null
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/Make.package
@@ -0,0 +1,6 @@
+CEXE_headers += PicsarHybridFFTData.H
+CEXE_sources += PicsarHybridSpectralSolver.cpp
+F90EXE_sources += picsar_hybrid_spectral.F90
+
+INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/FieldSolver/PicsarHybridSpectralSolver
+VPATH_LOCATIONS   += $(WARPX_HOME)/Source/FieldSolver/PicsarHybridSpectralSolver
+\ No newline at end of file
diff --git a/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridFFTData.H b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridFFTData.H
new file mode 100644
index 000000000..4e97ab675
--- /dev/null
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridFFTData.H
@@ -0,0 +1,32 @@
+#ifndef WARPX_PICSAR_HYBRID_FFTDATA_H_
+#define WARPX_PICSAR_HYBRID_FFTDATA_H_
+
+// FFTData is a stuct containing a 22 pointers to auxiliary arrays
+// 1-11: padded arrays in real space (required by FFTW); 12-22: arrays in spectral space
+struct FFTData
+{
+    static constexpr int N = 22;
+    void* data[N] = { nullptr };
+
+    ~FFTData () {
+        for (int i = 0; i < N; ++i) { // The memory is allocated with fftw_alloc.
+            fftw_free(data[i]);
+            data[i] = nullptr;
+        }
+    }
+
+    FFTData () = default;
+
+    FFTData (FFTData && rhs) noexcept {
+        for (int i = 0; i < N; ++i) {
+            data[i] = rhs.data[i];
+            rhs.data[i] = nullptr;
+        }
+    }
+
+    FFTData (FFTData const&) = delete;
+    void operator= (FFTData const&) = delete;
+    void operator= (FFTData&&) = delete;
+};
+
+#endif // WARPX_PICSAR_HYBRID_FFTDATA_H_
+\ No newline at end of file
diff --git a/Source/FieldSolver/WarpXFFT.cpp b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridSpectralSolver.cpp
index f9cd7570a..26c93086a 100644
--- a/Source/FieldSolver/WarpXFFT.cpp
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/PicsarHybridSpectralSolver.cpp
@@ -5,10 +5,10 @@
 
 using namespace amrex;
 
-constexpr int WarpX::FFTData::N;
+constexpr int FFTData::N;
 
 namespace {
-static std::unique_ptr<WarpX::FFTData> nullfftdata; // This for process with nz_fft=0
+static std::unique_ptr<FFTData> nullfftdata; // This for process with nz_fft=0
 
 /** \brief Returns an "owner mask" which 1 for all cells, except
  *  for the duplicated (physical) cells of a nodal grid.
@@ -122,6 +122,8 @@ WarpX::AllocLevelDataFFT (int lev)
     static_assert(std::is_standard_layout<FFTData>::value, "FFTData must have standard layout");
     static_assert(sizeof(FFTData) == sizeof(void*)*FFTData::N, "sizeof FFTData is wrong");
 
+
+
     InitFFTComm(lev);
 
     BoxArray ba_fp_fft;
@@ -366,52 +368,8 @@ WarpX::FreeFFT (int lev)
 }
 
 void
-WarpX::PushPSATD (amrex::Real a_dt)
-{
-    for (int lev = 0; lev <= finest_level; ++lev) {
-        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(dt[lev] == a_dt, "dt must be consistent");
-        if (fft_hybrid_mpi_decomposition){
-            PushPSATD_hybridFFT(lev, a_dt);
-        } else {
-            PushPSATD_localFFT(lev, a_dt);
-        }
-    }
-}
-
-void WarpX::PushPSATD_localFFT (int lev, amrex::Real /* dt */)
-{
-    auto& solver = *spectral_solver_fp[lev];
-
-    // Perform forward Fourier transform
-    solver.ForwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
-    solver.ForwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
-    solver.ForwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
-    solver.ForwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
-    solver.ForwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
-    solver.ForwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
-    solver.ForwardTransform(*current_fp[lev][0], SpectralFieldIndex::Jx);
-    solver.ForwardTransform(*current_fp[lev][1], SpectralFieldIndex::Jy);
-    solver.ForwardTransform(*current_fp[lev][2], SpectralFieldIndex::Jz);
-    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_old, 0);
-    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_new, 1);
-
-    // Advance fields in spectral space
-    solver.pushSpectralFields();
-
-    // Perform backward Fourier Transform
-    solver.BackwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
-    solver.BackwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
-    solver.BackwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
-    solver.BackwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
-    solver.BackwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
-    solver.BackwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
-}
-
-void
 WarpX::PushPSATD_hybridFFT (int lev, amrex::Real /* dt */)
 {
-#ifndef AMREX_USE_CUDA // Running on CPU ; use PICSAR code for the hybrid FFT
-
     BL_PROFILE_VAR_NS("WarpXFFT::CopyDualGrid", blp_copy);
     BL_PROFILE_VAR_NS("PICSAR::FftPushEB", blp_push_eb);
 
@@ -486,8 +444,4 @@ WarpX::PushPSATD_hybridFFT (int lev, amrex::Real /* dt */)
     {
         amrex::Abort("WarpX::PushPSATD: TODO");
     }
-#else // AMREX_USE_CUDA is defined ; running on GPU
-        amrex::Abort("The option `psatd.fft_hybrid_mpi_decomposition` does not work on GPU.");
-#endif
-
 }
diff --git a/Source/FieldSolver/WarpX_fft.F90 b/Source/FieldSolver/PicsarHybridSpectralSolver/picsar_hybrid_spectral.F90
index 35357a63f..35357a63f 100644
--- a/Source/FieldSolver/WarpX_fft.F90
+++ b/Source/FieldSolver/PicsarHybridSpectralSolver/picsar_hybrid_spectral.F90
diff --git a/Source/FieldSolver/WarpXPushFieldsEM.cpp b/Source/FieldSolver/WarpXPushFieldsEM.cpp
index c53e13f8f..bea008598 100644
--- a/Source/FieldSolver/WarpXPushFieldsEM.cpp
+++ b/Source/FieldSolver/WarpXPushFieldsEM.cpp
@@ -16,6 +16,54 @@
 
 using namespace amrex;
 
+#ifdef WARPX_USE_PSATD
+
+void
+WarpX::PushPSATD (amrex::Real a_dt)
+{
+    for (int lev = 0; lev <= finest_level; ++lev) {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(dt[lev] == a_dt, "dt must be consistent");
+        if (fft_hybrid_mpi_decomposition){
+#ifndef AMREX_USE_CUDA // Only available on CPU
+            PushPSATD_hybridFFT(lev, a_dt);
+#endif
+        } else {
+            PushPSATD_localFFT(lev, a_dt);
+        }
+    }
+}
+
+void WarpX::PushPSATD_localFFT (int lev, amrex::Real /* dt */)
+{
+    auto& solver = *spectral_solver_fp[lev];
+
+    // Perform forward Fourier transform
+    solver.ForwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
+    solver.ForwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
+    solver.ForwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
+    solver.ForwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
+    solver.ForwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
+    solver.ForwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
+    solver.ForwardTransform(*current_fp[lev][0], SpectralFieldIndex::Jx);
+    solver.ForwardTransform(*current_fp[lev][1], SpectralFieldIndex::Jy);
+    solver.ForwardTransform(*current_fp[lev][2], SpectralFieldIndex::Jz);
+    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_old, 0);
+    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_new, 1);
+
+    // Advance fields in spectral space
+    solver.pushSpectralFields();
+
+    // Perform backward Fourier Transform
+    solver.BackwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
+    solver.BackwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
+    solver.BackwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
+    solver.BackwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
+    solver.BackwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
+    solver.BackwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
+}
+
+#endif
+
 void
 WarpX::EvolveB (Real a_dt)
 {
diff --git a/Source/Initialization/WarpXInitData.cpp b/Source/Initialization/WarpXInitData.cpp
index d583b2b0f..2442e0205 100644
--- a/Source/Initialization/WarpXInitData.cpp
+++ b/Source/Initialization/WarpXInitData.cpp
@@ -317,7 +317,7 @@ WarpX::InitLevelData (int lev, Real time)
     }
 }
 
-#ifdef WARPX_USE_PSATD
+#ifdef WARPX_USE_PSATD_HYBRID
 
 void
 WarpX::InitLevelDataFFT (int lev, Real time)
diff --git a/Source/Make.WarpX b/Source/Make.WarpX
index bc837c797..3060ae8f0 100644
--- a/Source/Make.WarpX
+++ b/Source/Make.WarpX
@@ -111,20 +111,23 @@ endif
    
 
 ifeq ($(USE_PSATD),TRUE)
-  FFTW_HOME ?= NOT_SET
-  ifneq ($(FFTW_HOME),NOT_SET)
-    VPATH_LOCATIONS += $(FFTW_HOME)/include
-    INCLUDE_LOCATIONS += $(FFTW_HOME)/include
-    LIBRARY_LOCATIONS += $(FFTW_HOME)/lib
-  endif
   USERSuffix := $(USERSuffix).PSATD
   DEFINES += -DWARPX_USE_PSATD
-  ifeq ($(USE_CUDA),TRUE)
-    DEFINES += -DFFTW -DCUDA_FFT=1 # PICSAR uses it
-    LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads -lcufft
+  ifeq ($(USE_CUDA),FALSE) # Running on CPU
+     # Use FFTW
+     LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads
+     FFTW_HOME ?= NOT_SET
+     ifneq ($(FFTW_HOME),NOT_SET)
+       VPATH_LOCATIONS += $(FFTW_HOME)/include
+       INCLUDE_LOCATIONS += $(FFTW_HOME)/include
+       LIBRARY_LOCATIONS += $(FFTW_HOME)/lib
+     endif
+     # Compile with PICSAR's Hybrid-MPI PSATD
+     DEFINES += -DWARPX_USE_PSATD_HYBRID
+     DEFINES += -DFFTW # PICSAR uses it
   else
-    DEFINES += -DFFTW # PICSAR uses it
-    LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads
+    # Use cuFFT
+    LIBRARIES += -lcufft
   endif
 endif
 
diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H
index efda97892..97bc53c20 100644
--- a/Source/Particles/Deposition/CurrentDeposition.H
+++ b/Source/Particles/Deposition/CurrentDeposition.H
@@ -3,12 +3,12 @@
 
 using namespace amrex;
 
-// Compute shape factor and return index of leftmost cell where 
+// Compute shape factor and return index of leftmost cell where
 // particle writes.
 // Specialized templates are defined below for orders 1, 2 and 3.
 template <int depos_order>
 AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shape_factor(Real* const sx, Real xint) {return 0;};
+int compute_shape_factor(Real* const sx, Real xint);
 
 // Compute shape factor for order 1.
 template <>
@@ -176,4 +176,233 @@ void doDepositionShapeN(const Real * const xp, const Real * const yp, const Real
         );
 }
 
+// Compute shape factor and return index of leftmost cell where
+// particle writes.
+// Specialized templates are defined below for orders 1, 2 and 3.
+template <int depos_order>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor (Real* const sx, const Real x_old, const int i_new);
+
+// Compute shape factor for order 1.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <1> (Real* const sx, const Real x_old, const int i_new){
+    const int i = (int) x_old;
+    const int i_shift = i - i_new;
+    const Real xint = x_old - i;
+    sx[1+i_shift] = 1.0 - xint;
+    sx[2+i_shift] = xint;
+    return i;
+}
+
+// Compute shape factor for order 2.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <2> (Real* const sx, const Real x_old, const int i_new){
+    const int i = (int) (x_old+0.5);
+    const int i_shift = i - (i_new + 1);
+    const Real xint = x_old - i;
+    sx[1+i_shift] = 0.5*(0.5-xint)*(0.5-xint);
+    sx[2+i_shift] = 0.75-xint*xint;
+    sx[3+i_shift] = 0.5*(0.5+xint)*(0.5+xint);
+    // index of the leftmost cell where particle deposits
+    return i-1;
+}
+
+// Compute shape factor for order 3.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <3> (Real* const sx, const Real x_old, const int i_new){
+    const int i = (int) x_old;
+    const int i_shift = i - (i_new + 1);
+    const Real xint = x_old - i;
+    sx[1+i_shift] = 1.0/6.0*(1.0-xint)*(1.0-xint)*(1.0-xint);
+    sx[2+i_shift] = 2.0/3.0-xint*xint*(1-xint/2.0);
+    sx[3+i_shift] = 2.0/3.0-(1-xint)*(1-xint)*(1.0-0.5*(1-xint));
+    sx[4+i_shift] = 1.0/6.0*xint*xint*xint;
+    // index of the leftmost cell where particle deposits
+    return i-1;
+}
+
+/* \brief Esirkepov Current Deposition for thread thread_num
+ * /param xp, yp, zp   : Pointer to arrays of particle positions.
+ * \param wp           : Pointer to array of particle weights.
+ * \param uxp uyp uzp  : Pointer to arrays of particle momentum.
+ * \param Jx_arr       : Array4 of current density, either full array or tile.
+ * \param Jy_arr       : Array4 of current density, either full array or tile.
+ * \param Jz_arr       : Array4 of current density, either full array or tile.
+ * \param np_to_depose : Number of particles for which current is deposited.
+ * \param dt           : Time step for particle level
+ * \param dx           : 3D cell size
+ * \param xyzmin       : Physical lower bounds of domain.
+ * \param lo           : Index lower bounds of domain.
+ * /param q            : species charge.
+ */
+template <int depos_order>
+void doEsirkepovDepositionShapeN (const Real * const xp, const Real * const yp, const Real * const zp,
+                                  const Real * const wp, const Real * const uxp,
+                                  const Real * const uyp, const Real * const uzp,
+                                  const amrex::Array4<amrex::Real>& Jx_arr,
+                                  const amrex::Array4<amrex::Real>& Jy_arr,
+                                  const amrex::Array4<amrex::Real>& Jz_arr,
+                                  const long np_to_depose,
+                                  const amrex::Real dt, const std::array<amrex::Real,3>& dx,
+                                  const std::array<Real, 3> xyzmin,
+                                  const Dim3 lo,
+                                  const amrex::Real q)
+{
+    const Real dxi = 1.0/dx[0];
+    const Real dtsdx0 = dt*dxi;
+    const Real xmin = xyzmin[0];
+#if (AMREX_SPACEDIM == 3)
+    const Real dyi = 1.0/dx[1];
+    const Real dtsdy0 = dt*dyi;
+    const Real ymin = xyzmin[1];
+#endif
+    const Real dzi = 1.0/dx[2];
+    const Real dtsdz0 = dt*dzi;
+    const Real zmin = xyzmin[2];
+
+#if (AMREX_SPACEDIM == 3)
+    const Real invdtdx = 1.0/(dt*dx[1]*dx[2]);
+    const Real invdtdy = 1.0/(dt*dx[0]*dx[2]);
+    const Real invdtdz = 1.0/(dt*dx[0]*dx[1]);
+#elif (AMREX_SPACEDIM == 2)
+    const Real invdtdx = 1.0/(dt*dx[2]);
+    const Real invdtdz = 1.0/(dt*dx[0]);
+    const Real invvol = 1.0/(dx[0]*dx[2]);
+#endif
+
+    const Real clightsq = 1.0/PhysConst::c/PhysConst::c;
+
+    // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
+    ParallelFor( np_to_depose,
+                 [=] AMREX_GPU_DEVICE (long ip) {
+
+                     // --- Get particle quantities
+                     const Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
+                                                           + uyp[ip]*uyp[ip]*clightsq
+                                                           + uzp[ip]*uzp[ip]*clightsq);
+
+                     // wqx, wqy wqz are particle current in each direction
+                     const Real wq = q*wp[ip];
+                     const Real wqx = wq*invdtdx;
+#if (AMREX_SPACEDIM == 3)
+                     const Real wqy = wq*invdtdy;
+#endif
+                     const Real wqz = wq*invdtdz;
+
+                     // computes current and old position in grid units
+                     const Real x_new = (xp[ip] - xmin)*dxi;
+                     const Real x_old = x_new - dtsdx0*uxp[ip]*gaminv;
+#if (AMREX_SPACEDIM == 3)        
+                     const Real y_new = (yp[ip] - ymin)*dyi;
+                     const Real y_old = y_new - dtsdy0*uyp[ip]*gaminv;
+#endif
+                     const Real z_new = (zp[ip] - zmin)*dzi;
+                     const Real z_old = z_new - dtsdz0*uzp[ip]*gaminv;
+
+                     // Shape factor arrays
+                     // Note that there are extra values above and below
+                     // to possibly hold the factor for the old particle
+                     // which can be at a different grid location.
+                     Real AMREX_RESTRICT sx_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sx_old[depos_order + 3] = {0.};
+#if (AMREX_SPACEDIM == 3)
+                     Real AMREX_RESTRICT sy_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sy_old[depos_order + 3] = {0.};
+#endif
+                     Real AMREX_RESTRICT sz_new[depos_order + 3] = {0.};
+                     Real AMREX_RESTRICT sz_old[depos_order + 3] = {0.};
+
+                     // --- Compute shape factors
+                     // Compute shape factors for position as they are now and at old positions
+                     // [ijk]_new: leftmost grid point that the particle touches
+                     const int i_new = compute_shape_factor<depos_order>(sx_new+1, x_new);
+                     const int i_old = compute_shifted_shape_factor<depos_order>(sx_old, x_old, i_new);
+#if (AMREX_SPACEDIM == 3)
+                     const int j_new = compute_shape_factor<depos_order>(sy_new+1, y_new);
+                     const int j_old = compute_shifted_shape_factor<depos_order>(sy_old, y_old, j_new);
+#endif 
+                     const int k_new = compute_shape_factor<depos_order>(sz_new+1, z_new);
+                     const int k_old = compute_shifted_shape_factor<depos_order>(sz_old, z_old, k_new);
+
+                     // computes min/max positions of current contributions
+                     int dil = 1, diu = 1;
+                     if (i_old < i_new) dil = 0;
+                     if (i_old > i_new) diu = 0;
+#if (AMREX_SPACEDIM == 3)
+                     int djl = 1, dju = 1;
+                     if (j_old < j_new) djl = 0;
+                     if (j_old > j_new) dju = 0;
+#endif
+                     int dkl = 1, dku = 1;
+                     if (k_old < k_new) dkl = 0;
+                     if (k_old > k_new) dku = 0;
+
+#if (AMREX_SPACEDIM == 3)
+
+                     for (int k=dkl; k<=depos_order+2-dku; k++) {
+                         for (int j=djl; j<=depos_order+2-dju; j++) {
+                             Real sdxi = 0.;
+                             for (int i=dil; i<=depos_order+1-diu; i++) {
+                                 sdxi += wqx*(sx_old[i] - sx_new[i])*((sy_new[j] + 0.5*(sy_old[j] - sy_new[j]))*sz_new[k] +
+                                         (0.5*sy_new[j] + 1./3.*(sy_old[j] - sy_new[j]))*(sz_old[k] - sz_new[k]));
+                                 amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
+                             }
+                         }
+                     }
+                     for (int k=dkl; k<=depos_order+2-dku; k++) {
+                         for (int i=dil; i<=depos_order+2-diu; i++) {
+                             Real sdyj = 0.;
+                             for (int j=djl; j<=depos_order+1-dju; j++) {
+                                 sdyj += wqy*(sy_old[j] - sy_new[j])*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
+                                         (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
+                                 amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
+                             }
+                         }
+                     }
+                     for (int j=djl; j<=depos_order+2-dju; j++) {
+                         for (int i=dil; i<=depos_order+2-diu; i++) {
+                             Real sdzk = 0.;
+                             for (int k=dkl; k<=depos_order+1-dku; k++) {
+                                 sdzk += wqz*(sz_old[k] - sz_new[k])*((sx_new[i] + 0.5*(sx_old[i] - sx_new[i]))*sy_new[j] +
+                                         (0.5*sx_new[i] + 1./3.*(sx_old[i] - sx_new[i]))*(sy_old[j] - sy_new[j]));
+                                 amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
+                             }
+                         }
+                     }
+
+#elif (AMREX_SPACEDIM == 2)
+
+                    for (int k=dkl; k<=depos_order+2-dku; k++) {
+                        Real sdxi = 0.;
+                        for (int i=dil; i<=depos_order+1-diu; i++) {
+                            sdxi += wqx*(sx_old[i] - sx_new[i])*(sz_new[k] + 0.5*(sz_old[k] - sz_new[k]));
+                            amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdxi);
+                        }
+                    }
+                    for (int k=dkl; k<=depos_order+2-dku; k++) {
+                        for (int i=dil; i<=depos_order+2-diu; i++) {
+                            const Real sdyj = wq*uyp[ip]*gaminv*invvol*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
+                                        (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
+                            amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdyj);
+                        }
+                    }
+                    for (int i=dil; i<=depos_order+2-diu; i++) {
+                        Real sdzk = 0.;
+                        for (int k=dkl; k<=depos_order+1-dku; k++) {
+                            sdzk += wqz*(sz_old[k] - sz_new[k])*(sx_new[i] + 0.5*(sx_old[i] - sx_new[i]));
+                            amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdzk);
+                        }
+                    }
+
+#endif
+                 }
+        );
+
+
+
+}
+
 #endif // CURRENTDEPOSITION_H_
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index 7316dcc95..a20f0035e 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -6,6 +6,7 @@
 #include <AMReX_AmrParGDB.H>
 #include <WarpX_f.H>
 #include <WarpX.H>
+#include <WarpXAlgorithmSelection.H>
 
 // Import low-level single-particle kernels
 #include <GetAndSetPosition.H>
@@ -510,21 +511,40 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
     // Better for memory? worth trying?    
     const Dim3 lo = lbound(tilebox);
 
-    if        (WarpX::nox == 1){
-        doDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                              uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                              jz_arr, offset, np_to_depose, dt, dx,
-                              xyzmin, lo, stagger_shift, q);
-    } else if (WarpX::nox == 2){
-        doDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                              uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                              jz_arr, offset, np_to_depose, dt, dx,
-                              xyzmin, lo, stagger_shift, q);
-    } else if (WarpX::nox == 3){
-        doDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                              uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                              jz_arr, offset, np_to_depose, dt, dx,
-                              xyzmin, lo, stagger_shift, q);
+    if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov) {
+        if        (WarpX::nox == 1){
+            doEsirkepovDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                           jz_arr, np_to_depose, dt, dx,
+                                           xyzmin, lo, q);
+        } else if (WarpX::nox == 2){
+            doEsirkepovDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                           jz_arr, np_to_depose, dt, dx,
+                                           xyzmin, lo, q);
+        } else if (WarpX::nox == 3){
+            doEsirkepovDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                           jz_arr, np_to_depose, dt, dx,
+                                           xyzmin, lo, q);
+        }
+    } else {
+        if        (WarpX::nox == 1){
+            doDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                  jz_arr, offset, np_to_depose, dt, dx,
+                                  xyzmin, lo, stagger_shift, q);
+        } else if (WarpX::nox == 2){
+            doDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                  jz_arr, offset, np_to_depose, dt, dx,
+                                  xyzmin, lo, stagger_shift, q);
+        } else if (WarpX::nox == 3){
+            doDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
+                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+                                  jz_arr, offset, np_to_depose, dt, dx,
+                                  xyzmin, lo, stagger_shift, q);
+        }
     }
 
 #ifndef AMREX_USE_GPU
diff --git a/Source/WarpX.H b/Source/WarpX.H
index ca7596589..35de5c88d 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -26,9 +26,11 @@
 #include <NCIGodfreyFilter.H>
 
 #ifdef WARPX_USE_PSATD
-#include <fftw3.h>
 #include <SpectralSolver.H>
 #endif
+#ifdef WARPX_USE_PSATD_HYBRID
+#include <PicsarHybridFFTData.H>
+#endif
 
 #if defined(BL_USE_SENSEI_INSITU)
 namespace amrex {
@@ -578,7 +580,7 @@ private:
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Efield_slice;
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Bfield_slice;
 
-#ifdef WARPX_USE_PSATD
+#ifdef WARPX_USE_PSATD_HYBRID
     // Store fields in real space on the dual grid (i.e. the grid for the FFT push of the fields)
     // This includes data for the FFT guard cells (between FFT groups)
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Efield_fp_fft;
@@ -590,44 +592,30 @@ private:
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > Bfield_cp_fft;
     amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > current_cp_fft;
     amrex::Vector<            std::unique_ptr<amrex::MultiFab>      > rho_cp_fft;
+#endif
 
-public:
-    // FFTData is a stuct containing a 22 pointers to auxiliary arrays
-    // 1-11: padded arrays in real space (required by FFTW); 12-22: arrays in spectral space
-    struct FFTData
-    {
-        static constexpr int N = 22;
-        void* data[N] = { nullptr };
-
-        ~FFTData () {
-            for (int i = 0; i < N; ++i) { // The memory is allocated with fftw_alloc.
-                fftw_free(data[i]);
-                data[i] = nullptr;
-            }
-        }
-
-        FFTData () = default;
+#ifdef WARPX_USE_PSATD
+private:
+    void EvolvePSATD (int numsteps);
+    void PushPSATD (amrex::Real dt);
+    void PushPSATD_localFFT (int lev, amrex::Real dt);
 
-        FFTData (FFTData && rhs) noexcept {
-            for (int i = 0; i < N; ++i) {
-                data[i] = rhs.data[i];
-                rhs.data[i] = nullptr;
-            }
-        }
+    bool fft_hybrid_mpi_decomposition = false;
+    int ngroups_fft = 4;
+    int fftw_plan_measure = 1;
+    int nox_fft = 16;
+    int noy_fft = 16;
+    int noz_fft = 16;
 
-        FFTData (FFTData const&) = delete;
-        void operator= (FFTData const&) = delete;
-        void operator= (FFTData&&) = delete;
-    };
+    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_fp;
+    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_cp;
+#endif
 
+#ifdef WARPX_USE_PSATD_HYBRID
 private:
-
     amrex::Vector<std::unique_ptr<amrex::LayoutData<FFTData> > > dataptr_fp_fft;
     amrex::Vector<std::unique_ptr<amrex::LayoutData<FFTData> > > dataptr_cp_fft;
 
-    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_fp;
-    amrex::Vector<std::unique_ptr<SpectralSolver>> spectral_solver_cp;
-
   // Domain decomposition containing the valid part of the dual grids (i.e. without FFT guard cells)
     amrex::Vector<amrex::BoxArray> ba_valid_fp_fft;
     amrex::Vector<amrex::BoxArray> ba_valid_cp_fft;
@@ -638,13 +626,6 @@ private:
     amrex::Vector<MPI_Comm> comm_fft;
     amrex::Vector<int>      color_fft;
 
-    bool fft_hybrid_mpi_decomposition = false;
-    int ngroups_fft = 4;
-    int fftw_plan_measure = 1;
-    int nox_fft = 16;
-    int noy_fft = 16;
-    int noz_fft = 16;
-
     void AllocLevelDataFFT (int lev);
     void InitLevelDataFFT (int lev, amrex::Real time);
 
@@ -654,12 +635,7 @@ private:
                                  const amrex::Box& domain);
     void InitFFTDataPlan (int lev);
     void FreeFFT (int lev);
-
-    void EvolvePSATD (int numsteps);
-    void PushPSATD (amrex::Real dt);
-    void PushPSATD_localFFT (int lev, amrex::Real dt);
     void PushPSATD_hybridFFT (int lev, amrex::Real dt);
-
 #endif
 
 #if defined(BL_USE_SENSEI_INSITU)
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index 08948227c..1c0d01ce5 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -202,6 +202,10 @@ WarpX::WarpX ()
     costs.resize(nlevs_max);
 
 #ifdef WARPX_USE_PSATD
+    spectral_solver_fp.resize(nlevs_max);
+    spectral_solver_cp.resize(nlevs_max);
+#endif
+#ifdef WARPX_USE_PSATD_HYBRID
     Efield_fp_fft.resize(nlevs_max);
     Bfield_fp_fft.resize(nlevs_max);
     current_fp_fft.resize(nlevs_max);
@@ -215,9 +219,6 @@ WarpX::WarpX ()
     dataptr_fp_fft.resize(nlevs_max);
     dataptr_cp_fft.resize(nlevs_max);
 
-    spectral_solver_fp.resize(nlevs_max);
-    spectral_solver_cp.resize(nlevs_max);
-
     ba_valid_fp_fft.resize(nlevs_max);
     ba_valid_cp_fft.resize(nlevs_max);
 
@@ -492,10 +493,6 @@ WarpX::ReadParameters ()
         pp.query("use_picsar_deposition", use_picsar_deposition);
 #endif
         current_deposition_algo = GetAlgorithmInteger(pp, "current_deposition");
-        if (!use_picsar_deposition){
-            AMREX_ALWAYS_ASSERT_WITH_MESSAGE( current_deposition_algo >= 2, 
-                "if not use_picsar_deposition, cannot use Esirkepov deposition.");
-        }
         charge_deposition_algo = GetAlgorithmInteger(pp, "charge_deposition");
         field_gathering_algo = GetAlgorithmInteger(pp, "field_gathering");
         particle_pusher_algo = GetAlgorithmInteger(pp, "particle_pusher");
@@ -511,8 +508,6 @@ WarpX::ReadParameters ()
         pp.query("nox", nox_fft);
         pp.query("noy", noy_fft);
         pp.query("noz", noz_fft);
-        // Override value
-        if (fft_hybrid_mpi_decomposition==false) ngroups_fft=ParallelDescriptor::NProcs();
     }
 #endif
 
@@ -568,8 +563,12 @@ WarpX::MakeNewLevelFromScratch (int lev, Real time, const BoxArray& new_grids,
 
 #ifdef WARPX_USE_PSATD
     if (fft_hybrid_mpi_decomposition){
+#ifdef WARPX_USE_PSATD_HYBRID
         AllocLevelDataFFT(lev);
         InitLevelDataFFT(lev, time);
+#else
+    amrex::Abort("The option `psatd.fft_hybrid_mpi_decomposition` does not work on GPU.");
+#endif
     }
 #endif
 }
@@ -614,7 +613,7 @@ WarpX::ClearLevel (int lev)
 
     costs[lev].reset();
 
-#ifdef WARPX_USE_PSATD
+#ifdef WARPX_USE_PSATD_HYBRID
     for (int i = 0; i < 3; ++i) {
         Efield_fp_fft[lev][i].reset();
         Bfield_fp_fft[lev][i].reset();