Diffstat (limited to 'Source')
 Source/FieldSolver/Make.package                |   1
 Source/FieldSolver/WarpX_FDTD.H                |  68
 Source/FortranInterface/WarpX_f.F90            | 137
 Source/Laser/LaserParticleContainer.cpp        |  13
 Source/Particles/PhysicalParticleContainer.cpp |  12
 Source/Particles/WarpXParticleContainer.H      |  50
 Source/Particles/WarpXParticleContainer.cpp    | 285
 Source/WarpX.cpp                               | 127
 8 files changed, 318 insertions(+), 375 deletions(-)
diff --git a/Source/FieldSolver/Make.package b/Source/FieldSolver/Make.package
index 12dc44e5b..7eea52ee7 100644
--- a/Source/FieldSolver/Make.package
+++ b/Source/FieldSolver/Make.package
@@ -1,4 +1,5 @@
 CEXE_headers += WarpX_K.H
+CEXE_headers += WarpX_FDTD.H
 CEXE_sources += WarpXPushFieldsEM.cpp
 ifeq ($(USE_PSATD),TRUE)
    CEXE_sources += WarpXFFT.cpp
diff --git a/Source/FieldSolver/WarpX_FDTD.H b/Source/FieldSolver/WarpX_FDTD.H
new file mode 100644
index 000000000..865277446
--- /dev/null
+++ b/Source/FieldSolver/WarpX_FDTD.H
@@ -0,0 +1,68 @@
+#ifndef WARPX_FDTD_H_
+#define WARPX_FDTD_H_
+
+#include <AMReX_FArrayBox.H>
+
+using namespace amrex;
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_computedivb(int i, int j, int k, int dcomp, Array4<Real> const& divB,
+                       Array4<Real const> const& Bx,
+                       Array4<Real const> const& By,
+                       Array4<Real const> const& Bz,
+                       Real dxinv, Real dyinv, Real dzinv
+#ifdef WARPX_RZ
+                       ,const Real rmin
+#endif
+                       )
+{
+#if (AMREX_SPACEDIM == 3)
+    divB(i,j,k,dcomp) = (Bx(i+1,j  ,k  ) - Bx(i,j,k))*dxinv
+                      + (By(i  ,j+1,k  ) - By(i,j,k))*dyinv
+                      + (Bz(i  ,j  ,k+1) - Bz(i,j,k))*dzinv;
+#else
+#ifndef WARPX_RZ
+    divB(i,j,0,dcomp) = (Bx(i+1,j  ,0) - Bx(i,j,0))*dxinv
+                      + (Bz(i  ,j+1,0) - Bz(i,j,0))*dzinv;
+#else
+    Real ru = 1. + 0.5/(rmin*dxinv + i + 0.5);
+    Real rd = 1. - 0.5/(rmin*dxinv + i + 0.5);
+    divB(i,j,0,dcomp) = (ru*Bx(i+1,j,0) - rd*Bx(i,j,0))*dxinv
+                      + (Bz(i,j+1,0) - Bz(i,j,0))*dzinv;
+#endif
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_computedive(int i, int j, int k, int dcomp, Array4<Real> const& divE,
+                       Array4<Real const> const& Ex,
+                       Array4<Real const> const& Ey,
+                       Array4<Real const> const& Ez,
+                       Real dxinv, Real dyinv, Real dzinv
+#ifdef WARPX_RZ
+                       ,const Real rmin
+#endif
+                       )
+{
+#if (AMREX_SPACEDIM == 3)
+    divE(i,j,k,dcomp) = (Ex(i,j,k) - Ex(i-1,j,k))*dxinv
+                      + (Ey(i,j,k) - Ey(i,j-1,k))*dyinv
+                      + (Ez(i,j,k) - Ez(i,j,k-1))*dzinv;
+#else
+#ifndef WARPX_RZ
+    divE(i,j,0,dcomp) = (Ex(i,j,0) - Ex(i-1,j,0))*dxinv
+                      + (Ez(i,j,0) - Ez(i,j-1,0))*dzinv;
+#else
+    if (i == 0 && rmin == 0.) {
+        // the bulk equation diverges on axis
+        // (due to the 1/r terms). The following expressions regularize
+        // these divergences.
+        divE(i,j,0,dcomp) = 4.*Ex(i,j,0)*dxinv
+                          + (Ez(i,j,0) - Ez(i,j-1,0))*dzinv;
+    } else {
+        Real ru = 1. + 0.5/(rmin*dxinv + i);
+        Real rd = 1. - 0.5/(rmin*dxinv + i);
+        divE(i,j,0,dcomp) = (ru*Ex(i,j,0) - rd*Ex(i-1,j,0))*dxinv
+                          + (Ez(i,j,0) - Ez(i,j-1,0))*dzinv;
+    }
+#endif
+#endif
+}
+
+#endif
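Editor's note on the i == 0 branch above (a sketch of the derivation, not part of the commit): in r-z geometry the divergence is

    div E = (1/r) d(r E_r)/dr + d(E_z)/dz,

which is singular on axis. Applying Gauss's theorem to a thin cylinder of radius dr/2 centered on the axis (side flux 2*pi*(dr/2)*h*E_r(dr/2), volume pi*(dr/2)^2*h) gives

    div E ~= 2 E_r(dr/2) / (dr/2) + d(E_z)/dz = 4 E_r(dr/2)/dr + d(E_z)/dz,

which is exactly the 4.*Ex(i,j,0)*dxinv term: in RZ builds Ex holds the radial component, staggered half a cell off the axis.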
diff --git a/Source/FortranInterface/WarpX_f.F90 b/Source/FortranInterface/WarpX_f.F90
index d7f7a8053..0560a9981 100644
--- a/Source/FortranInterface/WarpX_f.F90
+++ b/Source/FortranInterface/WarpX_f.F90
@@ -84,143 +84,6 @@ contains
 
   end subroutine warpx_compute_E
 
-  subroutine warpx_compute_divb_3d (lo, hi, divB, dlo, dhi, &
-       Bx, xlo, xhi, By, ylo, yhi, Bz, zlo, zhi, dx) &
-       bind(c, name='warpx_compute_divb_3d')
-    integer, intent(in) :: lo(3),hi(3),dlo(3),dhi(3),xlo(3),xhi(3),ylo(3),yhi(3),zlo(3),zhi(3)
-    real(amrex_real), intent(in) :: dx(3)
-    real(amrex_real), intent(in   ) :: Bx  (xlo(1):xhi(1),xlo(2):xhi(2),xlo(3):xhi(3))
-    real(amrex_real), intent(in   ) :: By  (ylo(1):yhi(1),ylo(2):yhi(2),ylo(3):yhi(3))
-    real(amrex_real), intent(in   ) :: Bz  (zlo(1):zhi(1),zlo(2):zhi(2),zlo(3):zhi(3))
-    real(amrex_real), intent(inout) :: divB(dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3))
-
-    integer :: i,j,k
-    real(amrex_real) :: dxinv(3)
-
-    dxinv = 1.d0/dx
-
-    do k = lo(3), hi(3)
-       do j = lo(2), hi(2)
-          do i = lo(1), hi(1)
-             divB(i,j,k) = dxinv(1) * (Bx(i+1,j  ,k  ) - Bx(i,j,k)) &
-                  +        dxinv(2) * (By(i  ,j+1,k  ) - By(i,j,k)) &
-                  +        dxinv(3) * (Bz(i  ,j  ,k+1) - Bz(i,j,k))
-          end do
-       end do
-    end do
-  end subroutine warpx_compute_divb_3d
-
-
-  subroutine warpx_compute_divb_2d (lo, hi, divB, dlo, dhi, &
-       Bx, xlo, xhi, By, ylo, yhi, Bz, zlo, zhi, dx) &
-       bind(c, name='warpx_compute_divb_2d')
-    integer, intent(in) :: lo(2),hi(2),dlo(2),dhi(2),xlo(2),xhi(2),ylo(2),yhi(2),zlo(2),zhi(2)
-    real(amrex_real), intent(in) :: dx(3)
-    real(amrex_real), intent(in   ) :: Bx  (xlo(1):xhi(1),xlo(2):xhi(2))
-    real(amrex_real), intent(in   ) :: By  (ylo(1):yhi(1),ylo(2):yhi(2))
-    real(amrex_real), intent(in   ) :: Bz  (zlo(1):zhi(1),zlo(2):zhi(2))
-    real(amrex_real), intent(inout) :: divB(dlo(1):dhi(1),dlo(2):dhi(2))
-
-    integer :: i,k
-    real(amrex_real) :: dxinv(3)
-
-    dxinv = 1.d0/dx
-
-    do k = lo(2), hi(2)
-       do i = lo(1), hi(1)
-          divB(i,k) = dxinv(1) * (Bx(i+1,k  ) - Bx(i,k)) &
-               +      dxinv(3) * (Bz(i  ,k+1) - Bz(i,k))
-       end do
-    end do
-  end subroutine warpx_compute_divb_2d
-
-
-  subroutine warpx_compute_dive_3d (lo, hi, dive, dlo, dhi, &
-       Ex, xlo, xhi, Ey, ylo, yhi, Ez, zlo, zhi, dx) &
-       bind(c, name='warpx_compute_dive_3d')
-    integer, intent(in) :: lo(3),hi(3),dlo(3),dhi(3),xlo(3),xhi(3),ylo(3),yhi(3),zlo(3),zhi(3)
-    real(amrex_real), intent(in) :: dx(3)
-    real(amrex_real), intent(in   ) :: Ex  (xlo(1):xhi(1),xlo(2):xhi(2),xlo(3):xhi(3))
-    real(amrex_real), intent(in   ) :: Ey  (ylo(1):yhi(1),ylo(2):yhi(2),ylo(3):yhi(3))
-    real(amrex_real), intent(in   ) :: Ez  (zlo(1):zhi(1),zlo(2):zhi(2),zlo(3):zhi(3))
-    real(amrex_real), intent(inout) :: dive(dlo(1):dhi(1),dlo(2):dhi(2),dlo(3):dhi(3))
-
-    integer :: i,j,k
-    real(amrex_real) :: dxinv(3)
-
-    dxinv = 1.d0/dx
-
-    do k = lo(3), hi(3)
-       do j = lo(2), hi(2)
-          do i = lo(1), hi(1)
-             dive(i,j,k) = dxinv(1) * (Ex(i,j,k) - Ex(i-1,j,k)) &
-                  +        dxinv(2) * (Ey(i,j,k) - Ey(i,j-1,k)) &
-                  +        dxinv(3) * (Ez(i,j,k) - Ez(i,j,k-1))
-          end do
-       end do
-    end do
-  end subroutine warpx_compute_dive_3d
-
-
-  subroutine warpx_compute_dive_2d (lo, hi, dive, dlo, dhi, &
-       Ex, xlo, xhi, Ey, ylo, yhi, Ez, zlo, zhi, dx) &
-       bind(c, name='warpx_compute_dive_2d')
-    integer, intent(in) :: lo(2),hi(2),dlo(2),dhi(2),xlo(2),xhi(2),ylo(2),yhi(2),zlo(2),zhi(2)
-    real(amrex_real), intent(in) :: dx(3)
-    real(amrex_real), intent(in   ) :: Ex  (xlo(1):xhi(1),xlo(2):xhi(2))
-    real(amrex_real), intent(in   ) :: Ey  (ylo(1):yhi(1),ylo(2):yhi(2))
-    real(amrex_real), intent(in   ) :: Ez  (zlo(1):zhi(1),zlo(2):zhi(2))
-    real(amrex_real), intent(inout) :: dive(dlo(1):dhi(1),dlo(2):dhi(2))
-
-    integer :: i,k
-    real(amrex_real) :: dxinv(3)
-
-    dxinv = 1.d0/dx
-
-    do k = lo(2), hi(2)
-       do i = lo(1), hi(1)
-          dive(i,k) = dxinv(1) * (Ex(i,k) - Ex(i-1,k)) &
-               +      dxinv(3) * (Ez(i,k) - Ez(i,k-1))
-       end do
-    end do
-  end subroutine warpx_compute_dive_2d
-
-
-  subroutine warpx_compute_dive_rz (lo, hi, dive, dlo, dhi, &
-       Ex, xlo, xhi, Ey, ylo, yhi, Ez, zlo, zhi, dx, rmin) &
-       bind(c, name='warpx_compute_dive_rz')
-    integer, intent(in) :: lo(2),hi(2),dlo(2),dhi(2),xlo(2),xhi(2),ylo(2),yhi(2),zlo(2),zhi(2)
-    real(amrex_real), intent(in) :: dx(3), rmin
-    real(amrex_real), intent(in   ) :: Ex  (xlo(1):xhi(1),xlo(2):xhi(2))
-    real(amrex_real), intent(in   ) :: Ey  (ylo(1):yhi(1),ylo(2):yhi(2))
-    real(amrex_real), intent(in   ) :: Ez  (zlo(1):zhi(1),zlo(2):zhi(2))
-    real(amrex_real), intent(inout) :: dive(dlo(1):dhi(1),dlo(2):dhi(2))
-
-    integer :: i,k
-    real(amrex_real) :: dxinv(3)
-    real(amrex_real) :: ru, rd
-
-    dxinv = 1.d0/dx
-
-    do k = lo(2), hi(2)
-       do i = lo(1), hi(1)
-          if (i == 0 .and. rmin == 0.) then
-             ! the bulk equation diverges on axis
-             ! (due to the 1/r terms). The following expressions regularize
-             ! these divergences.
-             dive(i,k) = 4.*dxinv(1) * Ex(i,k) &
-                  +      dxinv(3) * (Ez(i,k) - Ez(i,k-1))
-          else
-             ru = 1.d0 + 0.5d0/(rmin*dxinv(1) + i)
-             rd = 1.d0 - 0.5d0/(rmin*dxinv(1) + i)
-             dive(i,k) = dxinv(1) * (ru*Ex(i,k) - rd*Ex(i-1,k)) &
-                  +      dxinv(3) * (Ez(i,k) - Ez(i,k-1))
-          end if
-       end do
-    end do
-  end subroutine warpx_compute_dive_rz
-
-
   subroutine warpx_sync_current_2d (lo, hi, crse, clo, chi, fine, flo, fhi, dir) &
        bind(c, name='warpx_sync_current_2d')
     integer, intent(in) :: lo(2), hi(2), flo(2), fhi(2), clo(2), chi(2), dir
diff --git a/Source/Laser/LaserParticleContainer.cpp b/Source/Laser/LaserParticleContainer.cpp
index de410b31f..515aa1f5d 100644
--- a/Source/Laser/LaserParticleContainer.cpp
+++ b/Source/Laser/LaserParticleContainer.cpp
@@ -503,8 +503,17 @@ LaserParticleContainer::Evolve (int lev,
             //
             // Current Deposition
             //
-            DepositCurrent(pti, wp, uxp, uyp, uzp, jx, jy, jz,
-                           cjx, cjy, cjz, np_current, np, thread_num, lev, dt);
+            // Deposit inside domains
+            DepositCurrent(pti, wp, uxp, uyp, uzp, &jx, &jy, &jz,
+                           0, np_current, thread_num,
+                           lev, lev, dt);
+            bool has_buffer = cjx;
+            if (has_buffer){
+                // Deposit in buffers
+                DepositCurrent(pti, wp, uxp, uyp, uzp, cjx, cjy, cjz,
+                               np_current, np-np_current, thread_num,
+                               lev, lev-1, dt);
+            }
 
             //
             // copy particle data back
diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp
index 7e7c9534e..43b46ec49 100644
--- a/Source/Particles/PhysicalParticleContainer.cpp
+++ b/Source/Particles/PhysicalParticleContainer.cpp
@@ -1529,8 +1529,16 @@ PhysicalParticleContainer::Evolve (int lev,
             //
             // Current Deposition
             //
-            DepositCurrent(pti, wp, uxp, uyp, uzp, jx, jy, jz,
-                           cjx, cjy, cjz, np_current, np, thread_num, lev, dt);
+            // Deposit inside domains
+            DepositCurrent(pti, wp, uxp, uyp, uzp, &jx, &jy, &jz,
+                           0, np_current, thread_num,
+                           lev, lev, dt);
+            if (has_buffer){
+                // Deposit in buffers
+                DepositCurrent(pti, wp, uxp, uyp, uzp, cjx, cjy, cjz,
+                               np_current, np-np_current, thread_num,
+                               lev, lev-1, dt);
+            }
 
             //
             // copy particle data back
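Both Evolve hunks above replace the single eleven-argument call with the same two-call pattern. A condensed sketch of the new calling convention (editor's illustration; names are taken from the hunks, and `has_buffer` stands for the `cjx != nullptr` test):

    // Fine-level deposition: particles [0, np_current) deposit on lev.
    DepositCurrent(pti, wp, uxp, uyp, uzp, &jx, &jy, &jz,
                   0, np_current, thread_num, lev, lev, dt);
    // Buffer deposition: the remaining particles [np_current, np)
    // deposit into the coarse-patch buffers (depos_lev = lev-1).
    if (has_buffer) {
        DepositCurrent(pti, wp, uxp, uyp, uzp, cjx, cjy, cjz,
                       np_current, np - np_current, thread_num,
                       lev, lev - 1, dt);
    }

The old np_current/np pair is thus generalized to an (offset, np_to_depose) window plus an explicit deposition level.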
diff --git a/Source/Particles/WarpXParticleContainer.H b/Source/Particles/WarpXParticleContainer.H
index 1e3dfb2ae..0e800bf1d 100644
--- a/Source/Particles/WarpXParticleContainer.H
+++ b/Source/Particles/WarpXParticleContainer.H
@@ -164,32 +164,30 @@ public:
                                bool local = false);
     std::unique_ptr<amrex::MultiFab> GetChargeDensity(int lev, bool local = false);
 
-    virtual void DepositCharge(WarpXParIter& pti,
-                               RealVector& wp,
-                               amrex::MultiFab* rhomf,
-                               amrex::MultiFab* crhomf,
-                               int icomp,
-                               const long np_current,
-                               const long np,
-                               int thread_num,
-                               int lev );
-
-    virtual void DepositCurrent(WarpXParIter& pti,
-                                RealVector& wp,
-                                RealVector& uxp,
-                                RealVector& uyp,
-                                RealVector& uzp,
-                                amrex::MultiFab& jx,
-                                amrex::MultiFab& jy,
-                                amrex::MultiFab& jz,
-                                amrex::MultiFab* cjx,
-                                amrex::MultiFab* cjy,
-                                amrex::MultiFab* cjz,
-                                const long np_current,
-                                const long np,
-                                int thread_num,
-                                int lev,
-                                amrex::Real dt );
+    virtual void DepositCharge(WarpXParIter& pti,
+                               RealVector& wp,
+                               amrex::MultiFab* rhomf,
+                               amrex::MultiFab* crhomf,
+                               int icomp,
+                               const long np_current,
+                               const long np,
+                               int thread_num,
+                               int lev );
+
+    virtual void DepositCurrent(WarpXParIter& pti,
+                                RealVector& wp,
+                                RealVector& uxp,
+                                RealVector& uyp,
+                                RealVector& uzp,
+                                amrex::MultiFab* jx,
+                                amrex::MultiFab* jy,
+                                amrex::MultiFab* jz,
+                                const long offset,
+                                const long np_to_depose,
+                                int thread_num,
+                                int lev,
+                                int depos_lev,
+                                amrex::Real dt);
 
     // If particles start outside of the domain, ContinuousInjection
     // makes sure that they are initialized when they enter the domain, and
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index ac532912d..5cdcdd1e3 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -279,187 +279,140 @@ WarpXParticleContainer::AddNParticles (int lev,
 
     Redistribute();
 }
 
-
+/* \brief Current Deposition for thread thread_num
+ * \param pti          : Particle iterator
+ * \param wp           : Array of particle weights
+ * \param uxp uyp uzp  : Arrays of particle momenta
+ * \param jx jy jz     : Full array of current density
+ * \param offset       : Index of first particle for which current is deposited
+ * \param np_to_depose : Number of particles for which current is deposited.
+                         Particles [offset, offset+np_to_depose) deposit their
+                         current
+ * \param thread_num   : Thread number (if tiling)
+ * \param lev          : Level of box that contains particles
+ * \param depos_lev    : Level on which particles deposit (if buffers are used)
+ * \param dt           : Time step for particle level
+ */
 void
 WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
                                        RealVector& wp, RealVector& uxp,
                                        RealVector& uyp, RealVector& uzp,
-                                       MultiFab& jx, MultiFab& jy, MultiFab& jz,
-                                       MultiFab* cjx, MultiFab* cjy, MultiFab* cjz,
-                                       const long np_current, const long np,
-                                       int thread_num, int lev, Real dt )
+                                       MultiFab* jx, MultiFab* jy, MultiFab* jz,
+                                       const long offset, const long np_to_depose,
+                                       int thread_num, int lev, int depos_lev,
+                                       Real dt)
 {
-    Real *jx_ptr, *jy_ptr, *jz_ptr;
-    const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev);
-    const std::array<Real,3>& dx = WarpX::CellSize(lev);
-    const std::array<Real,3>& cdx = WarpX::CellSize(std::max(lev-1,0));
-    const std::array<Real, 3>& xyzmin = xyzmin_tile;
-    const long lvect = 8;
-
-    BL_PROFILE_VAR_NS("PICSAR::CurrentDeposition", blp_pxr_cd);
-    BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate);
-
-    Box tbx = convert(pti.tilebox(), WarpX::jx_nodal_flag);
-    Box tby = convert(pti.tilebox(), WarpX::jy_nodal_flag);
-    Box tbz = convert(pti.tilebox(), WarpX::jz_nodal_flag);
-
-    // WarpX assumes the same number of guard cells for Jx, Jy, Jz
-    long ngJ = jx.nGrow();
-
-    int j_is_nodal = jx.is_nodal() and jy.is_nodal() and jz.is_nodal();
-
-    // Deposit charge for particles that are not in the current buffers
-    if (np_current > 0)
-    {
-#ifdef AMREX_USE_GPU
-        jx_ptr = jx[pti].dataPtr();
-        jy_ptr = jy[pti].dataPtr();
-        jz_ptr = jz[pti].dataPtr();
-
-        auto jxntot = jx[pti].length();
-        auto jyntot = jy[pti].length();
-        auto jzntot = jz[pti].length();
-#else
-        tbx.grow(ngJ);
-        tby.grow(ngJ);
-        tbz.grow(ngJ);
-
-        local_jx[thread_num].resize(tbx);
-        local_jy[thread_num].resize(tby);
-        local_jz[thread_num].resize(tbz);
-
-        jx_ptr = local_jx[thread_num].dataPtr();
-        jy_ptr = local_jy[thread_num].dataPtr();
-        jz_ptr = local_jz[thread_num].dataPtr();
-
-        local_jx[thread_num].setVal(0.0);
-        local_jy[thread_num].setVal(0.0);
-        local_jz[thread_num].setVal(0.0);
-
-        auto jxntot = local_jx[thread_num].length();
-        auto jyntot = local_jy[thread_num].length();
-        auto jzntot = local_jz[thread_num].length();
-#endif
-
-        BL_PROFILE_VAR_START(blp_pxr_cd);
-        warpx_current_deposition(
-            jx_ptr, &ngJ, jxntot.getVect(),
-            jy_ptr, &ngJ, jyntot.getVect(),
-            jz_ptr, &ngJ, jzntot.getVect(),
-            &np_current,
-            m_xp[thread_num].dataPtr(),
-            m_yp[thread_num].dataPtr(),
-            m_zp[thread_num].dataPtr(),
-            uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
-            m_giv[thread_num].dataPtr(),
-            wp.dataPtr(), &this->charge,
-            &xyzmin[0], &xyzmin[1], &xyzmin[2],
-            &dt, &dx[0], &dx[1], &dx[2],
-            &WarpX::nox,&WarpX::noy,&WarpX::noz, &j_is_nodal,
-            &lvect,&WarpX::current_deposition_algo);
-
-#ifdef WARPX_RZ
-        warpx_current_deposition_rz_volume_scaling(
-            jx_ptr, &ngJ, jxntot.getVect(),
-            jy_ptr, &ngJ, jyntot.getVect(),
-            jz_ptr, &ngJ, jzntot.getVect(),
-            &xyzmin[0], &dx[0]);
-#endif
-        BL_PROFILE_VAR_STOP(blp_pxr_cd);
-
-#ifndef AMREX_USE_GPU
-        BL_PROFILE_VAR_START(blp_accumulate);
-
-        jx[pti].atomicAdd(local_jx[thread_num], tbx, tbx, 0, 0, 1);
-        jy[pti].atomicAdd(local_jy[thread_num], tby, tby, 0, 0, 1);
-        jz[pti].atomicAdd(local_jz[thread_num], tbz, tbz, 0, 0, 1);
-
-        BL_PROFILE_VAR_STOP(blp_accumulate);
-#endif
-    }
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE((depos_lev==(lev-1)) ||
+                                     (depos_lev==(lev  )),
+                                     "Deposition buffers only work for lev-1");
+    // If no particles, do not do anything
+    if (np_to_depose == 0) return;
+
+    const long ngJ = jx->nGrow();
+    const std::array<Real,3>& dx = WarpX::CellSize(std::max(depos_lev,0));
+    const long lvect = 8;
+    int j_is_nodal = jx->is_nodal() and jy->is_nodal() and jz->is_nodal();
+
+    BL_PROFILE_VAR_NS("PICSAR::CurrentDeposition", blp_pxr_cd);
+    BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate);
+
+    // Get tile box where current is deposited.
+    // The tile box is different when depositing in the buffers (depos_lev<lev)
+    // or when depositing inside the level (depos_lev=lev)
+    Box tilebox;
+    if (lev == depos_lev) {
+        tilebox = pti.tilebox();
+    } else {
+        const IntVect& ref_ratio = WarpX::RefRatio(depos_lev);
+        tilebox = amrex::coarsen(pti.tilebox(),ref_ratio);
+    }
+
+    // Staggered tile boxes (different in each direction)
+    Box tbx = convert(tilebox, WarpX::jx_nodal_flag);
+    Box tby = convert(tilebox, WarpX::jy_nodal_flag);
+    Box tbz = convert(tilebox, WarpX::jz_nodal_flag);
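+    // [Editor's note, illustrative example, not in the commit: with
+    // ref_ratio = 2, a fine tile [32:63] x [0:31] coarsens to
+    // [16:31] x [0:15]; convert() then applies each current component's
+    // nodal flags to that box, giving one staggered box per direction.]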
-    // Deposit charge for particles that are in the current buffers
-    if (np_current < np)
-    {
-        const IntVect& ref_ratio = WarpX::RefRatio(lev-1);
-        const Box& ctilebox = amrex::coarsen(pti.tilebox(),ref_ratio);
-        const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1);
+    // Lower corner of tile box physical domain
+    const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(tilebox, depos_lev);
+    const std::array<Real, 3>& xyzmin = xyzmin_tile;
 
 #ifdef AMREX_USE_GPU
-        jx_ptr = (*cjx)[pti].dataPtr();
-        jy_ptr = (*cjy)[pti].dataPtr();
-        jz_ptr = (*cjz)[pti].dataPtr();
-
-        auto jxntot = jx[pti].length();
-        auto jyntot = jy[pti].length();
-        auto jzntot = jz[pti].length();
+    // No tiling on GPU: jx_ptr points to the full
+    // jx array (same for jy_ptr and jz_ptr).
+    Real* jx_ptr = (*jx)[pti].dataPtr();
+    Real* jy_ptr = (*jy)[pti].dataPtr();
+    Real* jz_ptr = (*jz)[pti].dataPtr();
+
+    auto jxntot = (*jx)[pti].length();
+    auto jyntot = (*jy)[pti].length();
+    auto jzntot = (*jz)[pti].length();
 #else
+    // Tiling is on: jx_ptr points to local_jx[thread_num]
+    // (same for jy_ptr and jz_ptr)
+    tbx.grow(ngJ);
+    tby.grow(ngJ);
+    tbz.grow(ngJ);
+
+    local_jx[thread_num].resize(tbx);
+    local_jy[thread_num].resize(tby);
+    local_jz[thread_num].resize(tbz);
+
+    Real* jx_ptr = local_jx[thread_num].dataPtr();
+    Real* jy_ptr = local_jy[thread_num].dataPtr();
+    Real* jz_ptr = local_jz[thread_num].dataPtr();
+
+    // local_jx[thread_num] is set to zero
+    local_jx[thread_num].setVal(0.0);
+    local_jy[thread_num].setVal(0.0);
+    local_jz[thread_num].setVal(0.0);
+
+    auto jxntot = local_jx[thread_num].length();
+    auto jyntot = local_jy[thread_num].length();
+    auto jzntot = local_jz[thread_num].length();
+#endif
+    // GPU, no tiling: deposit directly in jx
+    // CPU, tiling: deposit into local_jx
+    // (same for jy and jz)
+    BL_PROFILE_VAR_START(blp_pxr_cd);
+    warpx_current_deposition(
+        jx_ptr, &ngJ, jxntot.getVect(),
+        jy_ptr, &ngJ, jyntot.getVect(),
+        jz_ptr, &ngJ, jzntot.getVect(),
+        &np_to_depose,
+        m_xp[thread_num].dataPtr() + offset,
+        m_yp[thread_num].dataPtr() + offset,
+        m_zp[thread_num].dataPtr() + offset,
+        uxp.dataPtr() + offset,
+        uyp.dataPtr() + offset,
+        uzp.dataPtr() + offset,
+        m_giv[thread_num].dataPtr() + offset,
+        wp.dataPtr() + offset, &this->charge,
+        &xyzmin[0], &xyzmin[1], &xyzmin[2],
+        &dt, &dx[0], &dx[1], &dx[2],
+        &WarpX::nox,&WarpX::noy,&WarpX::noz, &j_is_nodal,
+        &lvect,&WarpX::current_deposition_algo);
 
-        tbx = amrex::convert(ctilebox, WarpX::jx_nodal_flag);
-        tby = amrex::convert(ctilebox, WarpX::jy_nodal_flag);
-        tbz = amrex::convert(ctilebox, WarpX::jz_nodal_flag);
-        tbx.grow(ngJ);
-        tby.grow(ngJ);
-        tbz.grow(ngJ);
-
-        local_jx[thread_num].resize(tbx);
-        local_jy[thread_num].resize(tby);
-        local_jz[thread_num].resize(tbz);
-
-        jx_ptr = local_jx[thread_num].dataPtr();
-        jy_ptr = local_jy[thread_num].dataPtr();
-        jz_ptr = local_jz[thread_num].dataPtr();
-
-        local_jx[thread_num].setVal(0.0);
-        local_jy[thread_num].setVal(0.0);
-        local_jz[thread_num].setVal(0.0);
-
-        auto jxntot = local_jx[thread_num].length();
-        auto jyntot = local_jy[thread_num].length();
-        auto jzntot = local_jz[thread_num].length();
-#endif
-
-        long ncrse = np - np_current;
-        BL_PROFILE_VAR_START(blp_pxr_cd);
-        warpx_current_deposition(
-            jx_ptr, &ngJ, jxntot.getVect(),
-            jy_ptr, &ngJ, jyntot.getVect(),
-            jz_ptr, &ngJ, jzntot.getVect(),
-            &ncrse,
-            m_xp[thread_num].dataPtr() +np_current,
-            m_yp[thread_num].dataPtr() +np_current,
-            m_zp[thread_num].dataPtr() +np_current,
-            uxp.dataPtr()+np_current,
-            uyp.dataPtr()+np_current,
-            uzp.dataPtr()+np_current,
-            m_giv[thread_num].dataPtr()+np_current,
-            wp.dataPtr()+np_current, &this->charge,
-            &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
-            &dt, &cdx[0], &cdx[1], &cdx[2],
-            &WarpX::nox,&WarpX::noy,&WarpX::noz, &j_is_nodal,
-            &lvect,&WarpX::current_deposition_algo);
 
 #ifdef WARPX_RZ
-        warpx_current_deposition_rz_volume_scaling(
-            jx_ptr, &ngJ, jxntot.getVect(),
-            jy_ptr, &ngJ, jyntot.getVect(),
-            jz_ptr, &ngJ, jzntot.getVect(),
-            &xyzmin[0], &dx[0]);
+    // Rescale current in r-z mode
+    warpx_current_deposition_rz_volume_scaling(
+        jx_ptr, &ngJ, jxntot.getVect(),
+        jy_ptr, &ngJ, jyntot.getVect(),
+        jz_ptr, &ngJ, jzntot.getVect(),
+        &xyzmin[0], &dx[0]);
 #endif
-
-        BL_PROFILE_VAR_STOP(blp_pxr_cd);
+    BL_PROFILE_VAR_STOP(blp_pxr_cd);
 
 #ifndef AMREX_USE_GPU
-        BL_PROFILE_VAR_START(blp_accumulate);
-
-        (*cjx)[pti].atomicAdd(local_jx[thread_num], tbx, tbx, 0, 0, 1);
-        (*cjy)[pti].atomicAdd(local_jy[thread_num], tby, tby, 0, 0, 1);
-        (*cjz)[pti].atomicAdd(local_jz[thread_num], tbz, tbz, 0, 0, 1);
-
-        BL_PROFILE_VAR_STOP(blp_accumulate);
+    BL_PROFILE_VAR_START(blp_accumulate);
+    // CPU, tiling: atomicAdd local_jx into jx
+    // (same for jy and jz)
+    (*jx)[pti].atomicAdd(local_jx[thread_num], tbx, tbx, 0, 0, 1);
+    (*jy)[pti].atomicAdd(local_jy[thread_num], tby, tby, 0, 0, 1);
+    (*jz)[pti].atomicAdd(local_jz[thread_num], tbz, tbz, 0, 0, 1);
+    BL_PROFILE_VAR_STOP(blp_accumulate);
 #endif
-    }
-};
-
+}
 
 void
 WarpXParticleContainer::DepositCharge (WarpXParIter& pti, RealVector& wp,
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index c2cf97f30..877882037 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -18,6 +18,7 @@
 #include <WarpXWrappers.h>
 #include <WarpXUtil.H>
 #include <WarpXAlgorithmSelection.H>
+#include <WarpX_FDTD.H>
 
 #ifdef BL_USE_SENSEI_INSITU
 #include <AMReX_AmrMeshInSituBridge.H>
@@ -924,18 +925,32 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp,
                     const std::array<const MultiFab*, 3>& B,
                     const std::array<Real,3>& dx)
 {
+    Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
+
+#ifdef WARPX_RZ
+    const Real rmin = GetInstance().Geom(0).ProbLo(0);
+#endif
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-    for (MFIter mfi(divB, true); mfi.isValid(); ++mfi)
+    for (MFIter mfi(divB, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
         const Box& bx = mfi.tilebox();
-        WRPX_COMPUTE_DIVB(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_N_ANYD(divB[mfi],dcomp),
-                          BL_TO_FORTRAN_ANYD((*B[0])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*B[1])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*B[2])[mfi]),
-                          dx.data());
+        auto const& Bxfab = B[0]->array(mfi);
+        auto const& Byfab = B[1]->array(mfi);
+        auto const& Bzfab = B[2]->array(mfi);
+        auto const& divBfab = divB.array(mfi);
+
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            warpx_computedivb(i, j, k, dcomp, divBfab, Bxfab, Byfab, Bzfab, dxinv, dyinv, dzinv
+#ifdef WARPX_RZ
+                              ,rmin
+#endif
+                              );
+        });
     }
 }
 
@@ -944,18 +959,32 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp,
                     const std::array<const MultiFab*, 3>& B,
                     const std::array<Real,3>& dx, int ngrow)
 {
+    Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
+
+#ifdef WARPX_RZ
+    const Real rmin = GetInstance().Geom(0).ProbLo(0);
+#endif
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-    for (MFIter mfi(divB, true); mfi.isValid(); ++mfi)
+    for (MFIter mfi(divB, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
         Box bx = mfi.growntilebox(ngrow);
-        WRPX_COMPUTE_DIVB(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_N_ANYD(divB[mfi],dcomp),
-                          BL_TO_FORTRAN_ANYD((*B[0])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*B[1])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*B[2])[mfi]),
-                          dx.data());
+        auto const& Bxfab = B[0]->array(mfi);
+        auto const& Byfab = B[1]->array(mfi);
+        auto const& Bzfab = B[2]->array(mfi);
+        auto const& divBfab = divB.array(mfi);
+
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            warpx_computedivb(i, j, k, dcomp, divBfab, Bxfab, Byfab, Bzfab, dxinv, dyinv, dzinv
+#ifdef WARPX_RZ
+                              ,rmin
+#endif
+                              );
+        });
     }
 }
 
@@ -964,25 +993,32 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp,
                     const std::array<const MultiFab*, 3>& E,
                     const std::array<Real,3>& dx)
 {
+    Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
+
+#ifdef WARPX_RZ
+    const Real rmin = GetInstance().Geom(0).ProbLo(0);
+#endif
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-    for (MFIter mfi(divE, true); mfi.isValid(); ++mfi)
+    for (MFIter mfi(divE, TilingIfNotGPU()); mfi.isValid(); ++mfi)
     {
         const Box& bx = mfi.tilebox();
+        auto const& Exfab = E[0]->array(mfi);
+        auto const& Eyfab = E[1]->array(mfi);
+        auto const& Ezfab = E[2]->array(mfi);
+        auto const& divEfab = divE.array(mfi);
+
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            warpx_computedive(i, j, k, dcomp, divEfab, Exfab, Eyfab, Ezfab, dxinv, dyinv, dzinv
 #ifdef WARPX_RZ
-        const Real xmin = GetInstance().Geom(0).ProbLo(0);
-#endif
-        WRPX_COMPUTE_DIVE(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_N_ANYD(divE[mfi],dcomp),
-                          BL_TO_FORTRAN_ANYD((*E[0])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*E[1])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*E[2])[mfi]),
-                          dx.data()
-#ifdef WARPX_RZ
-                          ,&xmin
+                              ,rmin
 #endif
-                          );
+                              );
+        });
     }
 }
 
@@ -991,25 +1027,32 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp,
                     const std::array<const MultiFab*, 3>& E,
                     const std::array<Real,3>& dx, int ngrow)
 {
+    Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
+
+#ifdef WARPX_RZ
+    const Real rmin = GetInstance().Geom(0).ProbLo(0);
+#endif
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-    for (MFIter mfi(divE, true); mfi.isValid(); ++mfi)
+    for (MFIter mfi(divE, TilingIfNotGPU()); mfi.isValid(); ++mfi)
    {
         Box bx = mfi.growntilebox(ngrow);
+        auto const& Exfab = E[0]->array(mfi);
+        auto const& Eyfab = E[1]->array(mfi);
+        auto const& Ezfab = E[2]->array(mfi);
+        auto const& divEfab = divE.array(mfi);
+
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            warpx_computedive(i, j, k, dcomp, divEfab, Exfab, Eyfab, Ezfab, dxinv, dyinv, dzinv
 #ifdef WARPX_RZ
-        const Real xmin = GetInstance().Geom(0).ProbLo(0);
-#endif
-        WRPX_COMPUTE_DIVE(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_N_ANYD(divE[mfi],dcomp),
-                          BL_TO_FORTRAN_ANYD((*E[0])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*E[1])[mfi]),
-                          BL_TO_FORTRAN_ANYD((*E[2])[mfi]),
-                          dx.data()
-#ifdef WARPX_RZ
-                          ,&xmin
+                              ,rmin
 #endif
-                          );
+                              );
+        });
     }
 }
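A closing note on the WarpX.cpp hunks (editor's sketch, not part of the commit): dxinv/dyinv/dzinv and rmin are computed once before the MFIter loop because the device lambda captures by value, and host-only calls such as GetInstance().Geom() cannot run inside GPU kernels. The skeleton shared by all four Compute* routines, with hypothetical names F, divF, and stencil standing in for the real fields and operators:

    Real dxinv = 1./dx[0];                       // plain value, captured by copy
    for (MFIter mfi(divF, TilingIfNotGPU()); mfi.isValid(); ++mfi)
    {
        const Box& bx = mfi.tilebox();           // or mfi.growntilebox(ngrow)
        auto const& Ffab = F->array(mfi);        // Array4 view, usable on device
        auto const& divFfab = divF.array(mfi);
        ParallelFor(bx,
        [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
        {
            divFfab(i,j,k) = stencil(i, j, k, Ffab, dxinv);  // per-cell stencil
        });
    }

On CPU builds ParallelFor degenerates to a tiled triple loop, so the same source serves both backends; that is what lets this commit delete the Fortran kernels outright.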