20 files changed, 526 insertions, 286 deletions
diff --git a/Source/BoundaryConditions/WarpX_PML_kernels.H b/Source/BoundaryConditions/WarpX_PML_kernels.H
index 8f779a5c2..89fdb4911 100644
--- a/Source/BoundaryConditions/WarpX_PML_kernels.H
+++ b/Source/BoundaryConditions/WarpX_PML_kernels.H
@@ -85,7 +85,8 @@ void warpx_push_pml_bx_ckc(int i, int j, int k, Array4<Real> const&Bx,
                           - Ez(i+1,j  ,k-1,0) - Ez(i+1,j  ,k-1,1) - Ez(i+1,j  ,k-1,2)
                           + Ez(i-1,j+1,k-1,0) + Ez(i-1,j+1,k-1,1) + Ez(i-1,j+1,k-1,2)
                           - Ez(i-1,j  ,k-1,0) - Ez(i-1,j  ,k-1,1) - Ez(i-1,j  ,k-1,2));
-   Bx(i,j,k,1) +=   alphaz*(Ey(i  ,j  ,k+1,0) + Ey(i  ,j  ,k+1,1) + Ez(i  ,j  ,k+1,2)
+
+   Bx(i,j,k,1) +=   alphaz*(Ey(i  ,j  ,k+1,0) + Ey(i  ,j  ,k+1,1) + Ey(i  ,j  ,k+1,2)
                           - Ey(i  ,j  ,k  ,0) - Ey(i  ,j  ,k  ,1) - Ey(i  ,j  ,k  ,2))
                    +betazx*(Ey(i+1,j  ,k+1,0) + Ey(i+1,j  ,k+1,1) + Ey(i+1,j  ,k+1,2)
                           - Ey(i+1,j  ,k  ,0) - Ey(i+1,j  ,k  ,1) - Ey(i+1,j  ,k  ,2)
@@ -101,7 +102,7 @@ void warpx_push_pml_bx_ckc(int i, int j, int k, Array4<Real> const&Bx,
                           - Ey(i-1,j+1,k  ,0) - Ey(i-1,j+1,k  ,1) - Ey(i-1,j+1,k  ,2)
                           + Ey(i+1,j-1,k+1,0) + Ey(i+1,j-1,k+1,1) + Ey(i+1,j-1,k+1,2)
                           - Ey(i+1,j-1,k  ,0) - Ey(i+1,j-1,k  ,1) - Ey(i+1,j-1,k  ,2)
-                          + Ey(i-1,j-1,k+1,0) + Ey(i-1,j-1,k+1,1) + Ey(i-1,j-1,k+1,1)
+                          + Ey(i-1,j-1,k+1,0) + Ey(i-1,j-1,k+1,1) + Ey(i-1,j-1,k+1,2)
                           - Ey(i-1,j-1,k  ,0) - Ey(i-1,j-1,k  ,1) - Ey(i-1,j-1,k  ,2));
 
 #else
diff --git a/Source/Evolve/WarpXEvolveEM.cpp b/Source/Evolve/WarpXEvolveEM.cpp
index a517a91f5..1e4aa4d5c 100644
--- a/Source/Evolve/WarpXEvolveEM.cpp
+++ b/Source/Evolve/WarpXEvolveEM.cpp
@@ -562,6 +562,18 @@ WarpX::ComputeDt ()
     if (do_electrostatic) {
         dt[0] = const_dt;
     }
+
+    for (int lev=0; lev <= max_level; lev++) {
+        const Real* dx_lev = geom[lev].CellSize();
+        Print()<<"Level "<<lev<<": dt = "<<dt[lev]
+               <<" ; dx = "<<dx_lev[0]
+#if (defined WARPX_DIM_XZ) || (defined WARPX_DIM_RZ)
+               <<" ; dz = "<<dx_lev[1]<<'\n';
+#elif (defined WARPX_DIM_3D)
+               <<" ; dy = "<<dx_lev[1]
+               <<" ; dz = "<<dx_lev[2]<<'\n';
+#endif
+    }
 }
 
 /* \brief computes max_step for wakefield simulation in boosted frame.
diff --git a/Source/FortranInterface/WarpX_f.F90 b/Source/FortranInterface/WarpX_f.F90
index 2ddf9409d..762ed2cd8 100644
--- a/Source/FortranInterface/WarpX_f.F90
+++ b/Source/FortranInterface/WarpX_f.F90
@@ -51,65 +51,6 @@ contains
 
   end subroutine warpx_compute_E
 
-  subroutine warpx_sync_rho_2d (lo, hi, crse, clo, chi, fine, flo, fhi, nc) &
-       bind(c, name='warpx_sync_rho_2d')
-    integer, intent(in) :: lo(2), hi(2), flo(2), fhi(2), clo(2), chi(2), nc
-    real(amrex_real), intent(in   ) :: fine(flo(1):fhi(1),flo(2):fhi(2),nc)
-    real(amrex_real), intent(inout) :: crse(clo(1):chi(1),clo(2):chi(2),nc)
-
-    integer :: i,j,ii,jj,m
-
-    do m = 1, nc
-       do j = lo(2), hi(2)
-          jj = j*2
-          do i = lo(1), hi(1)
-             ii = i*2
-             crse(i,j,m) = 0.25d0 * &
-                  ( fine(ii,jj,m) + 0.5d0 *(fine(ii-1,jj  ,m)+fine(ii+1,jj  ,m) &
-                  &                       + fine(ii  ,jj-1,m)+fine(ii  ,jj+1,m)) &
-                  &               + 0.25d0*(fine(ii-1,jj-1,m)+fine(ii+1,jj-1,m) &
-                  &                       + fine(ii-1,jj+1,m)+fine(ii+1,jj+1,m)) )
-          end do
-       end do
-    end do
-  end subroutine warpx_sync_rho_2d
-
-  subroutine warpx_sync_rho_3d (lo, hi, crse, clo, chi, fine, flo, fhi, nc) &
-       bind(c, name='warpx_sync_rho_3d')
-    integer, intent(in) :: lo(3), hi(3), flo(3), fhi(3), clo(3), chi(3), nc
-    real(amrex_real), intent(in   ) :: fine(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),nc)
-    real(amrex_real), intent(inout) :: crse(clo(1):chi(1),clo(2):chi(2),clo(3):chi(3),nc)
-
-    integer :: i,j,k,ii,jj,kk,m
-
-    do m = 1, nc
-       do k = lo(3), hi(3)
-          kk = k*2
-          do j = lo(2), hi(2)
-             jj = j*2
-             do i = lo(1), hi(1)
-                ii = i*2
-                crse(i,j,k,m) = 0.125d0 * &
-                     (fine(ii,jj,kk,m) + 0.5d0  *(fine(ii-1,jj  ,kk  ,m)+fine(ii+1,jj  ,kk  ,m) &
-                     &                          + fine(ii  ,jj-1,kk  ,m)+fine(ii  ,jj+1,kk  ,m) &
-                     &                          + fine(ii  ,jj  ,kk-1,m)+fine(ii  ,jj  ,kk+1,m)) &
-                     &                 + 0.25d0 *(fine(ii-1,jj-1,kk  ,m)+fine(ii+1,jj-1,kk  ,m) &
-                     &                          + fine(ii-1,jj+1,kk  ,m)+fine(ii+1,jj+1,kk  ,m) &
-                     &                          + fine(ii-1,jj  ,kk-1,m)+fine(ii+1,jj  ,kk-1,m) &
-                     &                          + fine(ii-1,jj  ,kk+1,m)+fine(ii+1,jj  ,kk+1,m) &
-                     &                          + fine(ii  ,jj-1,kk-1,m)+fine(ii  ,jj+1,kk-1,m) &
-                     &                          + fine(ii  ,jj-1,kk+1,m)+fine(ii  ,jj+1,kk+1,m)) &
-                     &                 + 0.125d0*(fine(ii-1,jj-1,kk-1,m)+fine(ii-1,jj-1,kk+1,m) &
-                     &                          + fine(ii-1,jj+1,kk-1,m)+fine(ii-1,jj+1,kk+1,m) &
-                     &                          + fine(ii+1,jj-1,kk-1,m)+fine(ii+1,jj-1,kk+1,m) &
-                     &                          + fine(ii+1,jj+1,kk-1,m)+fine(ii+1,jj+1,kk+1,m)))
-             end do
-          end do
-       end do
-    end do
-
-  end subroutine warpx_sync_rho_3d
-
   subroutine warpx_build_buffer_masks (lo, hi, msk, mlo, mhi, gmsk, glo, ghi, ng) &
        bind(c, name='warpx_build_buffer_masks')
     integer, dimension(3), intent(in) :: lo, hi, mlo, mhi, glo, ghi
diff --git a/Source/FortranInterface/WarpX_f.H b/Source/FortranInterface/WarpX_f.H
index 2d5d346e2..b51a83387 100644
--- a/Source/FortranInterface/WarpX_f.H
+++ b/Source/FortranInterface/WarpX_f.H
@@ -16,8 +16,6 @@
 
 #if (AMREX_SPACEDIM == 3)
 
-#define WRPX_SYNC_RHO                    warpx_sync_rho_3d
-
 #define WRPX_SUM_FINE_TO_CRSE_NODAL      warpx_sum_fine_to_crse_nodal_3d
 #define WRPX_ZERO_OUT_BNDRY              warpx_zero_out_bndry_3d
 #define WRPX_BUILD_MASK                  warpx_build_mask_3d
@@ -30,8 +28,6 @@
 
 #elif (AMREX_SPACEDIM == 2)
 
-#define WRPX_SYNC_RHO                    warpx_sync_rho_2d
-
 #define WRPX_SUM_FINE_TO_CRSE_NODAL      warpx_sum_fine_to_crse_nodal_2d
 #define WRPX_ZERO_OUT_BNDRY              warpx_zero_out_bndry_2d
 #define WRPX_BUILD_MASK                  warpx_build_mask_2d
@@ -229,11 +225,6 @@ extern "C"
 #endif
                           const amrex::Real* sigbz, int sigbz_lo, int sigbz_hi);
 
-    void WRPX_SYNC_RHO (const int* lo, const int* hi,
-                        BL_FORT_FAB_ARG_ANYD(crse),
-                        const BL_FORT_FAB_ARG_ANYD(fine),
-                        const int* ncomp);
-
 #ifdef WARPX_USE_PSATD
     void warpx_fft_mpi_init (int fcomm);
     void warpx_fft_domain_decomp (int* warpx_local_nz, int* warpx_local_z0,
diff --git a/Source/Make.WarpX b/Source/Make.WarpX
index 13d4f4554..6f9ae6d2f 100644
--- a/Source/Make.WarpX
+++ b/Source/Make.WarpX
@@ -9,7 +9,7 @@ ifeq ($(USE_GPU),TRUE)
   USE_OMP  = FALSE
   USE_CUDA = TRUE
   USE_ACC  = FALSE
-  NVCC_HOST_COMP = gcc
+  NVCC_HOST_COMP = gnu
 endif
 
 ifeq ($(USE_RZ),TRUE)
@@ -27,24 +27,24 @@ ifeq ($(USE_ASCENT_INSITU),TRUE)
   USE_ASCENT = TRUE
 endif
 
-include $(AMREX_HOME)/Tools/GNUMake/Make.defs
-
 ifndef USE_PYTHON_MAIN
   USE_PYTHON_MAIN = FALSE
 endif
 
 ifeq ($(USE_PYTHON_MAIN),TRUE)
-  CXXFLAGS += -fPIC
+  XTRA_CXXFLAGS += -fPIC
 ifeq ($(USE_OMP),TRUE)
   LDFLAGS += -fopenmp
 endif
-  CFLAGS   += -fPIC
-  FFLAGS   += -fPIC
-  F90FLAGS += -fPIC
+  XTRA_CFLAGS   += -fPIC
+  XTRA_FFLAGS   += -fPIC
+  XTRA_F90FLAGS += -fPIC
   USERSuffix := .Lib
   DEFINES += -DWARPX_USE_PY
 endif
 
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
 ifeq ($(DIM),3)
   DEFINES += -DWARPX_DIM_3D
 else
@@ -209,7 +209,11 @@ libwarpx$(PYDIM).a: $(objForExecs)
 
 libwarpx$(PYDIM).so: $(objForExecs)
 	@echo Making dynamic library $@ ...
+ifeq ($(USE_CUDA),TRUE)
+	$(SILENT) $(CXX) $(LINKFLAGS) $(SHARED_OPTION) -Xlinker=$(SHARED_OPTION) -Xlinker=-fPIC -o $@ $^ $(LDFLAGS) $(libraries)
+else
 	$(SILENT) $(CXX) $(SHARED_OPTION) -fPIC -o $@ $^ $(LDFLAGS) $(libraries)
+endif
 	$(SILENT) $(RM) AMReX_buildInfo.cpp
 	@echo SUCCESS
 
diff --git a/Source/Parallelization/InterpolateDensityFineToCoarse.H b/Source/Parallelization/InterpolateDensityFineToCoarse.H
new file mode 100644
index 000000000..947db2aac
--- /dev/null
+++ b/Source/Parallelization/InterpolateDensityFineToCoarse.H
@@ -0,0 +1,120 @@
+/* Copyright 2019 Axel Huebl, Weiqun Zhang
+ *
+ * This file is part of WarpX.
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+
+#ifndef INTERPOLATE_DENSITY_FINE_TO_COARSE_H
+#define INTERPOLATE_DENSITY_FINE_TO_COARSE_H
+
+#include <AMReX_Array4.H>
+#include <AMReX_REAL.H>
+#include <AMReX_BLassert.H>
+#include <AMReX_Extension.H>
+#include <AMReX_GpuQualifiers.H>
+
+#include <utility> // std::move
+
+
+/** Fill a charge density (rho) coarse patch with averaged values from a fine patch
+ *
+ * Fills the values of the charge density on the coarse patch
+ * by averaging the values of the charge density of the fine patch.
+ */
+class InterpolateDensityFineToCoarse
+{
+public:
+    /** Construct with fine and coarse patch and their refinement ratio
+     *
+     * @param[in]     fine read-only fine patch
+     * @param[in,out] coarse overwritten coarse patch
+     * @param[in]     refinement_ratio ratio between coarse and fine patch granularity
+     *                (currently, only a value of is implemented)
+     * @param[in]     number_of_components the number of components
+     */
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    InterpolateDensityFineToCoarse(
+        amrex::Array4<amrex::Real const> const fine,
+        amrex::Array4<amrex::Real      > const coarse,
+        int const refinement_ratio,
+        int const number_of_components
+    ) : m_fine(std::move(fine)),
+        m_coarse(std::move(coarse)),
+        m_refinement_ratio(std::move(refinement_ratio)),
+        m_number_of_components(std::move(number_of_components))
+    {
+        //! @note constants and stencils in operator() implementation assume 2x refinement
+        BL_ASSERT(refinement_ratio == 2);
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    void
+    operator()(
+        int const i,
+        int const j,
+        int const k
+    ) const noexcept
+    {
+        using namespace amrex;
+
+        auto const & fine_unsafe = m_fine;   // out-of-bounds access not secured with zero-values yet
+        auto const & coarse      = m_coarse; // out-of-bounds access not secured but will also not occur
+
+        // return zero for out-of-bounds accesses during interpolation
+        // this is efficiently used as a method to add neutral elements beyond guards in the average below
+        auto const fine = [fine_unsafe] AMREX_GPU_DEVICE (int const j, int const k, int const l, int const m) noexcept
+        {
+            return fine_unsafe.contains(j, k, l) ? fine_unsafe(j, k, l, m) :  amrex::Real{0.};
+        };
+
+        int const ii = i * m_refinement_ratio;
+        int const jj = j * m_refinement_ratio;
+        int const kk = k * m_refinement_ratio;
+        for( int m = 0; m < m_number_of_components; ++m ) {
+#if AMREX_SPACEDIM == 2
+            coarse(i,j,kk,m) = 0.25_rt * (
+                fine(ii,jj,kk,m) +
+                0.5_rt  * (
+                    fine(ii-1,jj  ,kk,m) + fine(ii+1,jj  ,kk,m) +
+                    fine(ii  ,jj-1,kk,m) + fine(ii  ,jj+1,kk,m)
+                ) +
+                0.25_rt * (
+                    fine(ii-1,jj-1,kk,m) + fine(ii+1,jj-1,kk,m) +
+                    fine(ii-1,jj+1,kk,m) + fine(ii+1,jj+1,kk,m)
+                )
+            );
+#elif AMREX_SPACEDIM == 3
+            coarse(i,j,k,m) = 0.125_rt * (
+                fine(ii,jj,kk,m) +
+                0.5_rt   * (
+                    fine(ii-1,jj  ,kk  ,m) + fine(ii+1,jj  ,kk  ,m) +
+                    fine(ii  ,jj-1,kk  ,m) + fine(ii  ,jj+1,kk  ,m) +
+                    fine(ii  ,jj  ,kk-1,m) + fine(ii  ,jj  ,kk+1,m)
+                ) +
+                0.25_rt  * (
+                    fine(ii-1,jj-1,kk  ,m) + fine(ii+1,jj-1,kk  ,m) +
+                    fine(ii-1,jj+1,kk  ,m) + fine(ii+1,jj+1,kk  ,m) +
+                    fine(ii-1,jj  ,kk-1,m) + fine(ii+1,jj  ,kk-1,m) +
+                    fine(ii-1,jj  ,kk+1,m) + fine(ii+1,jj  ,kk+1,m) +
+                    fine(ii  ,jj-1,kk-1,m) + fine(ii  ,jj+1,kk-1,m) +
+                    fine(ii  ,jj-1,kk+1,m) + fine(ii  ,jj+1,kk+1,m)
+                ) +
+                0.125_rt * (
+                    fine(ii-1,jj-1,kk-1,m) + fine(ii-1,jj-1,kk+1,m) +
+                    fine(ii-1,jj+1,kk-1,m) + fine(ii-1,jj+1,kk+1,m) +
+                    fine(ii+1,jj-1,kk-1,m) + fine(ii+1,jj-1,kk+1,m) +
+                    fine(ii+1,jj+1,kk-1,m) + fine(ii+1,jj+1,kk+1,m)
+                )
+            );
+#endif
+        }
+    }
+private:
+    amrex::Array4< amrex::Real const > m_fine;
+    amrex::Array4< amrex::Real       > m_coarse;
+    int m_refinement_ratio;
+    int m_number_of_components;
+};
+
+#endif // INTERPOLATE_DENSITY_FINE_TO_COARSE_H
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 4f870e79c..92f0b4f09 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -3,10 +3,12 @@
 #include <WarpX_f.H>
 #include <WarpXSumGuardCells.H>
 #include <Parallelization/InterpolateCurrentFineToCoarse.H>
+#include <Parallelization/InterpolateDensityFineToCoarse.H>
 
 #include <algorithm>
 #include <cstdlib>
 
+
 using namespace amrex;
 
 void
@@ -354,7 +356,7 @@ WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 >
                                         std::array< amrex::MultiFab       *, 3 > const & coarse,
                                         int const refinement_ratio)
 {
-    BL_PROFILE("InterpolateCurrentFineToCoarse()");
+    BL_PROFILE("interpolateCurrentFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
     const IntVect& ng = (fine[0]->nGrowVect() + 1) / refinement_ratio; // add equivalent no. of guards to coarse patch
 
@@ -386,6 +388,8 @@ WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 >
 void
 WarpX::SyncRho ()
 {
+    BL_PROFILE("SyncRho()");
+
     if (!rho_fp[0]) return;
     const int ncomp = rho_fp[0]->nComp();
 
@@ -395,7 +399,7 @@ WarpX::SyncRho ()
     {
         rho_cp[lev]->setVal(0.0);
         const IntVect& refinement_ratio = refRatio(lev-1);
-        SyncRho(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
+        interpolateDensityFineToCoarse(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
     }
 
     // For each level
@@ -408,29 +412,26 @@ WarpX::SyncRho ()
 }
 
 void
-WarpX::SyncRho (const MultiFab& fine, MultiFab& crse, int refinement_ratio)
+WarpX::interpolateDensityFineToCoarse (const MultiFab& fine, MultiFab& coarse, int const refinement_ratio)
 {
+    BL_PROFILE("interpolateDensityFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
-    const IntVect& ng = (fine.nGrowVect()+1)/refinement_ratio;
+    const IntVect& ng = (fine.nGrowVect() + 1) / refinement_ratio;  // add equivalent no. of guards to coarse patch
     const int nc = fine.nComp();
 
 #ifdef _OPEMP
 #pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     {
-        FArrayBox ffab;
-        for (MFIter mfi(crse,true); mfi.isValid(); ++mfi)
+        // OMP in-box decomposition of coarse into tilebox
+        for (MFIter mfi(coarse, TilingIfNotGPU()); mfi.isValid(); ++mfi)
         {
-            const Box& bx = mfi.growntilebox(ng);
-            Box fbx = amrex::grow(amrex::refine(bx,refinement_ratio),1);
-            ffab.resize(fbx, nc);
-            fbx &= fine[mfi].box();
-            ffab.setVal(0.0);
-            ffab.copy(fine[mfi], fbx, 0, fbx, 0, nc);
-            WRPX_SYNC_RHO(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_ANYD(crse[mfi]),
-                          BL_TO_FORTRAN_ANYD(ffab),
-                          &nc);
+            const Box& bx = mfi.growntilebox(ng); // only grow to outer directions of tileboxes for filling guards
+
+            amrex::ParallelFor(
+                bx,
+                InterpolateDensityFineToCoarse(fine.const_array(mfi), coarse.array(mfi), refinement_ratio, nc)
+            );
         }
     }
 }
@@ -562,7 +563,7 @@ WarpX::RestrictRhoFromFineToCoarsePatch (int lev)
     if (rho_fp[lev]) {
         rho_cp[lev]->setVal(0.0);
         const IntVect& refinement_ratio = refRatio(lev-1);
-        SyncRho(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
+        interpolateDensityFineToCoarse(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
     }
 }
 
diff --git a/Source/Particles/Gather/FieldGather.H b/Source/Particles/Gather/FieldGather.H
index c5ec6fb5b..bdeab16e8 100644
--- a/Source/Particles/Gather/FieldGather.H
+++ b/Source/Particles/Gather/FieldGather.H
@@ -90,13 +90,6 @@ void doGatherShapeN(const amrex::ParticleReal * const xp,
             const int l0 = compute_shape_factor<depos_order - lower_in_v>(
                 sz0, z-stagger_shift);
 
-            // Set fields on particle to zero
-            Exp[ip] = 0;
-            Eyp[ip] = 0;
-            Ezp[ip] = 0;
-            Bxp[ip] = 0;
-            Byp[ip] = 0;
-            Bzp[ip] = 0;
             // Each field is gathered in a separate block of
             // AMREX_SPACEDIM nested loops because the deposition
             // order can differ for each component of each field
diff --git a/Source/Particles/PhotonParticleContainer.H b/Source/Particles/PhotonParticleContainer.H
index 5d93e5631..98e8e0aab 100644
--- a/Source/Particles/PhotonParticleContainer.H
+++ b/Source/Particles/PhotonParticleContainer.H
@@ -63,7 +63,7 @@ public:
                                 int thread_num,
                                 int lev,
                                 int depos_lev,
-                                amrex::Real dt)  {};
+                                amrex::Real dt) override {};
 
 private:
 
diff --git a/Source/Particles/PhysicalParticleContainer.H b/Source/Particles/PhysicalParticleContainer.H
index f535a4c5b..79685c08b 100644
--- a/Source/Particles/PhysicalParticleContainer.H
+++ b/Source/Particles/PhysicalParticleContainer.H
@@ -136,7 +136,15 @@ public:
                          amrex::Real q_tot, long npart, int do_symmetrize);
 
     void CheckAndAddParticle(amrex::Real x, amrex::Real y, amrex::Real z,
-                             std::array<amrex::Real, 3> u, amrex::Real weight);
+                             std::array<amrex::Real, 3> u,
+                             amrex::Real weight,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_x,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_y,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_z,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_ux,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_uy,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_uz,
+                             amrex::Gpu::HostVector<amrex::ParticleReal>& particle_w);
 
     virtual void GetParticleSlice(const int direction, const amrex::Real z_old,
                                   const amrex::Real z_new, const amrex::Real t_boost,
diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp
index 352745265..9e12eb26e 100644
--- a/Source/Particles/PhysicalParticleContainer.cpp
+++ b/Source/Particles/PhysicalParticleContainer.cpp
@@ -190,6 +190,16 @@ PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
     std::normal_distribution<double> disty(y_m, y_rms);
     std::normal_distribution<double> distz(z_m, z_rms);
 
+    // Allocate temporary vectors on the CPU
+    Gpu::HostVector<ParticleReal> particle_x;
+    Gpu::HostVector<ParticleReal> particle_y;
+    Gpu::HostVector<ParticleReal> particle_z;
+    Gpu::HostVector<ParticleReal> particle_ux;
+    Gpu::HostVector<ParticleReal> particle_uy;
+    Gpu::HostVector<ParticleReal> particle_uz;
+    Gpu::HostVector<ParticleReal> particle_w;
+    int np = 0;
+
     if (ParallelDescriptor::IOProcessor()) {
         // If do_symmetrize, create 4x fewer particles, and
         // Replicate each particle 4 times (x,y) (-x,y) (x,-y) (-x,-y)
@@ -215,44 +225,61 @@ PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
                 u.z *= PhysConst::c;
                 if (do_symmetrize){
                     // Add four particles to the beam:
-                    CheckAndAddParticle( x, y, z, { u.x, u.y, u.z}, weight/4. );
-                    CheckAndAddParticle( x,-y, z, { u.x,-u.y, u.z}, weight/4. );
-                    CheckAndAddParticle(-x, y, z, {-u.x, u.y, u.z}, weight/4. );
-                    CheckAndAddParticle(-x,-y, z, {-u.x,-u.y, u.z}, weight/4. );
+                    CheckAndAddParticle(x, y, z, { u.x, u.y, u.z}, weight/4.,
+                                        particle_x,  particle_y,  particle_z,
+                                        particle_ux, particle_uy, particle_uz,
+                                        particle_w);
+                    CheckAndAddParticle(x, -y, z, { u.x, -u.y, u.z}, weight/4.,
+                                        particle_x,  particle_y,  particle_z,
+                                        particle_ux, particle_uy, particle_uz,
+                                        particle_w);
+                    CheckAndAddParticle(-x, y, z, { -u.x, u.y, u.z}, weight/4.,
+                                        particle_x,  particle_y,  particle_z,
+                                        particle_ux, particle_uy, particle_uz,
+                                        particle_w);
+                    CheckAndAddParticle(-x, -y, z, { -u.x, -u.y, u.z}, weight/4.,
+                                        particle_x,  particle_y,  particle_z,
+                                        particle_ux, particle_uy, particle_uz,
+                                        particle_w);
                 } else {
-                    CheckAndAddParticle(x, y, z, {u.x,u.y,u.z}, weight);
+                    CheckAndAddParticle(x, y, z, { u.x, u.y, u.z}, weight,
+                                        particle_x,  particle_y,  particle_z,
+                                        particle_ux, particle_uy, particle_uz,
+                                        particle_w);
                 }
             }
         }
     }
-    Redistribute();
+    // Add the temporary CPU vectors to the particle structure
+    np = particle_z.size();
+    AddNParticles(0,np,
+                  particle_x.dataPtr(),  particle_y.dataPtr(),  particle_z.dataPtr(),
+                  particle_ux.dataPtr(), particle_uy.dataPtr(), particle_uz.dataPtr(),
+                  1, particle_w.dataPtr(),1);
 }
 
 void
 PhysicalParticleContainer::CheckAndAddParticle(Real x, Real y, Real z,
                                                std::array<Real, 3> u,
-                                               Real weight)
+                                               Real weight,
+                                               Gpu::HostVector<ParticleReal>& particle_x,
+                                               Gpu::HostVector<ParticleReal>& particle_y,
+                                               Gpu::HostVector<ParticleReal>& particle_z,
+                                               Gpu::HostVector<ParticleReal>& particle_ux,
+                                               Gpu::HostVector<ParticleReal>& particle_uy,
+                                               Gpu::HostVector<ParticleReal>& particle_uz,
+                                               Gpu::HostVector<ParticleReal>& particle_w)
 {
-    std::array<ParticleReal,PIdx::nattribs> attribs;
-    attribs.fill(0.0);
-
-    // update attribs with input arguments
     if (WarpX::gamma_boost > 1.) {
         MapParticletoBoostedFrame(x, y, z, u);
     }
-    attribs[PIdx::ux] = u[0];
-    attribs[PIdx::uy] = u[1];
-    attribs[PIdx::uz] = u[2];
-    attribs[PIdx::w ] = weight;
-
-    if ( (NumRuntimeRealComps()>0) || (NumRuntimeIntComps()>0) )
-    {
-        // need to create old values
-        auto& particle_tile = DefineAndReturnParticleTile(0, 0, 0);
-    }
-
-    // add particle
-    AddOneParticle(0, 0, 0, x, y, z, attribs);
+    particle_x.push_back(x);
+    particle_y.push_back(y);
+    particle_z.push_back(z);
+    particle_ux.push_back(u[0]);
+    particle_uy.push_back(u[1]);
+    particle_uz.push_back(u[2]);
+    particle_w.push_back(weight);
 }
 
 void
diff --git a/Source/Particles/Sorting/Make.package b/Source/Particles/Sorting/Make.package
index 750d2607e..f9c708e5b 100644
--- a/Source/Particles/Sorting/Make.package
+++ b/Source/Particles/Sorting/Make.package
@@ -1,3 +1,4 @@
+CEXE_headers += SortingUtils.H
 CEXE_sources += Partition.cpp
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Sorting
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles/Sorting
diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 683dbbd04..d1455249a 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -1,5 +1,6 @@
-#include <WarpX.H>
+#include <SortingUtils.H>
 #include <PhysicalParticleContainer.H>
+#include <WarpX.H>
 #include <AMReX_Particles.H>
 
 using namespace amrex;
@@ -41,56 +42,64 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
 {
     BL_PROFILE("PPC::Evolve::partition");
 
-    std::vector<bool> inexflag;
+    // Initialize temporary arrays
+    Gpu::DeviceVector<int> inexflag;
     inexflag.resize(np);
+    Gpu::DeviceVector<long> pid;
+    pid.resize(np);
 
-    auto& aos = pti.GetArrayOfStructs();
+    // First, partition particles into the larger buffer
 
-    // We need to partition the large buffer first
+    // - Select the larger buffer
     iMultiFab const* bmasks =
         (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ?
         gather_masks : current_masks;
-    int i = 0;
-    const auto& msk = (*bmasks)[pti];
-    for (const auto& p : aos) {
-        const IntVect& iv = Index(p, lev);
-        inexflag[i++] = msk(iv);
-    }
-
-    Vector<long> pid;
-    pid.resize(np);
-    std::iota(pid.begin(), pid.end(), 0L);
-    auto sep = std::stable_partition(pid.begin(), pid.end(),
-                                     [&inexflag](long id) { return inexflag[id]; });
+    // - For each particle, find whether it is in the larger buffer,
+    //   by looking up the mask. Store the answer in `inexflag`.
+    amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev), 0) );
+    // - Find the indices that reorder particles so that the last particles
+    //   are in the larger buffer
+    fillWithConsecutiveIntegers( pid );
+    auto const sep = stablePartition( pid.begin(), pid.end(), inexflag );
+    // At the end of this step, `pid` contains the indices that should be used to
+    // reorder the particles, and `sep` is the position in the array that
+    // separates the particles that deposit/gather on the fine patch (first part)
+    // and the particles that deposit/gather in the buffers (last part)
+    long const n_fine = iteratorDistance(pid.begin(), sep);
+    // Number of particles on fine patch, i.e. outside of the larger buffer
+
+    // Second, among particles that are in the larger buffer, partition
+    // particles into the smaller buffer
 
     if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) {
-        nfine_current = nfine_gather = std::distance(pid.begin(), sep);
-    } else if (sep != pid.end()) {
+        // No need to do anything if the buffers have the same size
+        nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep);
+    } else if (sep == pid.end()) {
+        // No need to do anything if there are no particles in the larger buffer
+        nfine_current = nfine_gather = np;
+    } else {
         int n_buf;
         if (bmasks == gather_masks) {
-            nfine_gather = std::distance(pid.begin(), sep);
+            nfine_gather = n_fine;
             bmasks = current_masks;
             n_buf = WarpX::n_current_deposition_buffer;
         } else {
-            nfine_current = std::distance(pid.begin(), sep);
+            nfine_current = n_fine;
             bmasks = gather_masks;
             n_buf = WarpX::n_field_gather_buffer;
         }
         if (n_buf > 0)
         {
-            const auto& msk2 = (*bmasks)[pti];
-            for (auto it = sep; it != pid.end(); ++it) {
-                const long id = *it;
-                const IntVect& iv = Index(aos[id], lev);
-                inexflag[id] = msk2(iv);
-            }
+            // - For each particle in the large buffer, find whether it is in
+            // the smaller buffer, by looking up the mask. Store the answer in `inexflag`.
+            amrex::ParallelFor( np - n_fine,
+               fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) );
+            auto const sep2 = stablePartition( sep, pid.end(), inexflag );
 
-            auto sep2 = std::stable_partition(sep, pid.end(),
-                                              [&inexflag](long id) {return inexflag[id];});
             if (bmasks == gather_masks) {
-                nfine_gather = std::distance(pid.begin(), sep2);
+                nfine_gather = iteratorDistance(pid.begin(), sep2);
             } else {
-                nfine_current = std::distance(pid.begin(), sep2);
+                nfine_current = iteratorDistance(pid.begin(), sep2);
             }
         }
     }
@@ -103,39 +112,38 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
         nfine_gather = 0;
     }
 
+    // Reorder the actual particle array, using the `pid` indices
     if (nfine_current != np || nfine_gather != np)
     {
-
+        // Temporary array for particle AoS
         ParticleVector particle_tmp;
         particle_tmp.resize(np);
 
-        for (long ip = 0; ip < np; ++ip) {
-            particle_tmp[ip] = aos[pid[ip]];
-        }
+        // Copy particle AoS
+        auto& aos = pti.GetArrayOfStructs();
+        amrex::ParallelFor( np,
+            copyAndReorder<ParticleType>( aos(), particle_tmp, pid ) );
         std::swap(aos(), particle_tmp);
 
+        // Temporary array for particle individual attributes
         RealVector tmp;
         tmp.resize(np);
 
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = wp[pid[ip]];
-        }
+        // Copy individual attributes
+        amrex::ParallelFor( np, copyAndReorder<Real>( wp, tmp, pid ) );
         std::swap(wp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uxp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uxp, tmp, pid ) );
         std::swap(uxp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uyp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uyp, tmp, pid ) );
         std::swap(uyp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uzp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uzp, tmp, pid ) );
         std::swap(uzp, tmp);
-    }
 
+        // Make sure that the temporary arrays are not destroyed before
+        // the GPU kernels finish running
+        Gpu::streamSynchronize();
+    }
+    // Make sure that the temporary arrays are not destroyed before
+    // the GPU kernels finish running
+    Gpu::streamSynchronize();
 }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
new file mode 100644
index 000000000..852b8a73f
--- /dev/null
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -0,0 +1,164 @@
+#ifndef WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
+#define WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
+
+#include <WarpXParticleContainer.H>
+#include <AMReX_CudaContainers.H>
+#include <AMReX_Gpu.H>
+#ifdef AMREX_USE_GPU
+    #include <thrust/partition.h>
+    #include <thrust/distance.h>
+#endif
+
+/* \brief Fill the elements of the input vector with consecutive integer,
+ *        starting from 0
+ *
+ * \param[inout] v Vector of integers, to be filled by this routine
+ */
+void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector<long>& v )
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    thrust::sequence( v.begin(), v.end() );
+#else
+    // On CPU: Use std library
+    std::iota( v.begin(), v.end(), 0L );
+#endif
+}
+
+/* \brief Find the indices that would reorder the elements of `predicate`
+ * so that the elements with non-zero value precede the other elements
+ *
+ * \param[in, out] index_begin Point to the beginning of the vector which is
+ *            to be filled with these indices
+ * \param[in, out] index_begin Point to the end of the vector which is
+ *            to be filled with these indices
+ * \param[in] Vector that indicates the elements that need to be reordered first
+ */
+template< typename ForwardIterator >
+ForwardIterator stablePartition(ForwardIterator const index_begin,
+                               ForwardIterator const index_end,
+                               amrex::Gpu::DeviceVector<int> const& predicate)
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    int const* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
+    ForwardIterator const sep = thrust::stable_partition(
+        thrust::cuda::par(amrex::Cuda::The_ThrustCachedAllocator()),
+        index_begin, index_end,
+        [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; }
+    );
+#else
+    // On CPU: Use std library
+    ForwardIterator const sep = std::stable_partition(
+        index_begin, index_end,
+        [&predicate](long i) { return predicate[i]; }
+    );
+#endif
+    return sep;
+}
+
+/* \brief Return the number of elements between `first` and `last`
+ *
+ * \param[in] fist Points to a position in a vector
+ * \param[in] last Points to another position in a vector
+ * \return The number of elements between `first` and `last`
+ */
+template< typename ForwardIterator >
+int iteratorDistance(ForwardIterator const first,
+                     ForwardIterator const last)
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    return thrust::distance( first, last );
+#else
+    return std::distance( first, last );
+#endif
+}
+
+/* \brief Functor that fills the elements of the particle array `inexflag`
+ *  with the value of the spatial array `bmasks`, at the corresponding particle position.
+ *
+ * This is done only for the elements from `start_index` to the end of `inexflag`
+ *
+ * \param[in] pti Contains information on the particle positions
+ * \param[in] bmasks Spatial array, that contains a flag indicating
+ *         whether each cell is part of the gathering/deposition buffers
+ * \param[out] inexflag Vector to be filled with the value of `bmasks`
+ * \param[in] geom Geometry object, necessary to locate particles within the array `bmasks`
+ * \param[in] start_index Index that which elements start to be modified
+ */
+class fillBufferFlag
+{
+    public:
+        fillBufferFlag( WarpXParIter const& pti, amrex::iMultiFab const* bmasks,
+                        amrex::Gpu::DeviceVector<int>& inexflag,
+                        amrex::Geometry const& geom, long const start_index=0 ) :
+                        m_start_index(start_index) {
+
+            // Extract simple structure that can be used directly on the GPU
+            m_particles = &(pti.GetArrayOfStructs()[0]);
+            m_buffer_mask = (*bmasks)[pti].array();
+            m_inexflag_ptr = inexflag.dataPtr();
+            m_domain = geom.Domain();
+            for (int idim=0; idim<AMREX_SPACEDIM; idim++) {
+                m_prob_lo[idim] = geom.ProbLo(idim);
+                m_inv_cell_size[idim] = geom.InvCellSize(idim);
+            }
+        };
+
+
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long i ) const {
+            // Select a particle
+            auto const& p = m_particles[i+m_start_index];
+            // Find the index of the cell where this particle is located
+            amrex::IntVect const iv = amrex::getParticleCell( p,
+                                m_prob_lo, m_inv_cell_size, m_domain );
+            // Find the value of the buffer flag in this cell and
+            // store it at the corresponding particle position in the array `inexflag`
+            m_inexflag_ptr[i+m_start_index] = m_buffer_mask(iv);
+        };
+
+    private:
+        amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> m_prob_lo;
+        amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> m_inv_cell_size;
+        amrex::Box m_domain;
+        int* m_inexflag_ptr;
+        WarpXParticleContainer::ParticleType const* m_particles;
+        amrex::Array4<int const> m_buffer_mask;
+        long const m_start_index;
+};
+
+/* \brief Functor that copies the elements of `src` into `dst`,
+ *       while reordering them according to `indices`
+ *
+ * \param[in] src Source vector
+ * \param[out] dst Destination vector
+ * \param[in] indices Array of indices that indicate how to reorder elements
+ */
+template <typename T>
+class copyAndReorder
+{
+    public:
+        copyAndReorder(
+            amrex::Gpu::ManagedDeviceVector<T> const& src,
+            amrex::Gpu::ManagedDeviceVector<T>& dst,
+            amrex::Gpu::DeviceVector<long> const& indices ) {
+            // Extract simple structure that can be used directly on the GPU
+            m_src_ptr = src.dataPtr();
+            m_dst_ptr = dst.dataPtr();
+            m_indices_ptr = indices.dataPtr();
+        };
+
+        AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long ip ) const {
+            m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ];
+        };
+
+    private:
+        T const* m_src_ptr;
+        T* m_dst_ptr;
+        long const* m_indices_ptr;
+};
+
+#endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
diff --git a/Source/Particles/WarpXParticleContainer.H b/Source/Particles/WarpXParticleContainer.H
index b717d89e6..4b36b12f6 100644
--- a/Source/Particles/WarpXParticleContainer.H
+++ b/Source/Particles/WarpXParticleContainer.H
@@ -236,14 +236,6 @@ public:
                         const amrex::ParticleReal* vx, const amrex::ParticleReal* vy, const amrex::ParticleReal* vz,
                         int nattr, const amrex::ParticleReal* attr, int uniqueparticles, int id=-1);
 
-    void AddOneParticle (int lev, int grid, int tile,
-                         amrex::ParticleReal x, amrex::ParticleReal y, amrex::ParticleReal z,
-                         std::array<amrex::ParticleReal,PIdx::nattribs>& attribs);
-
-    void AddOneParticle (ParticleTileType& particle_tile,
-                         amrex::ParticleReal x, amrex::ParticleReal y, amrex::ParticleReal z,
-                         std::array<amrex::ParticleReal,PIdx::nattribs>& attribs);
-
     virtual void ReadHeader (std::istream& is);
 
     virtual void WriteHeader (std::ostream& os) const;
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index 65a82f233..7fb57500d 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -135,45 +135,6 @@ WarpXParticleContainer::AllocData ()
 }
 
 void
-WarpXParticleContainer::AddOneParticle (int lev, int grid, int tile,
-                                        ParticleReal x, ParticleReal y, ParticleReal z,
-                                        std::array<ParticleReal,PIdx::nattribs>& attribs)
-{
-    auto& particle_tile = DefineAndReturnParticleTile(lev, grid, tile);
-    AddOneParticle(particle_tile, x, y, z, attribs);
-}
-
-void
-WarpXParticleContainer::AddOneParticle (ParticleTileType& particle_tile,
-                                        ParticleReal x, ParticleReal y, ParticleReal z,
-                                        std::array<ParticleReal,PIdx::nattribs>& attribs)
-{
-    ParticleType p;
-    p.id()  = ParticleType::NextID();
-    p.cpu() = ParallelDescriptor::MyProc();
-#if (AMREX_SPACEDIM == 3)
-    p.pos(0) = x;
-    p.pos(1) = y;
-    p.pos(2) = z;
-#elif (AMREX_SPACEDIM == 2)
-#ifdef WARPX_DIM_RZ
-    attribs[PIdx::theta] = std::atan2(y, x);
-    x = std::sqrt(x*x + y*y);
-#endif
-    p.pos(0) = x;
-    p.pos(1) = z;
-#endif
-
-    particle_tile.push_back(p);
-    particle_tile.push_back_real(attribs);
-
-    for (int i = PIdx::nattribs; i < NumRealComps(); ++i)
-    {
-        particle_tile.push_back_real(i, 0.0);
-    }
-}
-
-void
 WarpXParticleContainer::AddNParticles (int lev,
                                        int n, const ParticleReal* x, const ParticleReal* y, const ParticleReal* z,
                                        const ParticleReal* vx, const ParticleReal* vy, const ParticleReal* vz,
diff --git a/Source/Python/WarpXWrappers.cpp b/Source/Python/WarpXWrappers.cpp
index c6d7dfdb8..be9ec9519 100644
--- a/Source/Python/WarpXWrappers.cpp
+++ b/Source/Python/WarpXWrappers.cpp
@@ -1,7 +1,4 @@
 
-#include <AMReX.H>
-#include <AMReX_BLProfiler.H>
-
 #include <WarpXWrappers.h>
 #include <WarpXParticleContainer.H>
 #include <WarpX.H>
@@ -10,7 +7,7 @@
 
 namespace
 {
-    double** getMultiFabPointers(const amrex::MultiFab& mf, int *num_boxes, int *ncomps, int *ngrow, int **shapes)
+    amrex::Real** getMultiFabPointers(const amrex::MultiFab& mf, int *num_boxes, int *ncomps, int *ngrow, int **shapes)
     {
         *ncomps = mf.nComp();
         *ngrow = mf.nGrow();
@@ -18,14 +15,14 @@ namespace
         int shapesize = AMREX_SPACEDIM;
         if (mf.nComp() > 1) shapesize += 1;
         *shapes = (int*) malloc(shapesize * (*num_boxes) * sizeof(int));
-        double** data = (double**) malloc((*num_boxes) * sizeof(double*));
+        amrex::Real** data = (amrex::Real**) malloc((*num_boxes) * sizeof(amrex::Real*));
 
 #ifdef _OPENMP
 #pragma omp parallel
 #endif
         for ( amrex::MFIter mfi(mf, false); mfi.isValid(); ++mfi ) {
             int i = mfi.LocalIndex();
-            data[i] = (double*) mf[mfi].dataPtr();
+            data[i] = (amrex::Real*) mf[mfi].dataPtr();
             for (int j = 0; j < AMREX_SPACEDIM; ++j) {
                 (*shapes)[shapesize*i+j] = mf[mfi].box().length(j);
             }
@@ -53,6 +50,16 @@ namespace
 extern "C"
 {
 
+    int warpx_Real_size()
+    {
+        return (int)sizeof(amrex::Real);
+    }
+
+    int warpx_ParticleReal_size()
+    {
+        return (int)sizeof(amrex::ParticleReal);
+    }
+
     int warpx_nSpecies()
     {
         auto & mypc = WarpX::GetInstance().GetPartContainer();
@@ -165,9 +172,9 @@ extern "C"
     }
 
     void warpx_addNParticles(int speciesnumber, int lenx,
-                             double* x, double* y, double* z,
-                             double* vx, double* vy, double* vz,
-                             int nattr, double* attr, int uniqueparticles)
+                             amrex::ParticleReal* x, amrex::ParticleReal* y, amrex::ParticleReal* z,
+                             amrex::ParticleReal* vx, amrex::ParticleReal* vy, amrex::ParticleReal* vz,
+                             int nattr, amrex::ParticleReal* attr, int uniqueparticles)
     {
         auto & mypc = WarpX::GetInstance().GetPartContainer();
         auto & myspc = mypc.GetParticleContainer(speciesnumber);
@@ -180,14 +187,14 @@ extern "C"
       ConvertLabParamsToBoost();
     }
 
-    double warpx_getProbLo(int dir)
+    amrex::Real warpx_getProbLo(int dir)
     {
       WarpX& warpx = WarpX::GetInstance();
       const amrex::Geometry& geom = warpx.Geom(0);
       return geom.ProbLo(dir);
     }
 
-    double warpx_getProbHi(int dir)
+    amrex::Real warpx_getProbHi(int dir)
     {
       WarpX& warpx = WarpX::GetInstance();
       const amrex::Geometry& geom = warpx.Geom(0);
@@ -200,7 +207,7 @@ extern "C"
         return myspc.TotalNumberOfParticles();
     }
 
-    double** warpx_getEfield(int lev, int direction,
+    amrex::Real** warpx_getEfield(int lev, int direction,
                              int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getEfield(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -212,7 +219,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getEfieldCP(int lev, int direction,
+    amrex::Real** warpx_getEfieldCP(int lev, int direction,
                                int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getEfield_cp(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -224,7 +231,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getEfieldFP(int lev, int direction,
+    amrex::Real** warpx_getEfieldFP(int lev, int direction,
                                int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getEfield_fp(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -236,7 +243,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getBfield(int lev, int direction,
+    amrex::Real** warpx_getBfield(int lev, int direction,
                              int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getBfield(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -248,7 +255,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getBfieldCP(int lev, int direction,
+    amrex::Real** warpx_getBfieldCP(int lev, int direction,
                                int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getBfield_cp(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -260,7 +267,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getBfieldFP(int lev, int direction,
+    amrex::Real** warpx_getBfieldFP(int lev, int direction,
                                int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getBfield_fp(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -272,7 +279,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getCurrentDensity(int lev, int direction,
+    amrex::Real** warpx_getCurrentDensity(int lev, int direction,
                                      int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getcurrent(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -284,7 +291,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getCurrentDensityCP(int lev, int direction,
+    amrex::Real** warpx_getCurrentDensityCP(int lev, int direction,
                                        int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getcurrent_cp(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -296,7 +303,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getCurrentDensityFP(int lev, int direction,
+    amrex::Real** warpx_getCurrentDensityFP(int lev, int direction,
                                        int *return_size, int *ncomps, int *ngrow, int **shapes) {
         auto & mf = WarpX::GetInstance().getcurrent_fp(lev, direction);
         return getMultiFabPointers(mf, return_size, ncomps, ngrow, shapes);
@@ -308,7 +315,7 @@ extern "C"
         return getMultiFabLoVects(mf, return_size, ngrow);
     }
 
-    double** warpx_getParticleStructs(int speciesnumber, int lev,
+    amrex::ParticleReal** warpx_getParticleStructs(int speciesnumber, int lev,
                                       int* num_tiles, int** particles_per_tile) {
         auto & mypc = WarpX::GetInstance().GetPartContainer();
         auto & myspc = mypc.GetParticleContainer(speciesnumber);
@@ -320,17 +327,17 @@ extern "C"
         *num_tiles = i;
         *particles_per_tile = (int*) malloc(*num_tiles*sizeof(int));
 
-        double** data = (double**) malloc(*num_tiles*sizeof(typename WarpXParticleContainer::ParticleType*));
+        amrex::ParticleReal** data = (amrex::ParticleReal**) malloc(*num_tiles*sizeof(typename WarpXParticleContainer::ParticleType*));
         i = 0;
         for (WarpXParIter pti(myspc, lev); pti.isValid(); ++pti, ++i) {
             auto& aos = pti.GetArrayOfStructs();
-            data[i] = (double*) aos.data();
+            data[i] = (amrex::ParticleReal*) aos.data();
             (*particles_per_tile)[i] = pti.numParticles();
         }
         return data;
     }
 
-    double** warpx_getParticleArrays(int speciesnumber, int comp, int lev,
+    amrex::ParticleReal** warpx_getParticleArrays(int speciesnumber, int comp, int lev,
                                      int* num_tiles, int** particles_per_tile) {
         auto & mypc = WarpX::GetInstance().GetPartContainer();
         auto & myspc = mypc.GetParticleContainer(speciesnumber);
@@ -342,11 +349,11 @@ extern "C"
         *num_tiles = i;
         *particles_per_tile = (int*) malloc(*num_tiles*sizeof(int));
 
-        double** data = (double**) malloc(*num_tiles*sizeof(double*));
+        amrex::ParticleReal** data = (amrex::ParticleReal**) malloc(*num_tiles*sizeof(amrex::ParticleReal*));
         i = 0;
         for (WarpXParIter pti(myspc, lev); pti.isValid(); ++pti, ++i) {
             auto& soa = pti.GetStructOfArrays();
-            data[i] = (double*) soa.GetRealData(comp).dataPtr();
+            data[i] = (amrex::ParticleReal*) soa.GetRealData(comp).dataPtr();
             (*particles_per_tile)[i] = pti.numParticles();
         }
         return data;
@@ -361,11 +368,11 @@ extern "C"
         warpx.MoveWindow (true);
     }
 
-    void warpx_EvolveE (double dt) {
+    void warpx_EvolveE (amrex::Real dt) {
         WarpX& warpx = WarpX::GetInstance();
         warpx.EvolveE (dt);
     }
-    void warpx_EvolveB (double dt) {
+    void warpx_EvolveB (amrex::Real dt) {
         WarpX& warpx = WarpX::GetInstance();
         warpx.EvolveB (dt);
     }
@@ -385,7 +392,7 @@ extern "C"
         WarpX& warpx = WarpX::GetInstance();
         warpx.UpdateAuxilaryData ();
     }
-    void warpx_PushParticlesandDepose (double cur_time) {
+    void warpx_PushParticlesandDepose (amrex::Real cur_time) {
         WarpX& warpx = WarpX::GetInstance();
         warpx.PushParticlesandDepose (cur_time);
     }
@@ -398,15 +405,15 @@ extern "C"
         WarpX& warpx = WarpX::GetInstance();
         warpx.setistep (lev, ii);
     }
-    double warpx_gett_new (int lev) {
+    amrex::Real warpx_gett_new (int lev) {
         WarpX& warpx = WarpX::GetInstance();
         return warpx.gett_new (lev);
     }
-    void warpx_sett_new (int lev, double time) {
+    void warpx_sett_new (int lev, amrex::Real time) {
         WarpX& warpx = WarpX::GetInstance();
         warpx.sett_new (lev, time);
     }
-    double warpx_getdt (int lev) {
+    amrex::Real warpx_getdt (int lev) {
         WarpX& warpx = WarpX::GetInstance();
         return warpx.getdt (lev);
     }
@@ -415,7 +422,7 @@ extern "C"
         WarpX& warpx = WarpX::GetInstance();
         return warpx.maxStep ();
     }
-    double warpx_stopTime () {
+    amrex::Real warpx_stopTime () {
         WarpX& warpx = WarpX::GetInstance();
         return warpx.stopTime ();
     }
diff --git a/Source/Python/WarpXWrappers.h b/Source/Python/WarpXWrappers.h
index 233a0b930..a272b9250 100644
--- a/Source/Python/WarpXWrappers.h
+++ b/Source/Python/WarpXWrappers.h
@@ -1,6 +1,9 @@
 #ifndef WARPX_WRAPPERS_H_
 #define WARPX_WRAPPERS_H_
 
+#include <AMReX.H>
+#include <AMReX_BLProfiler.H>
+
 #ifdef BL_USE_MPI
 #include <mpi.h>
 #endif
@@ -9,6 +12,9 @@
 extern "C" {
 #endif
 
+    int warpx_Real_size();
+    int warpx_ParticleReal_size();
+
     int warpx_nSpecies();
 
     bool warpx_use_fdtd_nci_corr();
@@ -49,61 +55,61 @@ extern "C" {
     void warpx_evolve (int numsteps);  // -1 means the inputs parameter will be used.
 
     void warpx_addNParticles(int speciesnumber, int lenx,
-                             double* x, double* y, double* z,
-                             double* vx, double* vy, double* vz,
-                             int nattr, double* attr, int uniqueparticles);
+                             amrex::ParticleReal* x, amrex::ParticleReal* y, amrex::ParticleReal* z,
+                             amrex::ParticleReal* vx, amrex::ParticleReal* vy, amrex::ParticleReal* vz,
+                             int nattr, amrex::ParticleReal* attr, int uniqueparticles);
 
     void warpx_ConvertLabParamsToBoost();
 
-    double warpx_getProbLo(int dir);
+    amrex::Real warpx_getProbLo(int dir);
 
-    double warpx_getProbHi(int dir);
+    amrex::Real warpx_getProbHi(int dir);
 
     long warpx_getNumParticles(int speciesnumber);
 
-    double** warpx_getEfield(int lev, int direction,
-                             int *return_size, int* ncomps, int* ngrow, int **shapes);
+    amrex::Real** warpx_getEfield(int lev, int direction,
+                                  int *return_size, int* ncomps, int* ngrow, int **shapes);
 
     int* warpx_getEfieldLoVects(int lev, int direction,
                                 int *return_size, int* ngrow);
 
-    double** warpx_getBfield(int lev, int direction,
-                             int *return_size, int* ncomps, int* ngrow, int **shapes);
+    amrex::Real** warpx_getBfield(int lev, int direction,
+                                  int *return_size, int* ncomps, int* ngrow, int **shapes);
 
     int* warpx_getBfieldLoVects(int lev, int direction,
                                 int *return_size, int* ngrow);
 
-    double** warpx_getCurrentDensity(int lev, int direction,
-                                     int *return_size, int* ncomps, int* ngrow, int **shapes);
+    amrex::Real** warpx_getCurrentDensity(int lev, int direction,
+                                          int *return_size, int* ncomps, int* ngrow, int **shapes);
 
     int* warpx_getCurrentDensityLoVects(int lev, int direction,
                                         int *return_size, int* ngrow);
 
-    double** warpx_getParticleStructs(int speciesnumber, int lev,
-                                      int* num_tiles, int** particles_per_tile);
+    amrex::ParticleReal** warpx_getParticleStructs(int speciesnumber, int lev,
+                                                   int* num_tiles, int** particles_per_tile);
 
-    double** warpx_getParticleArrays(int speciesnumber, int comp, int lev,
-                                     int* num_tiles, int** particles_per_tile);
+    amrex::ParticleReal** warpx_getParticleArrays(int speciesnumber, int comp, int lev,
+                                                  int* num_tiles, int** particles_per_tile);
 
   void warpx_ComputeDt ();
   void warpx_MoveWindow ();
 
-  void warpx_EvolveE (double dt);
-  void warpx_EvolveB (double dt);
+  void warpx_EvolveE (amrex::Real dt);
+  void warpx_EvolveB (amrex::Real dt);
   void warpx_FillBoundaryE ();
   void warpx_FillBoundaryB ();
   void warpx_SyncCurrent ();
   void warpx_UpdateAuxilaryData ();
-  void warpx_PushParticlesandDepose (double cur_time);
+  void warpx_PushParticlesandDepose (amrex::Real cur_time);
 
   int warpx_getistep (int lev);
   void warpx_setistep (int lev, int ii);
-  double warpx_gett_new (int lev);
-  void warpx_sett_new (int lev, double time);
-  double warpx_getdt (int lev);
+  amrex::Real warpx_gett_new (int lev);
+  void warpx_sett_new (int lev, amrex::Real time);
+  amrex::Real warpx_getdt (int lev);
 
   int warpx_maxStep ();
-  double warpx_stopTime ();
+  amrex::Real warpx_stopTime ();
 
   int warpx_checkInt ();
   int warpx_plotInt ();
diff --git a/Source/WarpX.H b/Source/WarpX.H
index c59802427..0da1cf350 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -436,10 +436,12 @@ private:
      *  averaging the values of the charge density of the fine patch (on the same level).
      *
      * \param[in]  fine             fine patches to interpolate from
-     * \param[out] crse             coarse patches to interpolate to
-     * \param[in]  ref_ratio        integer ratio between the two
+     * \param[out] coarse           coarse patches to interpolate to
+     * \param[in]  refinement_ratio integer ratio between the two
      */
-    void SyncRho (const amrex::MultiFab& fine, amrex::MultiFab& crse, int ref_ratio);
+    void interpolateDensityFineToCoarse (const amrex::MultiFab& fine,
+                                               amrex::MultiFab& coarse,
+                                         int const refinement_ratio);
 
     void ExchangeWithPmlB (int lev);
     void ExchangeWithPmlE (int lev);
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index 91174ee0b..5a51d7d13 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -137,13 +137,13 @@ WarpX::WarpX ()
     // No valid BoxArray and DistributionMapping have been defined.
     // But the arrays for them have been resized.
 
-    int nlevs_max = maxLevel() + 1;
+    const int nlevs_max = maxLevel() + 1;
 
     istep.resize(nlevs_max, 0);
     nsubsteps.resize(nlevs_max, 1);
 #if 0
     // no subcycling yet
-    for (int lev = 1; lev <= maxLevel(); ++lev) {
+    for (int lev = 1; lev < nlevs_max; ++lev) {
         nsubsteps[lev] = MaxRefRatio(lev-1);
     }
 #endif
@@ -239,7 +239,7 @@ WarpX::WarpX ()
 
 WarpX::~WarpX ()
 {
-    int nlevs_max = maxLevel() +1;
+    const int nlevs_max = maxLevel() +1;
     for (int lev = 0; lev < nlevs_max; ++lev) {
         ClearLevel(lev);
     }
@@ -452,7 +452,8 @@ WarpX::ReadParameters ()
         }
 
         // Check that the coarsening_ratio can divide the blocking factor
-        for (int lev=0; lev<maxLevel(); lev++){
+        const int nlevs_max = maxLevel();
+        for (int lev=0; lev<nlevs_max; lev++){
           for (int comp=0; comp<AMREX_SPACEDIM; comp++){
             if ( blockingFactor(lev)[comp] % plot_coarsening_ratio != 0 ){
               amrex::Abort("plot_coarsening_ratio should be an integer "