5 files changed, 143 insertions, 88 deletions
diff --git a/Source/FortranInterface/WarpX_f.F90 b/Source/FortranInterface/WarpX_f.F90
index 2ddf9409d..762ed2cd8 100644
--- a/Source/FortranInterface/WarpX_f.F90
+++ b/Source/FortranInterface/WarpX_f.F90
@@ -51,65 +51,6 @@ contains
 
   end subroutine warpx_compute_E
 
-  subroutine warpx_sync_rho_2d (lo, hi, crse, clo, chi, fine, flo, fhi, nc) &
-       bind(c, name='warpx_sync_rho_2d')
-    integer, intent(in) :: lo(2), hi(2), flo(2), fhi(2), clo(2), chi(2), nc
-    real(amrex_real), intent(in   ) :: fine(flo(1):fhi(1),flo(2):fhi(2),nc)
-    real(amrex_real), intent(inout) :: crse(clo(1):chi(1),clo(2):chi(2),nc)
-
-    integer :: i,j,ii,jj,m
-
-    do m = 1, nc
-       do j = lo(2), hi(2)
-          jj = j*2
-          do i = lo(1), hi(1)
-             ii = i*2
-             crse(i,j,m) = 0.25d0 * &
-                  ( fine(ii,jj,m) + 0.5d0 *(fine(ii-1,jj  ,m)+fine(ii+1,jj  ,m) &
-                  &                       + fine(ii  ,jj-1,m)+fine(ii  ,jj+1,m)) &
-                  &               + 0.25d0*(fine(ii-1,jj-1,m)+fine(ii+1,jj-1,m) &
-                  &                       + fine(ii-1,jj+1,m)+fine(ii+1,jj+1,m)) )
-          end do
-       end do
-    end do
-  end subroutine warpx_sync_rho_2d
-
-  subroutine warpx_sync_rho_3d (lo, hi, crse, clo, chi, fine, flo, fhi, nc) &
-       bind(c, name='warpx_sync_rho_3d')
-    integer, intent(in) :: lo(3), hi(3), flo(3), fhi(3), clo(3), chi(3), nc
-    real(amrex_real), intent(in   ) :: fine(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),nc)
-    real(amrex_real), intent(inout) :: crse(clo(1):chi(1),clo(2):chi(2),clo(3):chi(3),nc)
-
-    integer :: i,j,k,ii,jj,kk,m
-
-    do m = 1, nc
-       do k = lo(3), hi(3)
-          kk = k*2
-          do j = lo(2), hi(2)
-             jj = j*2
-             do i = lo(1), hi(1)
-                ii = i*2
-                crse(i,j,k,m) = 0.125d0 * &
-                     (fine(ii,jj,kk,m) + 0.5d0  *(fine(ii-1,jj  ,kk  ,m)+fine(ii+1,jj  ,kk  ,m) &
-                     &                          + fine(ii  ,jj-1,kk  ,m)+fine(ii  ,jj+1,kk  ,m) &
-                     &                          + fine(ii  ,jj  ,kk-1,m)+fine(ii  ,jj  ,kk+1,m)) &
-                     &                 + 0.25d0 *(fine(ii-1,jj-1,kk  ,m)+fine(ii+1,jj-1,kk  ,m) &
-                     &                          + fine(ii-1,jj+1,kk  ,m)+fine(ii+1,jj+1,kk  ,m) &
-                     &                          + fine(ii-1,jj  ,kk-1,m)+fine(ii+1,jj  ,kk-1,m) &
-                     &                          + fine(ii-1,jj  ,kk+1,m)+fine(ii+1,jj  ,kk+1,m) &
-                     &                          + fine(ii  ,jj-1,kk-1,m)+fine(ii  ,jj+1,kk-1,m) &
-                     &                          + fine(ii  ,jj-1,kk+1,m)+fine(ii  ,jj+1,kk+1,m)) &
-                     &                 + 0.125d0*(fine(ii-1,jj-1,kk-1,m)+fine(ii-1,jj-1,kk+1,m) &
-                     &                          + fine(ii-1,jj+1,kk-1,m)+fine(ii-1,jj+1,kk+1,m) &
-                     &                          + fine(ii+1,jj-1,kk-1,m)+fine(ii+1,jj-1,kk+1,m) &
-                     &                          + fine(ii+1,jj+1,kk-1,m)+fine(ii+1,jj+1,kk+1,m)))
-             end do
-          end do
-       end do
-    end do
-
-  end subroutine warpx_sync_rho_3d
-
   subroutine warpx_build_buffer_masks (lo, hi, msk, mlo, mhi, gmsk, glo, ghi, ng) &
        bind(c, name='warpx_build_buffer_masks')
     integer, dimension(3), intent(in) :: lo, hi, mlo, mhi, glo, ghi
diff --git a/Source/FortranInterface/WarpX_f.H b/Source/FortranInterface/WarpX_f.H
index 2d5d346e2..b51a83387 100644
--- a/Source/FortranInterface/WarpX_f.H
+++ b/Source/FortranInterface/WarpX_f.H
@@ -16,8 +16,6 @@
 
 #if (AMREX_SPACEDIM == 3)
 
-#define WRPX_SYNC_RHO                    warpx_sync_rho_3d
-
 #define WRPX_SUM_FINE_TO_CRSE_NODAL      warpx_sum_fine_to_crse_nodal_3d
 #define WRPX_ZERO_OUT_BNDRY              warpx_zero_out_bndry_3d
 #define WRPX_BUILD_MASK                  warpx_build_mask_3d
@@ -30,8 +28,6 @@
 
 #elif (AMREX_SPACEDIM == 2)
 
-#define WRPX_SYNC_RHO                    warpx_sync_rho_2d
-
 #define WRPX_SUM_FINE_TO_CRSE_NODAL      warpx_sum_fine_to_crse_nodal_2d
 #define WRPX_ZERO_OUT_BNDRY              warpx_zero_out_bndry_2d
 #define WRPX_BUILD_MASK                  warpx_build_mask_2d
@@ -229,11 +225,6 @@ extern "C"
 #endif
                           const amrex::Real* sigbz, int sigbz_lo, int sigbz_hi);
 
-    void WRPX_SYNC_RHO (const int* lo, const int* hi,
-                        BL_FORT_FAB_ARG_ANYD(crse),
-                        const BL_FORT_FAB_ARG_ANYD(fine),
-                        const int* ncomp);
-
 #ifdef WARPX_USE_PSATD
     void warpx_fft_mpi_init (int fcomm);
     void warpx_fft_domain_decomp (int* warpx_local_nz, int* warpx_local_z0,
diff --git a/Source/Parallelization/InterpolateDensityFineToCoarse.H b/Source/Parallelization/InterpolateDensityFineToCoarse.H
new file mode 100644
index 000000000..947db2aac
--- /dev/null
+++ b/Source/Parallelization/InterpolateDensityFineToCoarse.H
@@ -0,0 +1,120 @@
+/* Copyright 2019 Axel Huebl, Weiqun Zhang
+ *
+ * This file is part of WarpX.
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+
+#ifndef INTERPOLATE_DENSITY_FINE_TO_COARSE_H
+#define INTERPOLATE_DENSITY_FINE_TO_COARSE_H
+
+#include <AMReX_Array4.H>
+#include <AMReX_REAL.H>
+#include <AMReX_BLassert.H>
+#include <AMReX_Extension.H>
+#include <AMReX_GpuQualifiers.H>
+
+#include <utility> // std::move
+
+
+/** Fill a charge density (rho) coarse patch with averaged values from a fine patch
+ *
+ * Fills the values of the charge density on the coarse patch
+ * by averaging the values of the charge density of the fine patch.
+ */
+class InterpolateDensityFineToCoarse
+{
+public:
+    /** Construct with fine and coarse patch and their refinement ratio
+     *
+     * @param[in]     fine read-only fine patch
+     * @param[in,out] coarse overwritten coarse patch
+     * @param[in]     refinement_ratio ratio between coarse and fine patch granularity
+     *                (currently, only a value of is implemented)
+     * @param[in]     number_of_components the number of components
+     */
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    InterpolateDensityFineToCoarse(
+        amrex::Array4<amrex::Real const> const fine,
+        amrex::Array4<amrex::Real      > const coarse,
+        int const refinement_ratio,
+        int const number_of_components
+    ) : m_fine(std::move(fine)),
+        m_coarse(std::move(coarse)),
+        m_refinement_ratio(std::move(refinement_ratio)),
+        m_number_of_components(std::move(number_of_components))
+    {
+        //! @note constants and stencils in operator() implementation assume 2x refinement
+        BL_ASSERT(refinement_ratio == 2);
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    void
+    operator()(
+        int const i,
+        int const j,
+        int const k
+    ) const noexcept
+    {
+        using namespace amrex;
+
+        auto const & fine_unsafe = m_fine;   // out-of-bounds access not secured with zero-values yet
+        auto const & coarse      = m_coarse; // out-of-bounds access not secured but will also not occur
+
+        // return zero for out-of-bounds accesses during interpolation
+        // this is efficiently used as a method to add neutral elements beyond guards in the average below
+        auto const fine = [fine_unsafe] AMREX_GPU_DEVICE (int const j, int const k, int const l, int const m) noexcept
+        {
+            return fine_unsafe.contains(j, k, l) ? fine_unsafe(j, k, l, m) :  amrex::Real{0.};
+        };
+
+        int const ii = i * m_refinement_ratio;
+        int const jj = j * m_refinement_ratio;
+        int const kk = k * m_refinement_ratio;
+        for( int m = 0; m < m_number_of_components; ++m ) {
+#if AMREX_SPACEDIM == 2
+            coarse(i,j,kk,m) = 0.25_rt * (
+                fine(ii,jj,kk,m) +
+                0.5_rt  * (
+                    fine(ii-1,jj  ,kk,m) + fine(ii+1,jj  ,kk,m) +
+                    fine(ii  ,jj-1,kk,m) + fine(ii  ,jj+1,kk,m)
+                ) +
+                0.25_rt * (
+                    fine(ii-1,jj-1,kk,m) + fine(ii+1,jj-1,kk,m) +
+                    fine(ii-1,jj+1,kk,m) + fine(ii+1,jj+1,kk,m)
+                )
+            );
+#elif AMREX_SPACEDIM == 3
+            coarse(i,j,k,m) = 0.125_rt * (
+                fine(ii,jj,kk,m) +
+                0.5_rt   * (
+                    fine(ii-1,jj  ,kk  ,m) + fine(ii+1,jj  ,kk  ,m) +
+                    fine(ii  ,jj-1,kk  ,m) + fine(ii  ,jj+1,kk  ,m) +
+                    fine(ii  ,jj  ,kk-1,m) + fine(ii  ,jj  ,kk+1,m)
+                ) +
+                0.25_rt  * (
+                    fine(ii-1,jj-1,kk  ,m) + fine(ii+1,jj-1,kk  ,m) +
+                    fine(ii-1,jj+1,kk  ,m) + fine(ii+1,jj+1,kk  ,m) +
+                    fine(ii-1,jj  ,kk-1,m) + fine(ii+1,jj  ,kk-1,m) +
+                    fine(ii-1,jj  ,kk+1,m) + fine(ii+1,jj  ,kk+1,m) +
+                    fine(ii  ,jj-1,kk-1,m) + fine(ii  ,jj+1,kk-1,m) +
+                    fine(ii  ,jj-1,kk+1,m) + fine(ii  ,jj+1,kk+1,m)
+                ) +
+                0.125_rt * (
+                    fine(ii-1,jj-1,kk-1,m) + fine(ii-1,jj-1,kk+1,m) +
+                    fine(ii-1,jj+1,kk-1,m) + fine(ii-1,jj+1,kk+1,m) +
+                    fine(ii+1,jj-1,kk-1,m) + fine(ii+1,jj-1,kk+1,m) +
+                    fine(ii+1,jj+1,kk-1,m) + fine(ii+1,jj+1,kk+1,m)
+                )
+            );
+#endif
+        }
+    }
+private:
+    amrex::Array4< amrex::Real const > m_fine;
+    amrex::Array4< amrex::Real       > m_coarse;
+    int m_refinement_ratio;
+    int m_number_of_components;
+};
+
+#endif // INTERPOLATE_DENSITY_FINE_TO_COARSE_H
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 4f870e79c..92f0b4f09 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -3,10 +3,12 @@
 #include <WarpX_f.H>
 #include <WarpXSumGuardCells.H>
 #include <Parallelization/InterpolateCurrentFineToCoarse.H>
+#include <Parallelization/InterpolateDensityFineToCoarse.H>
 
 #include <algorithm>
 #include <cstdlib>
 
+
 using namespace amrex;
 
 void
@@ -354,7 +356,7 @@ WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 >
                                         std::array< amrex::MultiFab       *, 3 > const & coarse,
                                         int const refinement_ratio)
 {
-    BL_PROFILE("InterpolateCurrentFineToCoarse()");
+    BL_PROFILE("interpolateCurrentFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
     const IntVect& ng = (fine[0]->nGrowVect() + 1) / refinement_ratio; // add equivalent no. of guards to coarse patch
 
@@ -386,6 +388,8 @@ WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 >
 void
 WarpX::SyncRho ()
 {
+    BL_PROFILE("SyncRho()");
+
     if (!rho_fp[0]) return;
     const int ncomp = rho_fp[0]->nComp();
 
@@ -395,7 +399,7 @@ WarpX::SyncRho ()
     {
         rho_cp[lev]->setVal(0.0);
         const IntVect& refinement_ratio = refRatio(lev-1);
-        SyncRho(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
+        interpolateDensityFineToCoarse(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
     }
 
     // For each level
@@ -408,29 +412,26 @@ WarpX::SyncRho ()
 }
 
 void
-WarpX::SyncRho (const MultiFab& fine, MultiFab& crse, int refinement_ratio)
+WarpX::interpolateDensityFineToCoarse (const MultiFab& fine, MultiFab& coarse, int const refinement_ratio)
 {
+    BL_PROFILE("interpolateDensityFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
-    const IntVect& ng = (fine.nGrowVect()+1)/refinement_ratio;
+    const IntVect& ng = (fine.nGrowVect() + 1) / refinement_ratio;  // add equivalent no. of guards to coarse patch
     const int nc = fine.nComp();
 
 #ifdef _OPEMP
 #pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     {
-        FArrayBox ffab;
-        for (MFIter mfi(crse,true); mfi.isValid(); ++mfi)
+        // OMP in-box decomposition of coarse into tilebox
+        for (MFIter mfi(coarse, TilingIfNotGPU()); mfi.isValid(); ++mfi)
         {
-            const Box& bx = mfi.growntilebox(ng);
-            Box fbx = amrex::grow(amrex::refine(bx,refinement_ratio),1);
-            ffab.resize(fbx, nc);
-            fbx &= fine[mfi].box();
-            ffab.setVal(0.0);
-            ffab.copy(fine[mfi], fbx, 0, fbx, 0, nc);
-            WRPX_SYNC_RHO(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_ANYD(crse[mfi]),
-                          BL_TO_FORTRAN_ANYD(ffab),
-                          &nc);
+            const Box& bx = mfi.growntilebox(ng); // only grow to outer directions of tileboxes for filling guards
+
+            amrex::ParallelFor(
+                bx,
+                InterpolateDensityFineToCoarse(fine.const_array(mfi), coarse.array(mfi), refinement_ratio, nc)
+            );
         }
     }
 }
@@ -562,7 +563,7 @@ WarpX::RestrictRhoFromFineToCoarsePatch (int lev)
     if (rho_fp[lev]) {
         rho_cp[lev]->setVal(0.0);
         const IntVect& refinement_ratio = refRatio(lev-1);
-        SyncRho(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
+        interpolateDensityFineToCoarse(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
     }
 }
 
diff --git a/Source/WarpX.H b/Source/WarpX.H
index c59802427..0da1cf350 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -436,10 +436,12 @@ private:
      *  averaging the values of the charge density of the fine patch (on the same level).
      *
      * \param[in]  fine             fine patches to interpolate from
-     * \param[out] crse             coarse patches to interpolate to
-     * \param[in]  ref_ratio        integer ratio between the two
+     * \param[out] coarse           coarse patches to interpolate to
+     * \param[in]  refinement_ratio integer ratio between the two
      */
-    void SyncRho (const amrex::MultiFab& fine, amrex::MultiFab& crse, int ref_ratio);
+    void interpolateDensityFineToCoarse (const amrex::MultiFab& fine,
+                                               amrex::MultiFab& coarse,
+                                         int const refinement_ratio);
 
     void ExchangeWithPmlB (int lev);
     void ExchangeWithPmlE (int lev);