From 15658267709e5a53a11ef6f945d1bfb283f994a6 Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Tue, 24 Sep 2019 16:25:49 -0700
Subject: Current Synchronize: Port to GPU

Port the current synchronize functions to GPU.
---
 Source/Parallelization/WarpXComm.cpp | 44 ++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 24 deletions(-)

(limited to 'Source/Parallelization/WarpXComm.cpp')
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 990d0f988..22fa4820e 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -2,6 +2,7 @@
 #include <WarpX.H>
 #include <WarpX_f.H>
 #include <WarpXSumGuardCells.H>
+#include <Parallelization/CurrentSynchronize.H>
 
 #include <algorithm>
 #include <cstdlib>
@@ -348,36 +349,34 @@ WarpX::SyncCurrent ()
     }
 }
 
-/** \brief Fills the values of the current on the coarse patch by
- *  averaging the values of the current of the fine patch (on the same level).
- */
 void
 WarpX::SyncCurrent (const std::array<const amrex::MultiFab*,3>& fine,
-                    const std::array<      amrex::MultiFab*,3>& crse,
-                    int refinement_ratio)
+                    const std::array<      amrex::MultiFab*,3>& coarse,
+                    int const refinement_ratio)
 {
     BL_ASSERT(refinement_ratio == 2);
-    const IntVect& ng = (fine[0]->nGrowVect() + 1) /refinement_ratio;
+    const IntVect& ng = (fine[0]->nGrowVect() + 1) / refinement_ratio; // add equivalent no. of guards to coarse patch
 
 #ifdef _OPEMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     {
-        FArrayBox ffab;
-        for (int idim = 0; idim < 3; ++idim)
+        FArrayBox ffab; // contiguous, temporary, copy of the tiled fine patch to read from
+        for (int idim = 0; idim < fine.size(); ++idim)  // j-field components
         {
-            for (MFIter mfi(*crse[idim],true); mfi.isValid(); ++mfi)
+            for (MFIter mfi(*coarse[idim],true); mfi.isValid(); ++mfi) // OMP in-box decomposition of coarse into tilebox
             {
-                const Box& bx = mfi.growntilebox(ng);
-                Box fbx = amrex::grow(amrex::refine(bx,refinement_ratio),1);
-                ffab.resize(fbx);
-                fbx &= (*fine[idim])[mfi].box();
-                ffab.setVal(0.0);
-                ffab.copy((*fine[idim])[mfi], fbx, 0, fbx, 0, fine[idim]->nComp());
-                WRPX_SYNC_CURRENT(bx.loVect(), bx.hiVect(),
-                                   BL_TO_FORTRAN_ANYD((*crse[idim])[mfi]),
-                                   BL_TO_FORTRAN_ANYD(ffab),
-                                   &idim);
+                const Box& bx = mfi.growntilebox(ng); // only grow to outer directions of tileboxes for filling guards
+
+                auto const & arrFine = fine[idim]->const_array(mfi);
+                auto const & arrCoarse = coarse[idim]->array(mfi);
+
+                if( idim == 0 )
+                    amrex::ParallelFor( bx, WarpxSyncCurrent<0>(arrFine, arrCoarse, refinement_ratio) );
+                else if( idim == 1 )
+                    amrex::ParallelFor( bx, WarpxSyncCurrent<1>(arrFine, arrCoarse, refinement_ratio) );
+                else if( idim == 2 )
+                    amrex::ParallelFor( bx, WarpxSyncCurrent<2>(arrFine, arrCoarse, refinement_ratio) );
             }
         }
     }
@@ -407,9 +406,6 @@ WarpX::SyncRho ()
     }
 }
 
-/** \brief Fills the values of the charge density on the coarse patch by
- *  averaging the values of the charge density of the fine patch (on the same level).
- */
 void
 WarpX::SyncRho (const MultiFab& fine, MultiFab& crse, int refinement_ratio)
 {
@@ -418,7 +414,7 @@ WarpX::SyncRho (const MultiFab& fine, MultiFab& crse, int refinement_ratio)
     const int nc = fine.nComp();
 
 #ifdef _OPEMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     {
         FArrayBox ffab;
-- 
cgit v1.2.3


From d67030eb709d965672a9900132d0085922335cc1 Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Wed, 25 Sep 2019 09:54:52 -0700
Subject: SyncCurrent: Disable Tiling for GPU

Co-authored-by: Weiqun Zhang <weiqunzhang@lbl.gov>
---
 Source/Parallelization/WarpXComm.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'Source/Parallelization/WarpXComm.cpp')

diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 22fa4820e..40f8203a9 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -361,10 +361,10 @@ WarpX::SyncCurrent (const std::array<const amrex::MultiFab*,3>& fine,
 #pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     {
-        FArrayBox ffab; // contiguous, temporary, copy of the tiled fine patch to read from
         for (int idim = 0; idim < fine.size(); ++idim)  // j-field components
         {
-            for (MFIter mfi(*coarse[idim],true); mfi.isValid(); ++mfi) // OMP in-box decomposition of coarse into tilebox
+            // OMP in-box decomposition of coarse into tilebox
+            for (MFIter mfi(*coarse[idim], TilingIfNotGPU()); mfi.isValid(); ++mfi)
             {
                 const Box& bx = mfi.growntilebox(ng); // only grow to outer directions of tileboxes for filling guards
 
-- 
cgit v1.2.3


From c9577f8d200d99c40d15d5ff0d2fdacbddc2026f Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Fri, 27 Sep 2019 11:14:00 -0700
Subject: Rename, Profile & Remove TODO

---
 Source/Parallelization/CurrentSynchronize.H        | 175 ---------------------
 .../InterpolateCurrentFineToCoarse.H               | 175 +++++++++++++++++++++
 Source/Parallelization/WarpXComm.cpp               |  19 +--
 Source/WarpX.H                                     |   6 +-
 4 files changed, 188 insertions(+), 187 deletions(-)
 delete mode 100644 Source/Parallelization/CurrentSynchronize.H
 create mode 100644 Source/Parallelization/InterpolateCurrentFineToCoarse.H

(limited to 'Source/Parallelization/WarpXComm.cpp')

diff --git a/Source/Parallelization/CurrentSynchronize.H b/Source/Parallelization/CurrentSynchronize.H
deleted file mode 100644
index 5329ca242..000000000
--- a/Source/Parallelization/CurrentSynchronize.H
+++ /dev/null
@@ -1,175 +0,0 @@
-/* Copyright 2019 Axel Huebl, Weiqun Zhang
- *
- * This file is part of WarpX.
- *
- * License: BSD-3-Clause-LBNL
- */
-
-#ifndef WARPX_CURRENTSYNCHRONIZE_H
-#define WARPX_CURRENTSYNCHRONIZE_H
-
-#include <AMReX_Array4.H>
-#include <AMReX_REAL.H>
-#include <AMReX_BLassert.H>
-#include <AMReX_Extension.H>
-#include <AMReX_GpuQualifiers.H>
-
-#include <utility> // std::move
-
-
-/** Fill a current coarse patch with averaged values from a fine patch
- *
- * Fills the values of the current for a selected component on the coarse patch
- * by averaging the values of the current of the fine patch.
- *
- * @tparam IDim j-field component on which the averaging is performed
- */
-template< int IDim >
-class WarpxSyncCurrent
-{
-public:
-    /** Construct with fine and coarse patch and their refinement ratio
-     *
-     * @param[in]     fine read-only fine patch
-     * @param[in,out] coarse overwritten coarse patch
-     * @param[in]     refinement_ratio ratio between coarse and fine patch granularity
-     *                (currently, only a value of is implemented)
-     */
-    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-    WarpxSyncCurrent(
-        amrex::Array4<amrex::Real const> const fine,
-        amrex::Array4<amrex::Real      > const coarse,
-        int const refinement_ratio
-    ) : m_fine(std::move(fine)),
-        m_coarse(std::move(coarse)),
-        m_refinement_ratio(std::move(refinement_ratio))
-    {
-        //! @note constants and stencils in operator() implementation assume 2x refinement
-        BL_ASSERT(refinement_ratio == 2);
-    }
-
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    void
-    operator()(
-        int const i,
-        int const j,
-        int const k
-    ) const noexcept   // TODO rename to jkl
-    {
-        auto const & fine_unsafe = m_fine;   // out-of-bounds access not secured with zero-values yet
-        auto const & coarse      = m_coarse; // out-of-bounds access not secured but will also not occur
-
-        // return zero for out-of-bounds accesses during interpolation
-        // this is efficiently used as a method to add neutral elements beyond guards in the average below
-        auto const fine = [fine_unsafe] AMREX_GPU_DEVICE (int const j, int const k, int const l) noexcept
-        {
-            return fine_unsafe.contains(j, k, l) ? fine_unsafe(j, k, l) :  amrex::Real{0.};
-        };
-
-        int const ii = i * m_refinement_ratio;
-        int const jj = j * m_refinement_ratio;
-        int const kk = k * m_refinement_ratio;
-#if AMREX_SPACEDIM == 2
-        if (IDim == 0) {
-            coarse(i, j, k) = 0.25 * (
-                fine(ii, jj, kk) + fine(ii + 1, jj, kk) +
-                0.5  * (
-                    fine(ii, jj - 1, kk) + fine(ii + 1, jj - 1, kk) +
-                    fine(ii, jj + 1, kk) + fine(ii + 1, jj + 1, kk)
-                )
-            );
-        } else if (IDim == 2) {
-            coarse(i, j, k) = 0.25 * (
-                fine(ii, jj, kk) + fine(ii, jj + 1, kk) +
-                0.5  * (
-                    fine(ii - 1, jj, kk) + fine(ii - 1, jj + 1, kk) +
-                    fine(ii + 1, jj, kk) + fine(ii + 1, jj + 1, kk)
-                )
-            );
-        } else {
-            coarse(i, j, k) = 0.25 * (
-                fine(ii, jj, kk) +
-                0.5  * (
-                    fine(ii - 1, jj    , kk) + fine(ii + 1, jj    , kk) +
-                    fine(ii    , jj - 1, kk) + fine(ii    , jj + 1, kk)
-                ) +
-                0.25 * (
-                    fine(ii - 1, jj - 1, kk) + fine(ii + 1, jj - 1, kk) +
-                    fine(ii - 1, jj + 1, kk) + fine(ii + 1, jj + 1, kk)
-                )
-            );
-        }
-#elif AMREX_SPACEDIM == 3
-        if (IDim == 0) {
-            coarse(i,j,k) = 0.125 * (
-                fine(ii  , jj, kk) +
-                0.5  * (
-                    fine(ii  , jj-1, kk  ) + fine(ii  , jj+1, kk  ) +
-                    fine(ii  , jj  , kk-1) + fine(ii  , jj  , kk+1)
-                ) +
-                0.25 * (
-                    fine(ii  , jj-1, kk-1) + fine(ii  , jj+1, kk-1) +
-                    fine(ii  , jj-1, kk+1) + fine(ii  , jj+1, kk+1)
-                ) +
-                fine(ii+1, jj, kk) +
-                0.5  * (
-                    fine(ii+1, jj-1, kk  ) + fine(ii+1, jj+1, kk  ) +
-                    fine(ii+1, jj  , kk-1) + fine(ii+1, jj  , kk+1)
-                ) +
-                0.25 * (
-                    fine(ii+1, jj-1, kk-1) + fine(ii+1, jj+1, kk-1) +
-                    fine(ii+1, jj-1, kk+1) + fine(ii+1, jj+1, kk+1)
-                )
-            );
-        } else if (IDim == 1) {
-            coarse(i, j, k) = 0.125 * (
-                fine(ii, jj , kk) +
-                0.5  * (
-                    fine(ii-1, jj  , kk  ) + fine(ii+1, jj  , kk  ) +
-                    fine(ii  , jj  , kk-1) + fine(ii  , jj  , kk+1)
-                 ) +
-                 0.25 * (
-                    fine(ii-1, jj  , kk-1) + fine(ii+1, jj  , kk-1) +
-                    fine(ii-1, jj  , kk+1) + fine(ii+1, jj  , kk+1)
-                 ) +
-                 fine(ii, jj+1, kk) +
-                 0.5  * (
-                    fine(ii-1, jj+1, kk  ) + fine(ii+1, jj+1, kk  ) +
-                    fine(ii  , jj+1, kk-1) + fine(ii  , jj+1, kk+1)
-                 ) +
-                 0.25 * (
-                    fine(ii-1, jj+1, kk-1) + fine(ii+1, jj+1, kk-1) +
-                    fine(ii-1, jj+1, kk+1) + fine(ii+1, jj+1, kk+1)
-                 )
-             );
-        } else {
-            coarse(i, j, k) = 0.125 * (
-                fine(ii, jj, kk  ) +
-                0.5  * (
-                    fine(ii-1, jj  , kk  ) + fine(ii+1, jj  , kk  ) +
-                    fine(ii  , jj-1, kk  ) + fine(ii  , jj+1, kk  )
-                ) +
-                0.25 * (
-                    fine(ii-1, jj-1, kk  ) + fine(ii+1, jj-1, kk  ) +
-                    fine(ii-1, jj+1, kk  ) + fine(ii+1, jj+1, kk  )
-                ) +
-                fine(ii, jj, kk+1) +
-                0.5  * (
-                    fine(ii-1, jj  , kk+1) + fine(ii+1, jj  , kk+1) +
-                    fine(ii  , jj-1, kk+1) + fine(ii  , jj+1, kk+1)
-                ) +
-                0.25 * (
-                    fine(ii-1, jj-1, kk+1) + fine(ii+1, jj-1, kk+1) +
-                    fine(ii-1, jj+1, kk+1) + fine(ii+1, jj+1, kk+1)
-                )
-             );
-         }
-#endif
-    }
-private:
-    amrex::Array4< amrex::Real const > m_fine;
-    amrex::Array4< amrex::Real       > m_coarse;
-    int m_refinement_ratio;
-};
-
-#endif //WARPX_CURRENTSYNCHRONIZE_H
diff --git a/Source/Parallelization/InterpolateCurrentFineToCoarse.H b/Source/Parallelization/InterpolateCurrentFineToCoarse.H
new file mode 100644
index 000000000..148b725d0
--- /dev/null
+++ b/Source/Parallelization/InterpolateCurrentFineToCoarse.H
@@ -0,0 +1,175 @@
+/* Copyright 2019 Axel Huebl, Weiqun Zhang
+ *
+ * This file is part of WarpX.
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+
+#ifndef INTERPOLATECURRENTFINETOCOARSE_H
+#define INTERPOLATECURRENTFINETOCOARSE_H
+
+#include <AMReX_Array4.H>
+#include <AMReX_REAL.H>
+#include <AMReX_BLassert.H>
+#include <AMReX_Extension.H>
+#include <AMReX_GpuQualifiers.H>
+
+#include <utility>  // std::move
+
+
+/** Fill a current coarse patch with averaged values from a fine patch
+ *
+ * Fills the values of the current for a selected component on the coarse patch
+ * by averaging the values of the current of the fine patch.
+ *
+ * @tparam IDim j-field component on which the averaging is performed
+ */
+template< int IDim >
+class InterpolateCurrentFineToCoarse
+{
+public:
+    /** Construct with fine and coarse patch and their refinement ratio
+     *
+     * @param[in]     fine read-only fine patch
+     * @param[in,out] coarse overwritten coarse patch
+     * @param[in]     refinement_ratio ratio between coarse and fine patch granularity
+     *                (currently, only a value of is implemented)
+     */
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    InterpolateCurrentFineToCoarse(
+        amrex::Array4< amrex::Real const > const fine,
+        amrex::Array4< amrex::Real       > const coarse,
+        int const refinement_ratio
+    ) : m_fine(std::move(fine)),
+        m_coarse(std::move(coarse)),
+        m_refinement_ratio(std::move(refinement_ratio))
+    {
+        //! @note constants and stencils in operator() implementation assume 2x refinement
+        BL_ASSERT(refinement_ratio == 2);
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    void
+    operator()(
+        int const i,
+        int const j,
+        int const k
+    ) const noexcept
+    {
+        auto const & fine_unsafe = m_fine;   // out-of-bounds access not secured with zero-values yet
+        auto const & coarse      = m_coarse; // out-of-bounds access not secured but will also not occur
+
+        // return zero for out-of-bounds accesses during interpolation
+        // this is efficiently used as a method to add neutral elements beyond guards in the average below
+        auto const fine = [fine_unsafe] AMREX_GPU_DEVICE (int const j, int const k, int const l) noexcept
+        {
+            return fine_unsafe.contains(j, k, l) ? fine_unsafe(j, k, l) :  amrex::Real{0.};
+        };
+
+        int const ii = i * m_refinement_ratio;
+        int const jj = j * m_refinement_ratio;
+        int const kk = k * m_refinement_ratio;
+#if AMREX_SPACEDIM == 2
+        if (IDim == 0) {
+            coarse(i, j, k) = 0.25 * (
+                fine(ii, jj, kk) + fine(ii + 1, jj, kk) +
+                0.5  * (
+                    fine(ii, jj - 1, kk) + fine(ii + 1, jj - 1, kk) +
+                    fine(ii, jj + 1, kk) + fine(ii + 1, jj + 1, kk)
+                )
+            );
+        } else if (IDim == 2) {
+            coarse(i, j, k) = 0.25 * (
+                fine(ii, jj, kk) + fine(ii, jj + 1, kk) +
+                0.5  * (
+                    fine(ii - 1, jj, kk) + fine(ii - 1, jj + 1, kk) +
+                    fine(ii + 1, jj, kk) + fine(ii + 1, jj + 1, kk)
+                )
+            );
+        } else {
+            coarse(i, j, k) = 0.25 * (
+                fine(ii, jj, kk) +
+                0.5  * (
+                    fine(ii - 1, jj    , kk) + fine(ii + 1, jj    , kk) +
+                    fine(ii    , jj - 1, kk) + fine(ii    , jj + 1, kk)
+                ) +
+                0.25 * (
+                    fine(ii - 1, jj - 1, kk) + fine(ii + 1, jj - 1, kk) +
+                    fine(ii - 1, jj + 1, kk) + fine(ii + 1, jj + 1, kk)
+                )
+            );
+        }
+#elif AMREX_SPACEDIM == 3
+        if (IDim == 0) {
+            coarse(i,j,k) = 0.125 * (
+                fine(ii  , jj, kk) +
+                0.5  * (
+                    fine(ii  , jj-1, kk  ) + fine(ii  , jj+1, kk  ) +
+                    fine(ii  , jj  , kk-1) + fine(ii  , jj  , kk+1)
+                ) +
+                0.25 * (
+                    fine(ii  , jj-1, kk-1) + fine(ii  , jj+1, kk-1) +
+                    fine(ii  , jj-1, kk+1) + fine(ii  , jj+1, kk+1)
+                ) +
+                fine(ii+1, jj, kk) +
+                0.5  * (
+                    fine(ii+1, jj-1, kk  ) + fine(ii+1, jj+1, kk  ) +
+                    fine(ii+1, jj  , kk-1) + fine(ii+1, jj  , kk+1)
+                ) +
+                0.25 * (
+                    fine(ii+1, jj-1, kk-1) + fine(ii+1, jj+1, kk-1) +
+                    fine(ii+1, jj-1, kk+1) + fine(ii+1, jj+1, kk+1)
+                )
+            );
+        } else if (IDim == 1) {
+            coarse(i, j, k) = 0.125 * (
+                fine(ii, jj , kk) +
+                0.5  * (
+                    fine(ii-1, jj  , kk  ) + fine(ii+1, jj  , kk  ) +
+                    fine(ii  , jj  , kk-1) + fine(ii  , jj  , kk+1)
+                 ) +
+                 0.25 * (
+                    fine(ii-1, jj  , kk-1) + fine(ii+1, jj  , kk-1) +
+                    fine(ii-1, jj  , kk+1) + fine(ii+1, jj  , kk+1)
+                 ) +
+                 fine(ii, jj+1, kk) +
+                 0.5  * (
+                    fine(ii-1, jj+1, kk  ) + fine(ii+1, jj+1, kk  ) +
+                    fine(ii  , jj+1, kk-1) + fine(ii  , jj+1, kk+1)
+                 ) +
+                 0.25 * (
+                    fine(ii-1, jj+1, kk-1) + fine(ii+1, jj+1, kk-1) +
+                    fine(ii-1, jj+1, kk+1) + fine(ii+1, jj+1, kk+1)
+                 )
+             );
+        } else {
+            coarse(i, j, k) = 0.125 * (
+                fine(ii, jj, kk  ) +
+                0.5  * (
+                    fine(ii-1, jj  , kk  ) + fine(ii+1, jj  , kk  ) +
+                    fine(ii  , jj-1, kk  ) + fine(ii  , jj+1, kk  )
+                ) +
+                0.25 * (
+                    fine(ii-1, jj-1, kk  ) + fine(ii+1, jj-1, kk  ) +
+                    fine(ii-1, jj+1, kk  ) + fine(ii+1, jj+1, kk  )
+                ) +
+                fine(ii, jj, kk+1) +
+                0.5  * (
+                    fine(ii-1, jj  , kk+1) + fine(ii+1, jj  , kk+1) +
+                    fine(ii  , jj-1, kk+1) + fine(ii  , jj+1, kk+1)
+                ) +
+                0.25 * (
+                    fine(ii-1, jj-1, kk+1) + fine(ii+1, jj-1, kk+1) +
+                    fine(ii-1, jj+1, kk+1) + fine(ii+1, jj+1, kk+1)
+                )
+             );
+         }
+#endif
+    }
+private:
+    amrex::Array4< amrex::Real const > m_fine;
+    amrex::Array4< amrex::Real       > m_coarse;
+    int m_refinement_ratio;
+};
+
+#endif // INTERPOLATECURRENTFINETOCOARSE_H
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 40f8203a9..4f870e79c 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -2,7 +2,7 @@
 #include <WarpX.H>
 #include <WarpX_f.H>
 #include <WarpXSumGuardCells.H>
-#include <Parallelization/CurrentSynchronize.H>
+#include <Parallelization/InterpolateCurrentFineToCoarse.H>
 
 #include <algorithm>
 #include <cstdlib>
@@ -337,7 +337,7 @@ WarpX::SyncCurrent ()
         std::array<      MultiFab*,3> crse { current_cp[lev][0].get(),
                                              current_cp[lev][1].get(),
                                              current_cp[lev][2].get() };
-        SyncCurrent(fine, crse, refinement_ratio[0]);
+        interpolateCurrentFineToCoarse(fine, crse, refinement_ratio[0]);
     }
 
     // For each level
@@ -350,10 +350,11 @@ WarpX::SyncCurrent ()
 }
 
 void
-WarpX::SyncCurrent (const std::array<const amrex::MultiFab*,3>& fine,
-                    const std::array<      amrex::MultiFab*,3>& coarse,
-                    int const refinement_ratio)
+WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 > const & fine,
+                                        std::array< amrex::MultiFab       *, 3 > const & coarse,
+                                        int const refinement_ratio)
 {
+    BL_PROFILE("InterpolateCurrentFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
     const IntVect& ng = (fine[0]->nGrowVect() + 1) / refinement_ratio; // add equivalent no. of guards to coarse patch
 
@@ -372,11 +373,11 @@ WarpX::SyncCurrent (const std::array<const amrex::MultiFab*,3>& fine,
                 auto const & arrCoarse = coarse[idim]->array(mfi);
 
                 if( idim == 0 )
-                    amrex::ParallelFor( bx, WarpxSyncCurrent<0>(arrFine, arrCoarse, refinement_ratio) );
+                    amrex::ParallelFor( bx, InterpolateCurrentFineToCoarse<0>(arrFine, arrCoarse, refinement_ratio) );
                 else if( idim == 1 )
-                    amrex::ParallelFor( bx, WarpxSyncCurrent<1>(arrFine, arrCoarse, refinement_ratio) );
+                    amrex::ParallelFor( bx, InterpolateCurrentFineToCoarse<1>(arrFine, arrCoarse, refinement_ratio) );
                 else if( idim == 2 )
-                    amrex::ParallelFor( bx, WarpxSyncCurrent<2>(arrFine, arrCoarse, refinement_ratio) );
+                    amrex::ParallelFor( bx, InterpolateCurrentFineToCoarse<2>(arrFine, arrCoarse, refinement_ratio) );
             }
         }
     }
@@ -452,7 +453,7 @@ WarpX::RestrictCurrentFromFineToCoarsePatch (int lev)
     std::array<      MultiFab*,3> crse { current_cp[lev][0].get(),
                                          current_cp[lev][1].get(),
                                          current_cp[lev][2].get() };
-    SyncCurrent(fine, crse, refinement_ratio[0]);
+    interpolateCurrentFineToCoarse(fine, crse, refinement_ratio[0]);
 }
 
 void
diff --git a/Source/WarpX.H b/Source/WarpX.H
index 9b9fb045c..c59802427 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -428,9 +428,9 @@ private:
      * \param[out] coarse           coarse patches to interpolate to
      * \param[in]  refinement_ratio integer ratio between the two
      */
-    void SyncCurrent (const std::array<const amrex::MultiFab*,3>& fine,
-                      const std::array<      amrex::MultiFab*,3>& coarse,
-                      int const refinement_ratio);
+    void interpolateCurrentFineToCoarse (std::array< amrex::MultiFab const *, 3 > const & fine,
+                                         std::array< amrex::MultiFab       *, 3 > const & coarse,
+                                         int const refinement_ratio);
 
     /** \brief Fills the values of the charge density on the coarse patch by
      *  averaging the values of the charge density of the fine patch (on the same level).
-- 
cgit v1.2.3


From 69b0a98b3e8495eb2eca4269533516d66f3ff6f3 Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <weiqunzhang@lbl.gov>
Date: Fri, 27 Sep 2019 11:41:49 -0700
Subject: nodal auxilary data

---
 Source/Parallelization/WarpXComm.cpp   | 151 ++++++++++
 Source/Parallelization/WarpXComm_K.H   | 485 +++++++++++++++++++++++++++++++++
 Source/Parallelization/WarpXRegrid.cpp |   3 +-
 Source/WarpX.H                         |   2 +
 Source/WarpX.cpp                       |  56 ++--
 5 files changed, 680 insertions(+), 17 deletions(-)

(limited to 'Source/Parallelization/WarpXComm.cpp')

diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 990d0f988..e67acec22 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -51,6 +51,157 @@ WarpX::UpdateAuxilaryData ()
 {
     BL_PROFILE("UpdateAuxilaryData()");
 
+    if (Bfield_aux[0][0]->is_nodal()) {
+        UpdateNodalAuxilaryData();
+    } else {
+        UpdateStagAuxilaryData();
+    }
+}
+
+void
+WarpX::UpdateNodalAuxilaryData ()
+{
+    // For level 0, we only need to do the average.
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+    for (MFIter mfi(*Bfield_aux[0][0]); mfi.isValid(); ++mfi)
+    {
+        Array4<Real> const& bx_aux = Bfield_aux[0][0]->array(mfi);
+        Array4<Real> const& by_aux = Bfield_aux[0][1]->array(mfi);
+        Array4<Real> const& bz_aux = Bfield_aux[0][2]->array(mfi);
+        Array4<Real const> const& bx_fp = Bfield_fp[0][0]->const_array(mfi);
+        Array4<Real const> const& by_fp = Bfield_fp[0][1]->const_array(mfi);
+        Array4<Real const> const& bz_fp = Bfield_fp[0][2]->const_array(mfi);
+
+        Array4<Real> const& ex_aux = Efield_aux[0][0]->array(mfi);
+        Array4<Real> const& ey_aux = Efield_aux[0][1]->array(mfi);
+        Array4<Real> const& ez_aux = Efield_aux[0][2]->array(mfi);
+        Array4<Real const> const& ex_fp = Efield_fp[0][0]->const_array(mfi);
+        Array4<Real const> const& ey_fp = Efield_fp[0][1]->const_array(mfi);
+        Array4<Real const> const& ez_fp = Efield_fp[0][2]->const_array(mfi);
+
+        const Box& bx = mfi.fabbox();
+        amrex::ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+        {
+            warpx_interp_nd_bfield_x(j,k,l, bx_aux, bx_fp);
+            warpx_interp_nd_bfield_y(j,k,l, by_aux, by_fp);
+            warpx_interp_nd_bfield_z(j,k,l, bz_aux, bz_fp);
+            warpx_interp_nd_efield_x(j,k,l, ex_aux, ex_fp);
+            warpx_interp_nd_efield_y(j,k,l, ey_aux, ey_fp);
+            warpx_interp_nd_efield_z(j,k,l, ez_aux, ez_fp);
+        });
+    }
+
+    for (int lev = 1; lev <= finest_level; ++lev)
+    {
+        BoxArray const& nba = Bfield_aux[lev][0]->boxArray();
+        BoxArray const& cnba = amrex::coarsen(nba,2);
+        DistributionMapping const& dm = Bfield_aux[lev][0]->DistributionMap();
+        auto const& cperiod = Geom(lev-1).periodicity();
+
+        // Bfield
+        {
+            Array<std::unique_ptr<MultiFab>,3> Btmp;
+            if (Bfield_cax[lev][0]) {
+                for (int i = 0; i < 3; ++i) {
+                    Btmp[i].reset(new MultiFab(*Bfield_cax[lev][i], amrex::make_alias, 0, 1));
+                }
+            } else {
+                IntVect ngtmp = Bfield_aux[lev-1][0]->nGrowVect();
+                for (int i = 0; i < 3; ++i) {
+                    Btmp[i].reset(new MultiFab(cnba, dm, 1, ngtmp));
+                }
+            }
+            // ParallelCopy from coarse level
+            for (int i = 0; i < 3; ++i) {
+                IntVect ng = Btmp[i]->nGrowVect();
+                Btmp[i]->ParallelCopy(*Bfield_aux[lev-1][i], 0, 0, 1, ng, ng, cperiod);
+            }
+
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+            for (MFIter mfi(*Bfield_aux[lev][0]); mfi.isValid(); ++mfi)
+            {
+                Array4<Real> const& bx_aux = Bfield_aux[lev][0]->array(mfi);
+                Array4<Real> const& by_aux = Bfield_aux[lev][1]->array(mfi);
+                Array4<Real> const& bz_aux = Bfield_aux[lev][2]->array(mfi);
+                Array4<Real const> const& bx_fp = Bfield_fp[lev][0]->const_array(mfi);
+                Array4<Real const> const& by_fp = Bfield_fp[lev][1]->const_array(mfi);
+                Array4<Real const> const& bz_fp = Bfield_fp[lev][2]->const_array(mfi);
+                Array4<Real const> const& bx_cp = Bfield_cp[lev][0]->const_array(mfi);
+                Array4<Real const> const& by_cp = Bfield_cp[lev][1]->const_array(mfi);
+                Array4<Real const> const& bz_cp = Bfield_cp[lev][2]->const_array(mfi);
+                Array4<Real const> const& bx_c = Btmp[0]->const_array(mfi);
+                Array4<Real const> const& by_c = Btmp[1]->const_array(mfi);
+                Array4<Real const> const& bz_c = Btmp[2]->const_array(mfi);
+
+                const Box& bx = mfi.fabbox();
+                amrex::ParallelFor(bx,
+                [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+                {
+                    warpx_interp_nd_bfield_x(j,k,l, bx_aux, bx_fp, bx_cp, bx_c);
+                    warpx_interp_nd_bfield_y(j,k,l, by_aux, by_fp, by_cp, by_c);
+                    warpx_interp_nd_bfield_z(j,k,l, bz_aux, bz_fp, bz_cp, bz_c);
+                });
+            }
+        }
+
+        // Efield
+        {
+            Array<std::unique_ptr<MultiFab>,3> Etmp;
+            if (Efield_cax[lev][0]) {
+                for (int i = 0; i < 3; ++i) {
+                    Etmp[i].reset(new MultiFab(*Efield_cax[lev][i], amrex::make_alias, 0, 1));
+                }
+            } else {
+                IntVect ngtmp = Efield_aux[lev-1][0]->nGrowVect();
+                for (int i = 0; i < 3; ++i) {
+                    Etmp[i].reset(new MultiFab(cnba, dm, 1, ngtmp));
+                }
+            }
+            // ParallelCopy from coarse level
+            for (int i = 0; i < 3; ++i) {
+                IntVect ng = Etmp[i]->nGrowVect();
+                Etmp[i]->ParallelCopy(*Efield_aux[lev-1][i], 0, 0, 1, ng, ng, cperiod);
+            }
+
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+            for (MFIter mfi(*Efield_aux[lev][0]); mfi.isValid(); ++mfi)
+            {
+                Array4<Real> const& ex_aux = Efield_aux[lev][0]->array(mfi);
+                Array4<Real> const& ey_aux = Efield_aux[lev][1]->array(mfi);
+                Array4<Real> const& ez_aux = Efield_aux[lev][2]->array(mfi);
+                Array4<Real const> const& ex_fp = Efield_fp[lev][0]->const_array(mfi);
+                Array4<Real const> const& ey_fp = Efield_fp[lev][1]->const_array(mfi);
+                Array4<Real const> const& ez_fp = Efield_fp[lev][2]->const_array(mfi);
+                Array4<Real const> const& ex_cp = Efield_cp[lev][0]->const_array(mfi);
+                Array4<Real const> const& ey_cp = Efield_cp[lev][1]->const_array(mfi);
+                Array4<Real const> const& ez_cp = Efield_cp[lev][2]->const_array(mfi);
+                Array4<Real const> const& ex_c = Etmp[0]->const_array(mfi);
+                Array4<Real const> const& ey_c = Etmp[1]->const_array(mfi);
+                Array4<Real const> const& ez_c = Etmp[2]->const_array(mfi);
+
+                const Box& bx = mfi.fabbox();
+                amrex::ParallelFor(bx,
+                [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+                {
+                    warpx_interp_nd_efield_x(j,k,l, ex_aux, ex_fp, ex_cp, ex_c);
+                    warpx_interp_nd_efield_y(j,k,l, ey_aux, ey_fp, ey_cp, ey_c);
+                    warpx_interp_nd_efield_z(j,k,l, ez_aux, ez_fp, ez_cp, ez_c);
+                });
+            }
+        }
+    }
+}
+
+void
+WarpX::UpdateStagAuxilaryData ()
+{
     for (int lev = 1; lev <= finest_level; ++lev)
     {
         const auto& crse_period = Geom(lev-1).periodicity();
diff --git a/Source/Parallelization/WarpXComm_K.H b/Source/Parallelization/WarpXComm_K.H
index 093323ec3..5da867c9f 100644
--- a/Source/Parallelization/WarpXComm_K.H
+++ b/Source/Parallelization/WarpXComm_K.H
@@ -158,4 +158,489 @@ void warpx_interp_efield_z (int j, int k, int l,
 #endif
 }
 
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_bfield_x (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Bxa,
+                               amrex::Array4<amrex::Real const> const& Bxf,
+                               amrex::Array4<amrex::Real const> const& Bxc,
+                               amrex::Array4<amrex::Real const> const& Bxg)
+{
+    using namespace amrex;
+
+    int jg = amrex::coarsen(j,2);
+    Real wx = (j == jg*2) ? 0.0 : 0.5;
+    Real owx = 1.0-wx;
+
+    int kg = amrex::coarsen(k,2);
+    Real wy = (k == kg*2) ? 0.0 : 0.5;
+    Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 2)
+
+    // interp from coarse nodal to fine nodal
+    Real bg = owx * owy * Bxg(jg  ,kg  ,0)
+        +     owx *  wy * Bxg(jg  ,kg+1,0)
+        +      wx * owy * Bxg(jg+1,kg  ,0)
+        +      wx *  wy * Bxg(jg+1,kg+1,0);
+
+    // interp from coarse staggered to fine nodal
+    wy = 0.5-wy;  owy = 1.0-wy;
+    Real bc = owx * owy * Bxc(jg  ,kg  ,0)
+        +     owx *  wy * Bxc(jg  ,kg-1,0)
+        +      wx * owy * Bxc(jg+1,kg  ,0)
+        +      wx *  wy * Bxc(jg+1,kg-1,0);
+
+    // interp from fine staggered to fine nodal
+    Real bf = 0.5*(Bxf(j,k-1,0) + Bxf(j,k,0));
+
+#else
+
+    int lg = amrex::coarsen(l,2);
+    Real wz = (l == lg*2) ? 0.0 : 0.5;
+    Real owz = 1.0-wz;
+
+    // interp from coarse nodal to fine nodal
+    Real bg = owx * owy * owz * Bxg(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Bxg(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Bxg(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Bxg(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Bxg(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Bxg(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Bxg(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Bxg(jg+1,kg+1,lg+1);
+
+    // interp from coarse staggered to fine nodal
+    wy = 0.5-wy;  owy = 1.0-wy;
+    wz = 0.5-wz;  owz = 1.0-wz;
+    Real bc = owx * owy * owz * Bxc(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Bxc(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Bxc(jg  ,kg-1,lg  )
+        +      wx *  wy * owz * Bxc(jg+1,kg-1,lg  )
+        +     owx * owy *  wz * Bxc(jg  ,kg  ,lg-1)
+        +      wx * owy *  wz * Bxc(jg+1,kg  ,lg-1)
+        +     owx *  wy *  wz * Bxc(jg  ,kg-1,lg-1)
+        +      wx *  wy *  wz * Bxc(jg+1,kg-1,lg-1);
+
+    // interp from fine stagged to fine nodal
+    Real bf = 0.25*(Bxf(j,k-1,l-1) + Bxf(j,k,l-1) + Bxf(j,k-1,l) + Bxf(j,k,l));
+#endif
+
+    Bxa(j,k,l) = bg + (bf-bc);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_bfield_y (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Bya,
+                               amrex::Array4<amrex::Real const> const& Byf,
+                               amrex::Array4<amrex::Real const> const& Byc,
+                               amrex::Array4<amrex::Real const> const& Byg)
+{
+    using namespace amrex;
+
+    int jg = amrex::coarsen(j,2);
+    Real wx = (j == jg*2) ? 0.0 : 0.5;
+    Real owx = 1.0-wx;
+
+    int kg = amrex::coarsen(k,2);
+    Real wy = (k == kg*2) ? 0.0 : 0.5;
+    Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 2)
+
+    // interp from coarse nodal to fine nodal
+    Real bg = owx * owy * Byg(jg  ,kg  ,0)
+        +     owx *  wy * Byg(jg  ,kg+1,0)
+        +      wx * owy * Byg(jg+1,kg  ,0)
+        +      wx *  wy * Byg(jg+1,kg+1,0);
+
+    // interp from coarse stagged (cell-centered for By) to fine nodal
+    wx = 0.5-wx;  owx = 1.0-wx;
+    wy = 0.5-wy;  owy = 1.0-wy;
+    Real bc = owx * owy * Byc(jg  ,kg  ,0)
+        +     owx *  wy * Byc(jg  ,kg-1,0)
+        +      wx * owy * Byc(jg-1,kg  ,0)
+        +      wx *  wy * Byc(jg-1,kg-1,0);
+
+    // interp form fine stagger (cell-centered for By) to fine nodal
+    Real bf = 0.25*(Byf(j,k,0) + Byf(j-1,k,0) + Byf(j,k-1,0) + Byf(j-1,k-1,0));
+
+#else
+
+    int lg = amrex::coarsen(l,2);
+    Real wz = (l == lg*2) ? 0.0 : 0.5;
+    Real owz = 1.0-wz;
+
+    // interp from coarse nodal to fine nodal
+    Real bg = owx * owy * owz * Byg(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Byg(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Byg(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Byg(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Byg(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Byg(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Byg(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Byg(jg+1,kg+1,lg+1);
+
+    // interp from coarse staggered to fine nodal
+    wx = 0.5-wx;  owx = 1.0-wx;
+    wz = 0.5-wz;  owz = 1.0-wz;
+    Real bc = owx * owy * owz * Byc(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Byc(jg-1,kg  ,lg  )
+        +     owx *  wy * owz * Byc(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Byc(jg-1,kg+1,lg  )
+        +     owx * owy *  wz * Byc(jg  ,kg  ,lg-1)
+        +      wx * owy *  wz * Byc(jg-1,kg  ,lg-1)
+        +     owx *  wy *  wz * Byc(jg  ,kg+1,lg-1)
+        +      wx *  wy *  wz * Byc(jg-1,kg+1,lg-1);
+
+    // interp from fine stagged to fine nodal
+    Real bf = 0.25*(Byf(j-1,k,l-1) + Byf(j,k,l-1) + Byf(j-1,k,l) + Byf(j,k,l));
+
+#endif
+
+    Bya(j,k,l) = bg + (bf-bc);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_bfield_z (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Bza,
+                               amrex::Array4<amrex::Real const> const& Bzf,
+                               amrex::Array4<amrex::Real const> const& Bzc,
+                               amrex::Array4<amrex::Real const> const& Bzg)
+{
+    using namespace amrex;
+
+    int jg = amrex::coarsen(j,2);
+    Real wx = (j == jg*2) ? 0.0 : 0.5;
+    Real owx = 1.0-wx;
+
+    int kg = amrex::coarsen(k,2);
+    Real wy = (k == kg*2) ? 0.0 : 0.5;
+    Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 2)
+
+    // interp from coarse nodal to fine nodal
+    Real bg = owx * owy * Bzg(jg  ,kg  ,0)
+        +     owx *  wy * Bzg(jg  ,kg+1,0)
+        +      wx * owy * Bzg(jg+1,kg  ,0)
+        +      wx *  wy * Bzg(jg+1,kg+1,0);
+
+    // interp from coarse staggered to fine nodal
+    wx = 0.5-wx;  owx = 1.0-wx;
+    Real bc = owx * owy * Bzc(jg  ,kg  ,0)
+        +     owx *  wy * Bzc(jg  ,kg+1,0)
+        +      wx * owy * Bzc(jg-1,kg  ,0)
+        +      wx *  wy * Bzc(jg-1,kg+1,0);
+
+    // interp from fine staggered to fine nodal
+    Real bf = 0.5*(Bzf(j-1,k,0) + Bzf(j,k,0));
+
+#else
+
+    int lg = amrex::coarsen(l,2);
+    Real wz = (l == lg*2) ? 0.0 : 0.5;
+    Real owz = 1.0-wz;
+
+    // interp from coarse nodal to fine nodal
+    Real bg = owx * owy * owz * Bzg(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Bzg(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Bzg(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Bzg(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Bzg(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Bzg(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Bzg(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Bzg(jg+1,kg+1,lg+1);
+
+    // interp from coarse staggered to fine nodal
+    wx = 0.5-wx;  owx = 1.0-wx;
+    wy = 0.5-wy;  owy = 1.0-wy;
+    Real bc = owx * owy * owz * Bzc(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Bzc(jg-1,kg  ,lg  )
+        +     owx *  wy * owz * Bzc(jg  ,kg-1,lg  )
+        +      wx *  wy * owz * Bzc(jg-1,kg-1,lg  )
+        +     owx * owy *  wz * Bzc(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Bzc(jg-1,kg  ,lg+1)
+        +     owx *  wy *  wz * Bzc(jg  ,kg-1,lg+1)
+        +      wx *  wy *  wz * Bzc(jg-1,kg-1,lg+1);
+
+    // interp from fine stagged to fine nodal
+    Real bf = 0.25*(Bzf(j-1,k-1,l) + Bzf(j,k-1,l) + Bzf(j-1,k,l) + Bzf(j,k,l));
+
+#endif
+
+    Bza(j,k,l) = bg + (bf-bc);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_bfield_x (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Bxa,
+                               amrex::Array4<amrex::Real const> const& Bxf)
+{
+#if (AMREX_SPACEDIM == 2)
+    Bxa(j,k,0) = 0.5*(Bxf(j,k-1,0) + Bxf(j,k,0));
+#else
+    Bxa(j,k,l) = 0.25*(Bxf(j,k-1,l-1) + Bxf(j,k,l-1) + Bxf(j,k-1,l) + Bxf(j,k,l));
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_bfield_y (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Bya,
+                               amrex::Array4<amrex::Real const> const& Byf)
+{
+#if (AMREX_SPACEDIM == 2)
+    Bya(j,k,0) = 0.25*(Byf(j,k,0) + Byf(j-1,k,0) + Byf(j,k-1,0) + Byf(j-1,k-1,0));
+#else
+    Bya(j,k,l) = 0.25*(Byf(j-1,k,l-1) + Byf(j,k,l-1) + Byf(j-1,k,l) + Byf(j,k,l));
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_bfield_z (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Bza,
+                               amrex::Array4<amrex::Real const> const& Bzf)
+{
+#if (AMREX_SPACEDIM == 2)
+    Bza(j,k,0) = 0.5*(Bzf(j-1,k,0) + Bzf(j,k,0));
+#else
+    Bza(j,k,l) = 0.25*(Bzf(j-1,k-1,l) + Bzf(j,k-1,l) + Bzf(j-1,k,l) + Bzf(j,k,l));
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_efield_x (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Exa,
+                               amrex::Array4<amrex::Real const> const& Exf,
+                               amrex::Array4<amrex::Real const> const& Exc,
+                               amrex::Array4<amrex::Real const> const& Exg)
+{
+    using namespace amrex;
+
+    int jg = amrex::coarsen(j,2);
+    Real wx = (j == jg*2) ? 0.0 : 0.5;
+    Real owx = 1.0-wx;
+
+    int kg = amrex::coarsen(k,2);
+    Real wy = (k == kg*2) ? 0.0 : 0.5;
+    Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 2)
+
+    // interp from coarse nodal to fine nodal
+    Real eg = owx * owy * Exg(jg  ,kg  ,0)
+        +     owx *  wy * Exg(jg  ,kg+1,0)
+        +      wx * owy * Exg(jg+1,kg  ,0)
+        +      wx *  wy * Exg(jg+1,kg+1,0);
+
+    // interp from coarse staggered to fine nodal
+    wx = 0.5-wx;  owx = 1.0-wx;
+    Real ec = owx * owy * Exc(jg  ,kg  ,0)
+        +     owx *  wy * Exc(jg  ,kg+1,0)
+        +      wx * owy * Exc(jg-1,kg  ,0)
+        +      wx *  wy * Exc(jg-1,kg+1,0);
+
+    // interp from fine staggered to fine nodal
+    Real ef = 0.5*(Exf(j-1,k,0) + Exf(j,k,0));
+
+#else
+
+    int lg = amrex::coarsen(l,2);
+    Real wz = (l == lg*2) ? 0.0 : 0.5;
+    Real owz = 1.0-wz;
+
+    // interp from coarse nodal to fine nodal
+    Real eg = owx * owy * owz * Exg(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Exg(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Exg(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Exg(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Exg(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Exg(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Exg(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Exg(jg+1,kg+1,lg+1);
+
+    // interp from coarse staggered to fine nodal
+    wx = 0.5-wx;  owx = 1.0-wx;
+    Real ec = owx * owy * owz * Exc(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Exc(jg-1,kg  ,lg  )
+        +     owx *  wy * owz * Exc(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Exc(jg-1,kg+1,lg  )
+        +     owx * owy *  wz * Exc(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Exc(jg-1,kg  ,lg+1)
+        +     owx *  wy *  wz * Exc(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Exc(jg-1,kg+1,lg+1);
+
+    // interp from fine staggered to fine nodal
+    Real ef = 0.5*(Exf(j-1,k,l) + Exf(j,k,l));
+
+#endif
+
+    Exa(j,k,l) = eg + (ef-ec);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_efield_y (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Eya,
+                               amrex::Array4<amrex::Real const> const& Eyf,
+                               amrex::Array4<amrex::Real const> const& Eyc,
+                               amrex::Array4<amrex::Real const> const& Eyg)
+{
+    using namespace amrex;
+
+    int jg = amrex::coarsen(j,2);
+    Real wx = (j == jg*2) ? 0.0 : 0.5;
+    Real owx = 1.0-wx;
+
+    int kg = amrex::coarsen(k,2);
+    Real wy = (k == kg*2) ? 0.0 : 0.5;
+    Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 2)
+
+    // interp from coarse nodal and coarse staggered to fine nodal
+    Real eg = owx * owy * (Eyg(jg  ,kg  ,0) + Eyc(jg  ,kg  ,0))
+        +     owx *  wy * (Eyg(jg  ,kg+1,0) + Eyc(jg  ,kg+1,0))
+        +      wx * owy * (Eyg(jg+1,kg  ,0) + Eyc(jg+1,kg  ,0))
+        +      wx *  wy * (Eyg(jg+1,kg+1,0) + Eyc(jg+1,kg+1,0));
+    Real ec = 0.0;
+
+    // interp from fine staggered to fine nodal
+    Real ef = Eyf(j,k,0);
+
+#else
+
+    int lg = amrex::coarsen(l,2);
+    Real wz = (l == lg*2) ? 0.0 : 0.5;
+    Real owz = 1.0-wz;
+
+    // interp from coarse nodal to fine nodal
+    Real eg = owx * owy * owz * Eyg(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Eyg(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Eyg(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Eyg(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Eyg(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Eyg(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Eyg(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Eyg(jg+1,kg+1,lg+1);
+
+    // interp from coarse staggered to fine nodal
+    wy = 0.5-wy;  owy = 1.0-wy;
+    Real ec = owx * owy * owz * Eyc(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Eyc(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Eyc(jg  ,kg-1,lg  )
+        +      wx *  wy * owz * Eyc(jg+1,kg-1,lg  )
+        +     owx * owy *  wz * Eyc(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Eyc(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Eyc(jg  ,kg-1,lg+1)
+        +      wx *  wy *  wz * Eyc(jg+1,kg-1,lg+1);
+
+    // interp from fine staggered to fine nodal
+    Real ef = 0.5*(Eyf(j,k-1,l) + Eyf(j,k,l));
+
+#endif
+
+    Eya(j,k,l) = eg + (ef-ec);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_efield_z (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Eza,
+                               amrex::Array4<amrex::Real const> const& Ezf,
+                               amrex::Array4<amrex::Real const> const& Ezc,
+                               amrex::Array4<amrex::Real const> const& Ezg)
+{
+    using namespace amrex;
+
+    int jg = amrex::coarsen(j,2);
+    Real wx = (j == jg*2) ? 0.0 : 0.5;
+    Real owx = 1.0-wx;
+
+    int kg = amrex::coarsen(k,2);
+    Real wy = (k == kg*2) ? 0.0 : 0.5;
+    Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 2)
+
+    // interp from coarse nodal to fine nodal
+    Real eg = owx * owy * Ezg(jg  ,kg  ,0)
+        +     owx *  wy * Ezg(jg  ,kg+1,0)
+        +      wx * owy * Ezg(jg+1,kg  ,0)
+        +      wx *  wy * Ezg(jg+1,kg+1,0);
+
+    // interp from coarse stagged to fine nodal
+    wy = 0.5-wy;  owy = 1.0-wy;
+    Real ec = owx * owy * Ezc(jg  ,kg  ,0)
+        +     owx *  wy * Ezc(jg  ,kg-1,0)
+        +      wx * owy * Ezc(jg+1,kg  ,0)
+        +      wx *  wy * Ezc(jg+1,kg-1,0);
+
+    // interp from fine staggered to fine nodal
+    Real ef = 0.5*(Ezf(j,k-1,0) + Ezf(j,k,0));
+
+#else
+
+    int lg = amrex::coarsen(l,2);
+    Real wz = (l == lg*2) ? 0.0 : 0.5;
+    Real owz = 1.0-wz;
+
+    // interp from coarse nodal to fine nodal
+    Real eg = owx * owy * owz * Ezg(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Ezg(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Ezg(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Ezg(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Ezg(jg  ,kg  ,lg+1)
+        +      wx * owy *  wz * Ezg(jg+1,kg  ,lg+1)
+        +     owx *  wy *  wz * Ezg(jg  ,kg+1,lg+1)
+        +      wx *  wy *  wz * Ezg(jg+1,kg+1,lg+1);
+
+    // interp from coarse staggered to fine nodal
+    wz = 0.5-wz;  owz = 1.0-wz;
+    Real ec = owx * owy * owz * Ezc(jg  ,kg  ,lg  )
+        +      wx * owy * owz * Ezc(jg+1,kg  ,lg  )
+        +     owx *  wy * owz * Ezc(jg  ,kg+1,lg  )
+        +      wx *  wy * owz * Ezc(jg+1,kg+1,lg  )
+        +     owx * owy *  wz * Ezc(jg  ,kg  ,lg-1)
+        +      wx * owy *  wz * Ezc(jg+1,kg  ,lg-1)
+        +     owx *  wy *  wz * Ezc(jg  ,kg+1,lg-1)
+        +      wx *  wy *  wz * Ezc(jg+1,kg+1,lg-1);
+
+    // interp from fine staggered to fine nodal
+    Real ef = 0.5*(Ezf(j,k,l-1) + Ezf(j,k,l));
+
+#endif
+
+    Eza(j,k,l) = eg + (ef-ec);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_efield_x (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Exa,
+                               amrex::Array4<amrex::Real const> const& Exf)
+{
+    Exa(j,k,l) = 0.5*(Exf(j-1,k,l) + Exf(j,k,l));
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_efield_y (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Eya,
+                               amrex::Array4<amrex::Real const> const& Eyf)
+{
+#if (AMREX_SPACEDIM == 2)
+    Eya(j,k,0) = Eyf(j,k,0);
+#else
+    Eya(j,k,l) = 0.5*(Eyf(j,k-1,l) + Eyf(j,k,l));
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_nd_efield_z (int j, int k, int l,
+                               amrex::Array4<amrex::Real> const& Eza,
+                               amrex::Array4<amrex::Real const> const& Ezf)
+{
+#if (AMREX_SPACEDIM == 2)
+    Eza(j,k,0) = 0.5*(Ezf(j,k-1,0) + Ezf(j,k,0));
+#else
+    Eza(j,k,l) = 0.5*(Ezf(j,k,l-1) + Ezf(j,k,l));
+#endif
+}
+
 #endif
diff --git a/Source/Parallelization/WarpXRegrid.cpp b/Source/Parallelization/WarpXRegrid.cpp
index 5441755f5..4ae2b6dca 100644
--- a/Source/Parallelization/WarpXRegrid.cpp
+++ b/Source/Parallelization/WarpXRegrid.cpp
@@ -1,5 +1,6 @@
 
 #include <WarpX.H>
+#include <WarpXAlgorithmSelection.H>
 #include <AMReX_BLProfiler.H>
 
 using namespace amrex;
@@ -91,7 +92,7 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa
 
         // Aux patch
 
-        if (lev == 0)
+        if (lev == 0 && field_gathering_algo != GatheringAlgo::MomentumConserving)
         {
             for (int idim = 0; idim < 3; ++idim) {
                 Bfield_aux[lev][idim].reset(new MultiFab(*Bfield_fp[lev][idim], amrex::make_alias, 0, Bfield_aux[lev][idim]->nComp()));
diff --git a/Source/WarpX.H b/Source/WarpX.H
index 7a5344f4f..b8a88b77c 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -203,6 +203,8 @@ public:
     // This function does aux(lev) = fp(lev) + I(aux(lev-1)-cp(lev)).
     // Caller must make sure fp and cp have ghost cells filled.
     void UpdateAuxilaryData ();
+    void UpdateNodalAuxilaryData ();
+    void UpdateStagAuxilaryData ();
 
     // Fill boundary cells including coarse/fine boundaries
     void FillBoundaryB ();
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index d92354f2e..bac0d4121 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -792,16 +792,19 @@ WarpX::AllocLevelMFs (int lev, const BoxArray& ba, const DistributionMapping& dm
     ncomps = n_rz_azimuthal_modes*2 - 1;
 #endif
 
+    bool is_nodal = (field_gathering_algo == GatheringAlgo::MomentumConserving);
+    IntVect ngextra(static_cast<int>(is_nodal));
+
     //
     // The fine patch
     //
-    Bfield_fp[lev][0].reset( new MultiFab(amrex::convert(ba,Bx_nodal_flag),dm,ncomps,ngE));
-    Bfield_fp[lev][1].reset( new MultiFab(amrex::convert(ba,By_nodal_flag),dm,ncomps,ngE));
-    Bfield_fp[lev][2].reset( new MultiFab(amrex::convert(ba,Bz_nodal_flag),dm,ncomps,ngE));
+    Bfield_fp[lev][0].reset( new MultiFab(amrex::convert(ba,Bx_nodal_flag),dm,ncomps,ngE+ngextra));
+    Bfield_fp[lev][1].reset( new MultiFab(amrex::convert(ba,By_nodal_flag),dm,ncomps,ngE+ngextra));
+    Bfield_fp[lev][2].reset( new MultiFab(amrex::convert(ba,Bz_nodal_flag),dm,ncomps,ngE+ngextra));
 
-    Efield_fp[lev][0].reset( new MultiFab(amrex::convert(ba,Ex_nodal_flag),dm,ncomps,ngE));
-    Efield_fp[lev][1].reset( new MultiFab(amrex::convert(ba,Ey_nodal_flag),dm,ncomps,ngE));
-    Efield_fp[lev][2].reset( new MultiFab(amrex::convert(ba,Ez_nodal_flag),dm,ncomps,ngE));
+    Efield_fp[lev][0].reset( new MultiFab(amrex::convert(ba,Ex_nodal_flag),dm,ncomps,ngE+ngextra));
+    Efield_fp[lev][1].reset( new MultiFab(amrex::convert(ba,Ey_nodal_flag),dm,ncomps,ngE+ngextra));
+    Efield_fp[lev][2].reset( new MultiFab(amrex::convert(ba,Ez_nodal_flag),dm,ncomps,ngE+ngextra));
 
     current_fp[lev][0].reset( new MultiFab(amrex::convert(ba,jx_nodal_flag),dm,ncomps,ngJ));
     current_fp[lev][1].reset( new MultiFab(amrex::convert(ba,jy_nodal_flag),dm,ncomps,ngJ));
@@ -848,7 +851,18 @@ WarpX::AllocLevelMFs (int lev, const BoxArray& ba, const DistributionMapping& dm
     //
     // The Aux patch (i.e., the full solution)
     //
-    if (lev == 0)
+    if (is_nodal)
+    {
+        BoxArray const nba = amrex::convert(ba,IntVect::TheUnitVector());
+        Bfield_aux[lev][0].reset( new MultiFab(nba,dm,ncomps,ngE));
+        Bfield_aux[lev][1].reset( new MultiFab(nba,dm,ncomps,ngE));
+        Bfield_aux[lev][2].reset( new MultiFab(nba,dm,ncomps,ngE));
+
+        Efield_aux[lev][0].reset( new MultiFab(nba,dm,ncomps,ngE));
+        Efield_aux[lev][1].reset( new MultiFab(nba,dm,ncomps,ngE));
+        Efield_aux[lev][2].reset( new MultiFab(nba,dm,ncomps,ngE));
+    }
+    else if (lev == 0)
     {
         for (int idir = 0; idir < 3; ++idir) {
             Efield_aux[lev][idir].reset(new MultiFab(*Efield_fp[lev][idir], amrex::make_alias, 0, ncomps));
@@ -929,15 +943,25 @@ WarpX::AllocLevelMFs (int lev, const BoxArray& ba, const DistributionMapping& dm
         cba.coarsen(refRatio(lev-1));
 
         if (n_field_gather_buffer > 0 || mypc->nSpeciesGatherFromMainGrid() > 0) {
-            // Create the MultiFabs for B
-            Bfield_cax[lev][0].reset( new MultiFab(amrex::convert(cba,Bx_nodal_flag),dm,ncomps,ngE));
-            Bfield_cax[lev][1].reset( new MultiFab(amrex::convert(cba,By_nodal_flag),dm,ncomps,ngE));
-            Bfield_cax[lev][2].reset( new MultiFab(amrex::convert(cba,Bz_nodal_flag),dm,ncomps,ngE));
-
-            // Create the MultiFabs for E
-            Efield_cax[lev][0].reset( new MultiFab(amrex::convert(cba,Ex_nodal_flag),dm,ncomps,ngE));
-            Efield_cax[lev][1].reset( new MultiFab(amrex::convert(cba,Ey_nodal_flag),dm,ncomps,ngE));
-            Efield_cax[lev][2].reset( new MultiFab(amrex::convert(cba,Ez_nodal_flag),dm,ncomps,ngE));
+            if (field_gathering_algo == GatheringAlgo::MomentumConserving) {
+                BoxArray const& cnba = amrex::convert(cba,IntVect::TheUnitVector());
+                Bfield_cax[lev][0].reset( new MultiFab(cnba,dm,ncomps,ngE));
+                Bfield_cax[lev][1].reset( new MultiFab(cnba,dm,ncomps,ngE));
+                Bfield_cax[lev][2].reset( new MultiFab(cnba,dm,ncomps,ngE));
+                Efield_cax[lev][0].reset( new MultiFab(cnba,dm,ncomps,ngE));
+                Efield_cax[lev][1].reset( new MultiFab(cnba,dm,ncomps,ngE));
+                Efield_cax[lev][2].reset( new MultiFab(cnba,dm,ncomps,ngE));
+            } else {
+                // Create the MultiFabs for B
+                Bfield_cax[lev][0].reset( new MultiFab(amrex::convert(cba,Bx_nodal_flag),dm,ncomps,ngE));
+                Bfield_cax[lev][1].reset( new MultiFab(amrex::convert(cba,By_nodal_flag),dm,ncomps,ngE));
+                Bfield_cax[lev][2].reset( new MultiFab(amrex::convert(cba,Bz_nodal_flag),dm,ncomps,ngE));
+
+                // Create the MultiFabs for E
+                Efield_cax[lev][0].reset( new MultiFab(amrex::convert(cba,Ex_nodal_flag),dm,ncomps,ngE));
+                Efield_cax[lev][1].reset( new MultiFab(amrex::convert(cba,Ey_nodal_flag),dm,ncomps,ngE));
+                Efield_cax[lev][2].reset( new MultiFab(amrex::convert(cba,Ez_nodal_flag),dm,ncomps,ngE));
+            }
 
             gather_buffer_masks[lev].reset( new iMultiFab(ba, dm, ncomps, 1) );
             // Gather buffer masks have 1 ghost cell, because of the fact
-- 
cgit v1.2.3


From 45b6e243030bb72816f79f55aa60f88f7aa7f7bc Mon Sep 17 00:00:00 2001
From: Axel Huebl <axel.huebl@plasma.ninja>
Date: Thu, 26 Sep 2019 18:24:45 -0700
Subject: Rho Synchronize: Port to GPU

Port the charge density synchronize functions to GPU.
---
 Source/FortranInterface/WarpX_f.F90                |  59 ----------
 Source/FortranInterface/WarpX_f.H                  |   9 --
 .../InterpolateDensityFineToCoarse.H               | 120 +++++++++++++++++++++
 Source/Parallelization/WarpXComm.cpp               |  35 +++---
 Source/WarpX.H                                     |   8 +-
 5 files changed, 143 insertions(+), 88 deletions(-)
 create mode 100644 Source/Parallelization/InterpolateDensityFineToCoarse.H

(limited to 'Source/Parallelization/WarpXComm.cpp')

diff --git a/Source/FortranInterface/WarpX_f.F90 b/Source/FortranInterface/WarpX_f.F90
index 2ddf9409d..762ed2cd8 100644
--- a/Source/FortranInterface/WarpX_f.F90
+++ b/Source/FortranInterface/WarpX_f.F90
@@ -51,65 +51,6 @@ contains
 
   end subroutine warpx_compute_E
 
-  subroutine warpx_sync_rho_2d (lo, hi, crse, clo, chi, fine, flo, fhi, nc) &
-       bind(c, name='warpx_sync_rho_2d')
-    integer, intent(in) :: lo(2), hi(2), flo(2), fhi(2), clo(2), chi(2), nc
-    real(amrex_real), intent(in   ) :: fine(flo(1):fhi(1),flo(2):fhi(2),nc)
-    real(amrex_real), intent(inout) :: crse(clo(1):chi(1),clo(2):chi(2),nc)
-
-    integer :: i,j,ii,jj,m
-
-    do m = 1, nc
-       do j = lo(2), hi(2)
-          jj = j*2
-          do i = lo(1), hi(1)
-             ii = i*2
-             crse(i,j,m) = 0.25d0 * &
-                  ( fine(ii,jj,m) + 0.5d0 *(fine(ii-1,jj  ,m)+fine(ii+1,jj  ,m) &
-                  &                       + fine(ii  ,jj-1,m)+fine(ii  ,jj+1,m)) &
-                  &               + 0.25d0*(fine(ii-1,jj-1,m)+fine(ii+1,jj-1,m) &
-                  &                       + fine(ii-1,jj+1,m)+fine(ii+1,jj+1,m)) )
-          end do
-       end do
-    end do
-  end subroutine warpx_sync_rho_2d
-
-  subroutine warpx_sync_rho_3d (lo, hi, crse, clo, chi, fine, flo, fhi, nc) &
-       bind(c, name='warpx_sync_rho_3d')
-    integer, intent(in) :: lo(3), hi(3), flo(3), fhi(3), clo(3), chi(3), nc
-    real(amrex_real), intent(in   ) :: fine(flo(1):fhi(1),flo(2):fhi(2),flo(3):fhi(3),nc)
-    real(amrex_real), intent(inout) :: crse(clo(1):chi(1),clo(2):chi(2),clo(3):chi(3),nc)
-
-    integer :: i,j,k,ii,jj,kk,m
-
-    do m = 1, nc
-       do k = lo(3), hi(3)
-          kk = k*2
-          do j = lo(2), hi(2)
-             jj = j*2
-             do i = lo(1), hi(1)
-                ii = i*2
-                crse(i,j,k,m) = 0.125d0 * &
-                     (fine(ii,jj,kk,m) + 0.5d0  *(fine(ii-1,jj  ,kk  ,m)+fine(ii+1,jj  ,kk  ,m) &
-                     &                          + fine(ii  ,jj-1,kk  ,m)+fine(ii  ,jj+1,kk  ,m) &
-                     &                          + fine(ii  ,jj  ,kk-1,m)+fine(ii  ,jj  ,kk+1,m)) &
-                     &                 + 0.25d0 *(fine(ii-1,jj-1,kk  ,m)+fine(ii+1,jj-1,kk  ,m) &
-                     &                          + fine(ii-1,jj+1,kk  ,m)+fine(ii+1,jj+1,kk  ,m) &
-                     &                          + fine(ii-1,jj  ,kk-1,m)+fine(ii+1,jj  ,kk-1,m) &
-                     &                          + fine(ii-1,jj  ,kk+1,m)+fine(ii+1,jj  ,kk+1,m) &
-                     &                          + fine(ii  ,jj-1,kk-1,m)+fine(ii  ,jj+1,kk-1,m) &
-                     &                          + fine(ii  ,jj-1,kk+1,m)+fine(ii  ,jj+1,kk+1,m)) &
-                     &                 + 0.125d0*(fine(ii-1,jj-1,kk-1,m)+fine(ii-1,jj-1,kk+1,m) &
-                     &                          + fine(ii-1,jj+1,kk-1,m)+fine(ii-1,jj+1,kk+1,m) &
-                     &                          + fine(ii+1,jj-1,kk-1,m)+fine(ii+1,jj-1,kk+1,m) &
-                     &                          + fine(ii+1,jj+1,kk-1,m)+fine(ii+1,jj+1,kk+1,m)))
-             end do
-          end do
-       end do
-    end do
-
-  end subroutine warpx_sync_rho_3d
-
   subroutine warpx_build_buffer_masks (lo, hi, msk, mlo, mhi, gmsk, glo, ghi, ng) &
        bind(c, name='warpx_build_buffer_masks')
     integer, dimension(3), intent(in) :: lo, hi, mlo, mhi, glo, ghi
diff --git a/Source/FortranInterface/WarpX_f.H b/Source/FortranInterface/WarpX_f.H
index 2d5d346e2..b51a83387 100644
--- a/Source/FortranInterface/WarpX_f.H
+++ b/Source/FortranInterface/WarpX_f.H
@@ -16,8 +16,6 @@
 
 #if (AMREX_SPACEDIM == 3)
 
-#define WRPX_SYNC_RHO                    warpx_sync_rho_3d
-
 #define WRPX_SUM_FINE_TO_CRSE_NODAL      warpx_sum_fine_to_crse_nodal_3d
 #define WRPX_ZERO_OUT_BNDRY              warpx_zero_out_bndry_3d
 #define WRPX_BUILD_MASK                  warpx_build_mask_3d
@@ -30,8 +28,6 @@
 
 #elif (AMREX_SPACEDIM == 2)
 
-#define WRPX_SYNC_RHO                    warpx_sync_rho_2d
-
 #define WRPX_SUM_FINE_TO_CRSE_NODAL      warpx_sum_fine_to_crse_nodal_2d
 #define WRPX_ZERO_OUT_BNDRY              warpx_zero_out_bndry_2d
 #define WRPX_BUILD_MASK                  warpx_build_mask_2d
@@ -229,11 +225,6 @@ extern "C"
 #endif
                           const amrex::Real* sigbz, int sigbz_lo, int sigbz_hi);
 
-    void WRPX_SYNC_RHO (const int* lo, const int* hi,
-                        BL_FORT_FAB_ARG_ANYD(crse),
-                        const BL_FORT_FAB_ARG_ANYD(fine),
-                        const int* ncomp);
-
 #ifdef WARPX_USE_PSATD
     void warpx_fft_mpi_init (int fcomm);
     void warpx_fft_domain_decomp (int* warpx_local_nz, int* warpx_local_z0,
diff --git a/Source/Parallelization/InterpolateDensityFineToCoarse.H b/Source/Parallelization/InterpolateDensityFineToCoarse.H
new file mode 100644
index 000000000..947db2aac
--- /dev/null
+++ b/Source/Parallelization/InterpolateDensityFineToCoarse.H
@@ -0,0 +1,120 @@
+/* Copyright 2019 Axel Huebl, Weiqun Zhang
+ *
+ * This file is part of WarpX.
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+
+#ifndef INTERPOLATE_DENSITY_FINE_TO_COARSE_H
+#define INTERPOLATE_DENSITY_FINE_TO_COARSE_H
+
+#include <AMReX_Array4.H>
+#include <AMReX_REAL.H>
+#include <AMReX_BLassert.H>
+#include <AMReX_Extension.H>
+#include <AMReX_GpuQualifiers.H>
+
+#include <utility> // std::move
+
+
+/** Fill a charge density (rho) coarse patch with averaged values from a fine patch
+ *
+ * Fills the values of the charge density on the coarse patch
+ * by averaging the values of the charge density of the fine patch.
+ */
+class InterpolateDensityFineToCoarse
+{
+public:
+    /** Construct with fine and coarse patch and their refinement ratio
+     *
+     * @param[in]     fine read-only fine patch
+     * @param[in,out] coarse overwritten coarse patch
+     * @param[in]     refinement_ratio ratio between coarse and fine patch granularity
+     *                (currently, only a value of is implemented)
+     * @param[in]     number_of_components the number of components
+     */
+    AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+    InterpolateDensityFineToCoarse(
+        amrex::Array4<amrex::Real const> const fine,
+        amrex::Array4<amrex::Real      > const coarse,
+        int const refinement_ratio,
+        int const number_of_components
+    ) : m_fine(std::move(fine)),
+        m_coarse(std::move(coarse)),
+        m_refinement_ratio(std::move(refinement_ratio)),
+        m_number_of_components(std::move(number_of_components))
+    {
+        //! @note constants and stencils in operator() implementation assume 2x refinement
+        BL_ASSERT(refinement_ratio == 2);
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    void
+    operator()(
+        int const i,
+        int const j,
+        int const k
+    ) const noexcept
+    {
+        using namespace amrex;
+
+        auto const & fine_unsafe = m_fine;   // out-of-bounds access not secured with zero-values yet
+        auto const & coarse      = m_coarse; // out-of-bounds access not secured but will also not occur
+
+        // return zero for out-of-bounds accesses during interpolation
+        // this is efficiently used as a method to add neutral elements beyond guards in the average below
+        auto const fine = [fine_unsafe] AMREX_GPU_DEVICE (int const j, int const k, int const l, int const m) noexcept
+        {
+            return fine_unsafe.contains(j, k, l) ? fine_unsafe(j, k, l, m) :  amrex::Real{0.};
+        };
+
+        int const ii = i * m_refinement_ratio;
+        int const jj = j * m_refinement_ratio;
+        int const kk = k * m_refinement_ratio;
+        for( int m = 0; m < m_number_of_components; ++m ) {
+#if AMREX_SPACEDIM == 2
+            coarse(i,j,kk,m) = 0.25_rt * (
+                fine(ii,jj,kk,m) +
+                0.5_rt  * (
+                    fine(ii-1,jj  ,kk,m) + fine(ii+1,jj  ,kk,m) +
+                    fine(ii  ,jj-1,kk,m) + fine(ii  ,jj+1,kk,m)
+                ) +
+                0.25_rt * (
+                    fine(ii-1,jj-1,kk,m) + fine(ii+1,jj-1,kk,m) +
+                    fine(ii-1,jj+1,kk,m) + fine(ii+1,jj+1,kk,m)
+                )
+            );
+#elif AMREX_SPACEDIM == 3
+            coarse(i,j,k,m) = 0.125_rt * (
+                fine(ii,jj,kk,m) +
+                0.5_rt   * (
+                    fine(ii-1,jj  ,kk  ,m) + fine(ii+1,jj  ,kk  ,m) +
+                    fine(ii  ,jj-1,kk  ,m) + fine(ii  ,jj+1,kk  ,m) +
+                    fine(ii  ,jj  ,kk-1,m) + fine(ii  ,jj  ,kk+1,m)
+                ) +
+                0.25_rt  * (
+                    fine(ii-1,jj-1,kk  ,m) + fine(ii+1,jj-1,kk  ,m) +
+                    fine(ii-1,jj+1,kk  ,m) + fine(ii+1,jj+1,kk  ,m) +
+                    fine(ii-1,jj  ,kk-1,m) + fine(ii+1,jj  ,kk-1,m) +
+                    fine(ii-1,jj  ,kk+1,m) + fine(ii+1,jj  ,kk+1,m) +
+                    fine(ii  ,jj-1,kk-1,m) + fine(ii  ,jj+1,kk-1,m) +
+                    fine(ii  ,jj-1,kk+1,m) + fine(ii  ,jj+1,kk+1,m)
+                ) +
+                0.125_rt * (
+                    fine(ii-1,jj-1,kk-1,m) + fine(ii-1,jj-1,kk+1,m) +
+                    fine(ii-1,jj+1,kk-1,m) + fine(ii-1,jj+1,kk+1,m) +
+                    fine(ii+1,jj-1,kk-1,m) + fine(ii+1,jj-1,kk+1,m) +
+                    fine(ii+1,jj+1,kk-1,m) + fine(ii+1,jj+1,kk+1,m)
+                )
+            );
+#endif
+        }
+    }
+private:
+    amrex::Array4< amrex::Real const > m_fine;
+    amrex::Array4< amrex::Real       > m_coarse;
+    int m_refinement_ratio;
+    int m_number_of_components;
+};
+
+#endif // INTERPOLATE_DENSITY_FINE_TO_COARSE_H
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 4f870e79c..92f0b4f09 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -3,10 +3,12 @@
 #include <WarpX_f.H>
 #include <WarpXSumGuardCells.H>
 #include <Parallelization/InterpolateCurrentFineToCoarse.H>
+#include <Parallelization/InterpolateDensityFineToCoarse.H>
 
 #include <algorithm>
 #include <cstdlib>
 
+
 using namespace amrex;
 
 void
@@ -354,7 +356,7 @@ WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 >
                                         std::array< amrex::MultiFab       *, 3 > const & coarse,
                                         int const refinement_ratio)
 {
-    BL_PROFILE("InterpolateCurrentFineToCoarse()");
+    BL_PROFILE("interpolateCurrentFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
     const IntVect& ng = (fine[0]->nGrowVect() + 1) / refinement_ratio; // add equivalent no. of guards to coarse patch
 
@@ -386,6 +388,8 @@ WarpX::interpolateCurrentFineToCoarse ( std::array< amrex::MultiFab const *, 3 >
 void
 WarpX::SyncRho ()
 {
+    BL_PROFILE("SyncRho()");
+
     if (!rho_fp[0]) return;
     const int ncomp = rho_fp[0]->nComp();
 
@@ -395,7 +399,7 @@ WarpX::SyncRho ()
     {
         rho_cp[lev]->setVal(0.0);
         const IntVect& refinement_ratio = refRatio(lev-1);
-        SyncRho(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
+        interpolateDensityFineToCoarse(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
     }
 
     // For each level
@@ -408,29 +412,26 @@ WarpX::SyncRho ()
 }
 
 void
-WarpX::SyncRho (const MultiFab& fine, MultiFab& crse, int refinement_ratio)
+WarpX::interpolateDensityFineToCoarse (const MultiFab& fine, MultiFab& coarse, int const refinement_ratio)
 {
+    BL_PROFILE("interpolateDensityFineToCoarse()");
     BL_ASSERT(refinement_ratio == 2);
-    const IntVect& ng = (fine.nGrowVect()+1)/refinement_ratio;
+    const IntVect& ng = (fine.nGrowVect() + 1) / refinement_ratio;  // add equivalent no. of guards to coarse patch
     const int nc = fine.nComp();
 
 #ifdef _OPEMP
 #pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     {
-        FArrayBox ffab;
-        for (MFIter mfi(crse,true); mfi.isValid(); ++mfi)
+        // OMP in-box decomposition of coarse into tilebox
+        for (MFIter mfi(coarse, TilingIfNotGPU()); mfi.isValid(); ++mfi)
         {
-            const Box& bx = mfi.growntilebox(ng);
-            Box fbx = amrex::grow(amrex::refine(bx,refinement_ratio),1);
-            ffab.resize(fbx, nc);
-            fbx &= fine[mfi].box();
-            ffab.setVal(0.0);
-            ffab.copy(fine[mfi], fbx, 0, fbx, 0, nc);
-            WRPX_SYNC_RHO(bx.loVect(), bx.hiVect(),
-                          BL_TO_FORTRAN_ANYD(crse[mfi]),
-                          BL_TO_FORTRAN_ANYD(ffab),
-                          &nc);
+            const Box& bx = mfi.growntilebox(ng); // only grow to outer directions of tileboxes for filling guards
+
+            amrex::ParallelFor(
+                bx,
+                InterpolateDensityFineToCoarse(fine.const_array(mfi), coarse.array(mfi), refinement_ratio, nc)
+            );
         }
     }
 }
@@ -562,7 +563,7 @@ WarpX::RestrictRhoFromFineToCoarsePatch (int lev)
     if (rho_fp[lev]) {
         rho_cp[lev]->setVal(0.0);
         const IntVect& refinement_ratio = refRatio(lev-1);
-        SyncRho(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
+        interpolateDensityFineToCoarse(*rho_fp[lev], *rho_cp[lev], refinement_ratio[0]);
     }
 }
 
diff --git a/Source/WarpX.H b/Source/WarpX.H
index c59802427..0da1cf350 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -436,10 +436,12 @@ private:
      *  averaging the values of the charge density of the fine patch (on the same level).
      *
      * \param[in]  fine             fine patches to interpolate from
-     * \param[out] crse             coarse patches to interpolate to
-     * \param[in]  ref_ratio        integer ratio between the two
+     * \param[out] coarse           coarse patches to interpolate to
+     * \param[in]  refinement_ratio integer ratio between the two
      */
-    void SyncRho (const amrex::MultiFab& fine, amrex::MultiFab& crse, int ref_ratio);
+    void interpolateDensityFineToCoarse (const amrex::MultiFab& fine,
+                                               amrex::MultiFab& coarse,
+                                         int const refinement_ratio);
 
     void ExchangeWithPmlB (int lev);
     void ExchangeWithPmlE (int lev);
-- 
cgit v1.2.3


From 6b70e4abd40d70b48bf4a40cd1a7675a77c8a7ff Mon Sep 17 00:00:00 2001
From: Weiqun Zhang <weiqunzhang@lbl.gov>
Date: Fri, 4 Oct 2019 16:38:41 -0700
Subject: fix for cases with warpx.do_nodal=1

---
 Source/Parallelization/WarpXComm.cpp | 10 +++++-----
 Source/WarpX.H                       |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'Source/Parallelization/WarpXComm.cpp')

diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 6128b27a4..c5a8877d3 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -52,15 +52,15 @@ WarpX::UpdateAuxilaryData ()
 {
     BL_PROFILE("UpdateAuxilaryData()");
 
-    if (Bfield_aux[0][0]->is_nodal()) {
-        UpdateNodalAuxilaryData();
+    if (Bfield_aux[0][0]->ixType() == Bfield_fp[0][0]->ixType()) {
+        UpdateAuxilaryDataSameType();
     } else {
-        UpdateStagAuxilaryData();
+        UpdateAuxilaryDataStagToNodal();
     }
 }
 
 void
-WarpX::UpdateNodalAuxilaryData ()
+WarpX::UpdateAuxilaryDataStagToNodal ()
 {
     // For level 0, we only need to do the average.
 #ifdef _OPENMP
@@ -201,7 +201,7 @@ WarpX::UpdateNodalAuxilaryData ()
 }
 
 void
-WarpX::UpdateStagAuxilaryData ()
+WarpX::UpdateAuxilaryDataSameType ()
 {
     for (int lev = 1; lev <= finest_level; ++lev)
     {
diff --git a/Source/WarpX.H b/Source/WarpX.H
index 561e0533d..dd86dfc14 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -203,8 +203,8 @@ public:
     // This function does aux(lev) = fp(lev) + I(aux(lev-1)-cp(lev)).
     // Caller must make sure fp and cp have ghost cells filled.
     void UpdateAuxilaryData ();
-    void UpdateNodalAuxilaryData ();
-    void UpdateStagAuxilaryData ();
+    void UpdateAuxilaryDataStagToNodal ();
+    void UpdateAuxilaryDataSameType ();
 
     // Fill boundary cells including coarse/fine boundaries
     void FillBoundaryB ();
-- 
cgit v1.2.3