4 files changed, 139 insertions, 91 deletions
diff --git a/Source/BoundaryConditions/PML.H b/Source/BoundaryConditions/PML.H
index 7a86bf0da..25dfc7996 100644
--- a/Source/BoundaryConditions/PML.H
+++ b/Source/BoundaryConditions/PML.H
@@ -63,11 +63,11 @@ namespace amrex {
             delete fab;
         }
 #ifdef AMREX_USE_GPU
-        virtual SigmaBox* createHostAlias (SigmaBox const& src) const final
+        virtual SigmaBox* createDeviceAlias (SigmaBox const& src) const final
         {
             return const_cast<SigmaBox*>(&src);
         }
-        virtual void destroyHostAlias (SigmaBox* fab) const final {}
+        virtual void destroyDeviceAlias (SigmaBox* fab) const final {}
 #endif
         virtual FabFactory<SigmaBox>* clone () const {
             return new FabFactory<SigmaBox>(*this);
diff --git a/Source/Filter/BilinearFilter.H b/Source/Filter/BilinearFilter.H
index 8a1b87e97..9bade3c77 100644
--- a/Source/Filter/BilinearFilter.H
+++ b/Source/Filter/BilinearFilter.H
@@ -15,7 +15,11 @@ public:
     amrex::IntVect npass_each_dir;
     amrex::IntVect stencil_length_each_dir;
 
-    void Filter(const amrex::Box& tbx, amrex::FArrayBox const& tmpfab, amrex::FArrayBox &dstfab, int scomp, int dcomp, int ncomp);
+    // public for cuda
+    void Filter(const amrex::Box& tbx,
+                amrex::Array4<amrex::Real const> const& tmp,
+                amrex::Array4<amrex::Real      > const& dst,
+                int scomp, int dcomp, int ncomp);
 
 private:
 
diff --git a/Source/Filter/BilinearFilter.cpp b/Source/Filter/BilinearFilter.cpp
index 4017d3f73..f6acaa5df 100644
--- a/Source/Filter/BilinearFilter.cpp
+++ b/Source/Filter/BilinearFilter.cpp
@@ -70,60 +70,54 @@ void BilinearFilter::ComputeStencils(){
 }
 
 
+#ifdef AMREX_USE_CUDA
+
 void
 BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
 {
     BL_PROFILE("BilinearFilter::ApplyStencil()");
     ncomp = std::min(ncomp, srcmf.nComp());
-#ifdef _OPENMP
-#pragma omp parallel if (Gpu::notInLaunchRegion())
-#endif
+
+    for (MFIter mfi(dstmf); mfi.isValid(); ++mfi)
     {
-        FArrayBox tmpfab;
-        for (MFIter mfi(dstmf,TilingIfNotGPU()); mfi.isValid(); ++mfi){
-            const auto& srcfab = srcmf[mfi];
-            auto& dstfab = dstmf[mfi];
-            const Box& tbx = mfi.growntilebox();
-            const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
-            // tmpfab has enough ghost cells for the stencil
-            AsyncFab tmp_async_fab(tmpfab,gbx,ncomp);
-            FArrayBox* tmpfab_ptr = tmp_async_fab.fabPtr();
-            const FArrayBox* srcfab_ptr = srcmf.fabPtr(mfi);
-            // Copy values in srcfab into tmpfab
-            const Box& ibx = gbx & srcfab.box();
-            AMREX_LAUNCH_HOST_DEVICE_LAMBDA(gbx, tgbx,
-            {
-                tmpfab_ptr->setVal(0.0, tgbx, 0, ncomp);
-            });
+        const auto& src = srcmf.array(mfi);
+        const auto& dst = dstmf.array(mfi);
+        const Box& tbx = mfi.growntilebox();
+        const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
 
-            AMREX_LAUNCH_HOST_DEVICE_LAMBDA(ibx, tibx,
-            {
-                tmpfab_ptr->copy(*srcfab_ptr, tibx, scomp, tibx, 0, ncomp);
-            });
+        // tmpfab has enough ghost cells for the stencil
+        FArrayBox tmp_fab(gbx,ncomp);
+        Elixir tmp_eli = tmp_fab.elixir();  // Prevent the tmp data from being deleted too early
+        auto const& tmp = tmp_fab.array();
 
-            // Apply filter
-            Filter(tbx, tmp_async_fab.hostFab(), dstfab, 0, dcomp, ncomp);
-        }
+        // Copy values in srcfab into tmpfab
+        const Box& ibx = gbx & srcmf[mfi].box();
+        AMREX_PARALLEL_FOR_4D ( gbx, ncomp, i, j, k, n,
+        {
+            if (ibx.contains(IntVect(AMREX_D_DECL(i,j,k)))) {
+                tmp(i,j,k,n) = src(i,j,k,n+scomp);
+            } else {
+                tmp(i,j,k,n) = 0.0;
+            }
+        });
+
+        // Apply filter
+        Filter(tbx, tmp, dst, 0, dcomp, ncomp);
     }
 }
 
-void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox &dstfab,
+void BilinearFilter::Filter (const Box& tbx,
+                             Array4<Real const> const& tmp,
+                             Array4<Real      > const& dst,
                              int scomp, int dcomp, int ncomp)
 {
-    const auto lo = amrex::lbound(tbx);
-    const auto hi = amrex::ubound(tbx);
-    const auto tmp = tmpfab.array();
-    const auto dst = dstfab.array();
-    // tmp and dst are of type Array4 (Fortran ordering)
-    amrex::Real const* AMREX_RESTRICT sx = stencil_x.dataPtr();
-    amrex::Real const* AMREX_RESTRICT sy = stencil_y.dataPtr();
-    amrex::Real const* AMREX_RESTRICT sz = stencil_z.dataPtr();
-#ifdef AMREX_USE_CUDA
+    amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
+    amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
+    amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
     Dim3 slen_local = slen;
-    amrex::ParallelFor(tbx, ncomp,
-    [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+    AMREX_PARALLEL_FOR_4D ( tbx, ncomp, i, j, k, n,
     {
-        dst(i,j,k,dcomp+n) = 0.0;
+        Real d = 0.0;
 
         for         (int iz=0; iz < slen_local.z; ++iz){
             for     (int iy=0; iy < slen_local.y; ++iy){
@@ -134,25 +128,68 @@ void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox
                     Real sss = sx[ix]*sz[iy];
 #endif                        
 #if (AMREX_SPACEDIM == 3)
-                    dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
-                                              +tmp(i+ix,j-iy,k-iz,scomp+n)
-                                              +tmp(i-ix,j+iy,k-iz,scomp+n)
-                                              +tmp(i+ix,j+iy,k-iz,scomp+n)
-                                              +tmp(i-ix,j-iy,k+iz,scomp+n)
-                                              +tmp(i+ix,j-iy,k+iz,scomp+n)
-                                              +tmp(i-ix,j+iy,k+iz,scomp+n)
-                                              +tmp(i+ix,j+iy,k+iz,scomp+n));
+                    d += sss*( tmp(i-ix,j-iy,k-iz,scomp+n)
+                              +tmp(i+ix,j-iy,k-iz,scomp+n)
+                              +tmp(i-ix,j+iy,k-iz,scomp+n)
+                              +tmp(i+ix,j+iy,k-iz,scomp+n)
+                              +tmp(i-ix,j-iy,k+iz,scomp+n)
+                              +tmp(i+ix,j-iy,k+iz,scomp+n)
+                              +tmp(i-ix,j+iy,k+iz,scomp+n)
+                              +tmp(i+ix,j+iy,k+iz,scomp+n));
 #else
-                    dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
-                                              +tmp(i+ix,j-iy,k,scomp+n)
-                                              +tmp(i-ix,j+iy,k,scomp+n)
-                                              +tmp(i+ix,j+iy,k,scomp+n));
+                    d += sss*( tmp(i-ix,j-iy,k,scomp+n)
+                              +tmp(i+ix,j-iy,k,scomp+n)
+                              +tmp(i-ix,j+iy,k,scomp+n)
+                              +tmp(i+ix,j+iy,k,scomp+n));
 #endif
                 }
             }
         }
+
+        dst(i,j,k,dcomp+n) = d;
     });
-#else  // if not USE_CUDA
+}
+
+#else
+
+void
+BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
+{
+    BL_PROFILE("BilinearFilter::ApplyStencil()");
+    ncomp = std::min(ncomp, srcmf.nComp());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    {
+        FArrayBox tmpfab;
+        for (MFIter mfi(dstmf,true); mfi.isValid(); ++mfi){
+            const auto& srcfab = srcmf[mfi];
+            auto& dstfab = dstmf[mfi];
+            const Box& tbx = mfi.growntilebox();
+            const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
+            // tmpfab has enough ghost cells for the stencil
+            tmpfab.resize(gbx,ncomp);
+            tmpfab.setVal(0.0, gbx, 0, ncomp);
+            // Copy values in srcfab into tmpfab
+            const Box& ibx = gbx & srcfab.box();
+            tmpfab.copy(srcfab, ibx, scomp, ibx, 0, ncomp);
+            // Apply filter
+            Filter(tbx, tmpfab.array(), dstfab.array(), 0, dcomp, ncomp);
+        }
+    }
+}
+
+void BilinearFilter::Filter (const Box& tbx,
+                             Array4<Real const> const& tmp,
+                             Array4<Real      > const& dst,
+                             int scomp, int dcomp, int ncomp)
+{
+    const auto lo = amrex::lbound(tbx);
+    const auto hi = amrex::ubound(tbx);
+    // tmp and dst are of type Array4 (Fortran ordering)
+    amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
+    amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
+    amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
     for (int n = 0; n < ncomp; ++n) {
         // Set dst value to 0.
         for         (int k = lo.z; k <= hi.z; ++k) {
@@ -175,28 +212,29 @@ void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox
                     for         (int k = lo.z; k <= hi.z; ++k) {
                         for     (int j = lo.y; j <= hi.y; ++j) {
                             AMREX_PRAGMA_SIMD
-                                for (int i = lo.x; i <= hi.x; ++i) {
+                            for (int i = lo.x; i <= hi.x; ++i) {
 #if (AMREX_SPACEDIM == 3)
-                                    dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
-                                                               +tmp(i+ix,j-iy,k-iz,scomp+n)
-                                                               +tmp(i-ix,j+iy,k-iz,scomp+n)
-                                                               +tmp(i+ix,j+iy,k-iz,scomp+n)
-                                                               +tmp(i-ix,j-iy,k+iz,scomp+n)
-                                                               +tmp(i+ix,j-iy,k+iz,scomp+n)
-                                                               +tmp(i-ix,j+iy,k+iz,scomp+n)
-                                                               +tmp(i+ix,j+iy,k+iz,scomp+n));
+                                dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
+                                                          +tmp(i+ix,j-iy,k-iz,scomp+n)
+                                                          +tmp(i-ix,j+iy,k-iz,scomp+n)
+                                                          +tmp(i+ix,j+iy,k-iz,scomp+n)
+                                                          +tmp(i-ix,j-iy,k+iz,scomp+n)
+                                                          +tmp(i+ix,j-iy,k+iz,scomp+n)
+                                                          +tmp(i-ix,j+iy,k+iz,scomp+n)
+                                                          +tmp(i+ix,j+iy,k+iz,scomp+n));
 #else
-                                    dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
-                                                               +tmp(i+ix,j-iy,k,scomp+n)
-                                                               +tmp(i-ix,j+iy,k,scomp+n)
-                                                               +tmp(i+ix,j+iy,k,scomp+n));
+                                dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
+                                                          +tmp(i+ix,j-iy,k,scomp+n)
+                                                          +tmp(i-ix,j+iy,k,scomp+n)
+                                                          +tmp(i+ix,j+iy,k,scomp+n));
 #endif
-                                }
                             }
                         }
                     }
                 }
             }
         }
-#endif // USE_CUDA
+    }
 }
+
+#endif
diff --git a/Source/Utils/WarpXMovingWindow.cpp b/Source/Utils/WarpXMovingWindow.cpp
index 908c70573..05e171f22 100644
--- a/Source/Utils/WarpXMovingWindow.cpp
+++ b/Source/Utils/WarpXMovingWindow.cpp
@@ -189,7 +189,7 @@ WarpX::shiftMF (MultiFab& mf, const Geometry& geom, int num_shift, int dir)
 
     AMREX_ALWAYS_ASSERT(ng.min() >= num_shift);
 
-    MultiFab tmpmf(ba, dm, nc, ng);
+    MultiFab tmpmf(ba, dm, nc, ng, MFInfo().SetDeviceFab(false));
     MultiFab::Copy(tmpmf, mf, 0, 0, nc, ng);
     tmpmf.FillBoundary(geom.periodicity());
 
@@ -217,29 +217,35 @@ WarpX::shiftMF (MultiFab& mf, const Geometry& geom, int num_shift, int dir)
         }
     }
 
+    IntVect shiftiv(0);
+    shiftiv[dir] = num_shift;
+    Dim3 shift = shiftiv.dim3();
+
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
     for (MFIter mfi(tmpmf); mfi.isValid(); ++mfi )
     {
-      FArrayBox* dstfab = mf.fabPtr(mfi);
-      FArrayBox* srcfab = tmpmf.fabPtr(mfi);
-      Box outbox = mfi.fabbox();
-      outbox &= adjBox;
-      AMREX_LAUNCH_HOST_DEVICE_LAMBDA(outbox, toutbox,
-      {
-	srcfab->setVal(0.0, toutbox, 0, nc);
-      });
-      Box dstBox = mf[mfi].box();
-      if (num_shift > 0) {
-	dstBox.growHi(dir, -num_shift);
-      } else {
-	dstBox.growLo(dir,  num_shift);
-      }
-      AMREX_LAUNCH_HOST_DEVICE_LAMBDA(dstBox, tdstBox,
-      {
-        dstfab->setVal(0.0, tdstBox, 0, nc);
-        dstfab->copy(*srcfab, amrex::shift(tdstBox,dir,num_shift), 0, tdstBox, 0, nc);
-      });
+        auto const& dstfab = mf.array(mfi);
+        auto const& srcfab = tmpmf.array(mfi);
+
+        const Box& outbox = mfi.fabbox() & adjBox;
+        if (outbox.ok()) {
+            AMREX_PARALLEL_FOR_4D ( outbox, nc, i, j, k, n,
+            {
+                srcfab(i,j,k,n) = 0.0;
+            });
+        }
+
+        Box dstBox = mf[mfi].box();
+        if (num_shift > 0) {
+            dstBox.growHi(dir, -num_shift);
+        } else {
+            dstBox.growLo(dir,  num_shift);
+        }
+        AMREX_PARALLEL_FOR_4D ( dstBox, nc, i, j, k, n,
+        {
+            dstfab(i,j,k,n) = srcfab(i+shift.x,j+shift.y,k+shift.z,n);
+        });
     }
 }