aboutsummaryrefslogtreecommitdiff
path: root/Source/Filter/BilinearFilter.cpp
diff options
context:
space:
mode:
authorGravatar Weiqun Zhang <weiqunzhang@lbl.gov> 2019-04-17 15:27:39 -0700
committerGravatar Weiqun Zhang <weiqunzhang@lbl.gov> 2019-04-17 15:27:39 -0700
commit0a76cc206a162a12f4a9107a62da7b75727cc0e3 (patch)
tree6f82437b05d20f09fe1842fe27a0dc2d9ba057db /Source/Filter/BilinearFilter.cpp
parent1c06a41593cf99f91ed92db647013b648658ea91 (diff)
downloadWarpX-0a76cc206a162a12f4a9107a62da7b75727cc0e3.tar.gz
WarpX-0a76cc206a162a12f4a9107a62da7b75727cc0e3.tar.zst
WarpX-0a76cc206a162a12f4a9107a62da7b75727cc0e3.zip
split filter into cuda and cpu versions
Diffstat (limited to 'Source/Filter/BilinearFilter.cpp')
-rw-r--r--Source/Filter/BilinearFilter.cpp172
1 files changed, 105 insertions, 67 deletions
diff --git a/Source/Filter/BilinearFilter.cpp b/Source/Filter/BilinearFilter.cpp
index 4017d3f73..f6acaa5df 100644
--- a/Source/Filter/BilinearFilter.cpp
+++ b/Source/Filter/BilinearFilter.cpp
@@ -70,60 +70,54 @@ void BilinearFilter::ComputeStencils(){
}
+#ifdef AMREX_USE_CUDA
+
void
BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
{
BL_PROFILE("BilinearFilter::ApplyStencil()");
ncomp = std::min(ncomp, srcmf.nComp());
-#ifdef _OPENMP
-#pragma omp parallel if (Gpu::notInLaunchRegion())
-#endif
+
+ for (MFIter mfi(dstmf); mfi.isValid(); ++mfi)
{
- FArrayBox tmpfab;
- for (MFIter mfi(dstmf,TilingIfNotGPU()); mfi.isValid(); ++mfi){
- const auto& srcfab = srcmf[mfi];
- auto& dstfab = dstmf[mfi];
- const Box& tbx = mfi.growntilebox();
- const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
- // tmpfab has enough ghost cells for the stencil
- AsyncFab tmp_async_fab(tmpfab,gbx,ncomp);
- FArrayBox* tmpfab_ptr = tmp_async_fab.fabPtr();
- const FArrayBox* srcfab_ptr = srcmf.fabPtr(mfi);
- // Copy values in srcfab into tmpfab
- const Box& ibx = gbx & srcfab.box();
- AMREX_LAUNCH_HOST_DEVICE_LAMBDA(gbx, tgbx,
- {
- tmpfab_ptr->setVal(0.0, tgbx, 0, ncomp);
- });
+ const auto& src = srcmf.array(mfi);
+ const auto& dst = dstmf.array(mfi);
+ const Box& tbx = mfi.growntilebox();
+ const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
- AMREX_LAUNCH_HOST_DEVICE_LAMBDA(ibx, tibx,
- {
- tmpfab_ptr->copy(*srcfab_ptr, tibx, scomp, tibx, 0, ncomp);
- });
+ // tmpfab has enough ghost cells for the stencil
+ FArrayBox tmp_fab(gbx,ncomp);
+ Elixir tmp_eli = tmp_fab.elixir(); // Prevent the tmp data from being deleted too early
+ auto const& tmp = tmp_fab.array();
- // Apply filter
- Filter(tbx, tmp_async_fab.hostFab(), dstfab, 0, dcomp, ncomp);
- }
+ // Copy values in srcfab into tmpfab
+ const Box& ibx = gbx & srcmf[mfi].box();
+ AMREX_PARALLEL_FOR_4D ( gbx, ncomp, i, j, k, n,
+ {
+ if (ibx.contains(IntVect(AMREX_D_DECL(i,j,k)))) {
+ tmp(i,j,k,n) = src(i,j,k,n+scomp);
+ } else {
+ tmp(i,j,k,n) = 0.0;
+ }
+ });
+
+ // Apply filter
+ Filter(tbx, tmp, dst, 0, dcomp, ncomp);
}
}
-void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox &dstfab,
+void BilinearFilter::Filter (const Box& tbx,
+ Array4<Real const> const& tmp,
+ Array4<Real > const& dst,
int scomp, int dcomp, int ncomp)
{
- const auto lo = amrex::lbound(tbx);
- const auto hi = amrex::ubound(tbx);
- const auto tmp = tmpfab.array();
- const auto dst = dstfab.array();
- // tmp and dst are of type Array4 (Fortran ordering)
- amrex::Real const* AMREX_RESTRICT sx = stencil_x.dataPtr();
- amrex::Real const* AMREX_RESTRICT sy = stencil_y.dataPtr();
- amrex::Real const* AMREX_RESTRICT sz = stencil_z.dataPtr();
-#ifdef AMREX_USE_CUDA
+ amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
+ amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
+ amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
Dim3 slen_local = slen;
- amrex::ParallelFor(tbx, ncomp,
- [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept
+ AMREX_PARALLEL_FOR_4D ( tbx, ncomp, i, j, k, n,
{
- dst(i,j,k,dcomp+n) = 0.0;
+ Real d = 0.0;
for (int iz=0; iz < slen_local.z; ++iz){
for (int iy=0; iy < slen_local.y; ++iy){
@@ -134,25 +128,68 @@ void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox
Real sss = sx[ix]*sz[iy];
#endif
#if (AMREX_SPACEDIM == 3)
- dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
- +tmp(i+ix,j-iy,k-iz,scomp+n)
- +tmp(i-ix,j+iy,k-iz,scomp+n)
- +tmp(i+ix,j+iy,k-iz,scomp+n)
- +tmp(i-ix,j-iy,k+iz,scomp+n)
- +tmp(i+ix,j-iy,k+iz,scomp+n)
- +tmp(i-ix,j+iy,k+iz,scomp+n)
- +tmp(i+ix,j+iy,k+iz,scomp+n));
+ d += sss*( tmp(i-ix,j-iy,k-iz,scomp+n)
+ +tmp(i+ix,j-iy,k-iz,scomp+n)
+ +tmp(i-ix,j+iy,k-iz,scomp+n)
+ +tmp(i+ix,j+iy,k-iz,scomp+n)
+ +tmp(i-ix,j-iy,k+iz,scomp+n)
+ +tmp(i+ix,j-iy,k+iz,scomp+n)
+ +tmp(i-ix,j+iy,k+iz,scomp+n)
+ +tmp(i+ix,j+iy,k+iz,scomp+n));
#else
- dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
- +tmp(i+ix,j-iy,k,scomp+n)
- +tmp(i-ix,j+iy,k,scomp+n)
- +tmp(i+ix,j+iy,k,scomp+n));
+ d += sss*( tmp(i-ix,j-iy,k,scomp+n)
+ +tmp(i+ix,j-iy,k,scomp+n)
+ +tmp(i-ix,j+iy,k,scomp+n)
+ +tmp(i+ix,j+iy,k,scomp+n));
#endif
}
}
}
+
+ dst(i,j,k,dcomp+n) = d;
});
-#else // if not USE_CUDA
+}
+
+#else
+
+void
+BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
+{
+ BL_PROFILE("BilinearFilter::ApplyStencil()");
+ ncomp = std::min(ncomp, srcmf.nComp());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+ {
+ FArrayBox tmpfab;
+ for (MFIter mfi(dstmf,true); mfi.isValid(); ++mfi){
+ const auto& srcfab = srcmf[mfi];
+ auto& dstfab = dstmf[mfi];
+ const Box& tbx = mfi.growntilebox();
+ const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
+ // tmpfab has enough ghost cells for the stencil
+ tmpfab.resize(gbx,ncomp);
+ tmpfab.setVal(0.0, gbx, 0, ncomp);
+ // Copy values in srcfab into tmpfab
+ const Box& ibx = gbx & srcfab.box();
+ tmpfab.copy(srcfab, ibx, scomp, ibx, 0, ncomp);
+ // Apply filter
+ Filter(tbx, tmpfab.array(), dstfab.array(), 0, dcomp, ncomp);
+ }
+ }
+}
+
+void BilinearFilter::Filter (const Box& tbx,
+ Array4<Real const> const& tmp,
+ Array4<Real > const& dst,
+ int scomp, int dcomp, int ncomp)
+{
+ const auto lo = amrex::lbound(tbx);
+ const auto hi = amrex::ubound(tbx);
+ // tmp and dst are of type Array4 (Fortran ordering)
+ amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
+ amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
+ amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
for (int n = 0; n < ncomp; ++n) {
// Set dst value to 0.
for (int k = lo.z; k <= hi.z; ++k) {
@@ -175,28 +212,29 @@ void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox
for (int k = lo.z; k <= hi.z; ++k) {
for (int j = lo.y; j <= hi.y; ++j) {
AMREX_PRAGMA_SIMD
- for (int i = lo.x; i <= hi.x; ++i) {
+ for (int i = lo.x; i <= hi.x; ++i) {
#if (AMREX_SPACEDIM == 3)
- dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
- +tmp(i+ix,j-iy,k-iz,scomp+n)
- +tmp(i-ix,j+iy,k-iz,scomp+n)
- +tmp(i+ix,j+iy,k-iz,scomp+n)
- +tmp(i-ix,j-iy,k+iz,scomp+n)
- +tmp(i+ix,j-iy,k+iz,scomp+n)
- +tmp(i-ix,j+iy,k+iz,scomp+n)
- +tmp(i+ix,j+iy,k+iz,scomp+n));
+ dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
+ +tmp(i+ix,j-iy,k-iz,scomp+n)
+ +tmp(i-ix,j+iy,k-iz,scomp+n)
+ +tmp(i+ix,j+iy,k-iz,scomp+n)
+ +tmp(i-ix,j-iy,k+iz,scomp+n)
+ +tmp(i+ix,j-iy,k+iz,scomp+n)
+ +tmp(i-ix,j+iy,k+iz,scomp+n)
+ +tmp(i+ix,j+iy,k+iz,scomp+n));
#else
- dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
- +tmp(i+ix,j-iy,k,scomp+n)
- +tmp(i-ix,j+iy,k,scomp+n)
- +tmp(i+ix,j+iy,k,scomp+n));
+ dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
+ +tmp(i+ix,j-iy,k,scomp+n)
+ +tmp(i-ix,j+iy,k,scomp+n)
+ +tmp(i+ix,j+iy,k,scomp+n));
#endif
- }
}
}
}
}
}
}
-#endif // USE_CUDA
+ }
}
+
+#endif