diff options
-rw-r--r-- | Source/BoundaryConditions/PML.H | 4 | ||||
-rw-r--r-- | Source/Filter/BilinearFilter.H | 6 | ||||
-rw-r--r-- | Source/Filter/BilinearFilter.cpp | 172 | ||||
-rw-r--r-- | Source/Utils/WarpXMovingWindow.cpp | 48 |
4 files changed, 139 insertions, 91 deletions
diff --git a/Source/BoundaryConditions/PML.H b/Source/BoundaryConditions/PML.H index 7a86bf0da..25dfc7996 100644 --- a/Source/BoundaryConditions/PML.H +++ b/Source/BoundaryConditions/PML.H @@ -63,11 +63,11 @@ namespace amrex { delete fab; } #ifdef AMREX_USE_GPU - virtual SigmaBox* createHostAlias (SigmaBox const& src) const final + virtual SigmaBox* createDeviceAlias (SigmaBox const& src) const final { return const_cast<SigmaBox*>(&src); } - virtual void destroyHostAlias (SigmaBox* fab) const final {} + virtual void destroyDeviceAlias (SigmaBox* fab) const final {} #endif virtual FabFactory<SigmaBox>* clone () const { return new FabFactory<SigmaBox>(*this); diff --git a/Source/Filter/BilinearFilter.H b/Source/Filter/BilinearFilter.H index 8a1b87e97..9bade3c77 100644 --- a/Source/Filter/BilinearFilter.H +++ b/Source/Filter/BilinearFilter.H @@ -15,7 +15,11 @@ public: amrex::IntVect npass_each_dir; amrex::IntVect stencil_length_each_dir; - void Filter(const amrex::Box& tbx, amrex::FArrayBox const& tmpfab, amrex::FArrayBox &dstfab, int scomp, int dcomp, int ncomp); + // public for cuda + void Filter(const amrex::Box& tbx, + amrex::Array4<amrex::Real const> const& tmp, + amrex::Array4<amrex::Real > const& dst, + int scomp, int dcomp, int ncomp); private: diff --git a/Source/Filter/BilinearFilter.cpp b/Source/Filter/BilinearFilter.cpp index 4017d3f73..f6acaa5df 100644 --- a/Source/Filter/BilinearFilter.cpp +++ b/Source/Filter/BilinearFilter.cpp @@ -70,60 +70,54 @@ void BilinearFilter::ComputeStencils(){ } +#ifdef AMREX_USE_CUDA + void BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp) { BL_PROFILE("BilinearFilter::ApplyStencil()"); ncomp = std::min(ncomp, srcmf.nComp()); -#ifdef _OPENMP -#pragma omp parallel if (Gpu::notInLaunchRegion()) -#endif + + for (MFIter mfi(dstmf); mfi.isValid(); ++mfi) { - FArrayBox tmpfab; - for (MFIter mfi(dstmf,TilingIfNotGPU()); mfi.isValid(); ++mfi){ - const auto& srcfab = srcmf[mfi]; - auto& dstfab = dstmf[mfi]; - const Box& tbx = mfi.growntilebox(); - const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1); - // tmpfab has enough ghost cells for the stencil - AsyncFab tmp_async_fab(tmpfab,gbx,ncomp); - FArrayBox* tmpfab_ptr = tmp_async_fab.fabPtr(); - const FArrayBox* srcfab_ptr = srcmf.fabPtr(mfi); - // Copy values in srcfab into tmpfab - const Box& ibx = gbx & srcfab.box(); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA(gbx, tgbx, - { - tmpfab_ptr->setVal(0.0, tgbx, 0, ncomp); - }); + const auto& src = srcmf.array(mfi); + const auto& dst = dstmf.array(mfi); + const Box& tbx = mfi.growntilebox(); + const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1); - AMREX_LAUNCH_HOST_DEVICE_LAMBDA(ibx, tibx, - { - tmpfab_ptr->copy(*srcfab_ptr, tibx, scomp, tibx, 0, ncomp); - }); + // tmpfab has enough ghost cells for the stencil + FArrayBox tmp_fab(gbx,ncomp); + Elixir tmp_eli = tmp_fab.elixir(); // Prevent the tmp data from being deleted too early + auto const& tmp = tmp_fab.array(); - // Apply filter - Filter(tbx, tmp_async_fab.hostFab(), dstfab, 0, dcomp, ncomp); - } + // Copy values in srcfab into tmpfab + const Box& ibx = gbx & srcmf[mfi].box(); + AMREX_PARALLEL_FOR_4D ( gbx, ncomp, i, j, k, n, + { + if (ibx.contains(IntVect(AMREX_D_DECL(i,j,k)))) { + tmp(i,j,k,n) = src(i,j,k,n+scomp); + } else { + tmp(i,j,k,n) = 0.0; + } + }); + + // Apply filter + Filter(tbx, tmp, dst, 0, dcomp, ncomp); } } -void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox &dstfab, +void BilinearFilter::Filter (const Box& tbx, + Array4<Real const> const& tmp, + Array4<Real > const& dst, int scomp, int dcomp, int ncomp) { - const auto lo = amrex::lbound(tbx); - const auto hi = amrex::ubound(tbx); - const auto tmp = tmpfab.array(); - const auto dst = dstfab.array(); - // tmp and dst are of type Array4 (Fortran ordering) - amrex::Real const* AMREX_RESTRICT sx = stencil_x.dataPtr(); - amrex::Real const* AMREX_RESTRICT sy = stencil_y.dataPtr(); - amrex::Real const* AMREX_RESTRICT sz = stencil_z.dataPtr(); -#ifdef AMREX_USE_CUDA + amrex::Real const* AMREX_RESTRICT sx = stencil_x.data(); + amrex::Real const* AMREX_RESTRICT sy = stencil_y.data(); + amrex::Real const* AMREX_RESTRICT sz = stencil_z.data(); Dim3 slen_local = slen; - amrex::ParallelFor(tbx, ncomp, - [=] AMREX_GPU_DEVICE (int i, int j, int k, int n) noexcept + AMREX_PARALLEL_FOR_4D ( tbx, ncomp, i, j, k, n, { - dst(i,j,k,dcomp+n) = 0.0; + Real d = 0.0; for (int iz=0; iz < slen_local.z; ++iz){ for (int iy=0; iy < slen_local.y; ++iy){ @@ -134,25 +128,68 @@ void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox Real sss = sx[ix]*sz[iy]; #endif #if (AMREX_SPACEDIM == 3) - dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n) - +tmp(i+ix,j-iy,k-iz,scomp+n) - +tmp(i-ix,j+iy,k-iz,scomp+n) - +tmp(i+ix,j+iy,k-iz,scomp+n) - +tmp(i-ix,j-iy,k+iz,scomp+n) - +tmp(i+ix,j-iy,k+iz,scomp+n) - +tmp(i-ix,j+iy,k+iz,scomp+n) - +tmp(i+ix,j+iy,k+iz,scomp+n)); + d += sss*( tmp(i-ix,j-iy,k-iz,scomp+n) + +tmp(i+ix,j-iy,k-iz,scomp+n) + +tmp(i-ix,j+iy,k-iz,scomp+n) + +tmp(i+ix,j+iy,k-iz,scomp+n) + +tmp(i-ix,j-iy,k+iz,scomp+n) + +tmp(i+ix,j-iy,k+iz,scomp+n) + +tmp(i-ix,j+iy,k+iz,scomp+n) + +tmp(i+ix,j+iy,k+iz,scomp+n)); #else - dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n) - +tmp(i+ix,j-iy,k,scomp+n) - +tmp(i-ix,j+iy,k,scomp+n) - +tmp(i+ix,j+iy,k,scomp+n)); + d += sss*( tmp(i-ix,j-iy,k,scomp+n) + +tmp(i+ix,j-iy,k,scomp+n) + +tmp(i-ix,j+iy,k,scomp+n) + +tmp(i+ix,j+iy,k,scomp+n)); #endif } } } + + dst(i,j,k,dcomp+n) = d; }); -#else // if not USE_CUDA +} + +#else + +void +BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp) +{ + BL_PROFILE("BilinearFilter::ApplyStencil()"); + ncomp = std::min(ncomp, srcmf.nComp()); +#ifdef _OPENMP +#pragma omp parallel +#endif + { + FArrayBox tmpfab; + for (MFIter mfi(dstmf,true); mfi.isValid(); ++mfi){ + const auto& srcfab = srcmf[mfi]; + auto& dstfab = dstmf[mfi]; + const Box& tbx = mfi.growntilebox(); + const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1); + // tmpfab has enough ghost cells for the stencil + tmpfab.resize(gbx,ncomp); + tmpfab.setVal(0.0, gbx, 0, ncomp); + // Copy values in srcfab into tmpfab + const Box& ibx = gbx & srcfab.box(); + tmpfab.copy(srcfab, ibx, scomp, ibx, 0, ncomp); + // Apply filter + Filter(tbx, tmpfab.array(), dstfab.array(), 0, dcomp, ncomp); + } + } +} + +void BilinearFilter::Filter (const Box& tbx, + Array4<Real const> const& tmp, + Array4<Real > const& dst, + int scomp, int dcomp, int ncomp) +{ + const auto lo = amrex::lbound(tbx); + const auto hi = amrex::ubound(tbx); + // tmp and dst are of type Array4 (Fortran ordering) + amrex::Real const* AMREX_RESTRICT sx = stencil_x.data(); + amrex::Real const* AMREX_RESTRICT sy = stencil_y.data(); + amrex::Real const* AMREX_RESTRICT sz = stencil_z.data(); for (int n = 0; n < ncomp; ++n) { // Set dst value to 0. for (int k = lo.z; k <= hi.z; ++k) { @@ -175,28 +212,29 @@ void BilinearFilter::Filter (const Box& tbx, FArrayBox const& tmpfab, FArrayBox for (int k = lo.z; k <= hi.z; ++k) { for (int j = lo.y; j <= hi.y; ++j) { AMREX_PRAGMA_SIMD - for (int i = lo.x; i <= hi.x; ++i) { + for (int i = lo.x; i <= hi.x; ++i) { #if (AMREX_SPACEDIM == 3) - dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n) - +tmp(i+ix,j-iy,k-iz,scomp+n) - +tmp(i-ix,j+iy,k-iz,scomp+n) - +tmp(i+ix,j+iy,k-iz,scomp+n) - +tmp(i-ix,j-iy,k+iz,scomp+n) - +tmp(i+ix,j-iy,k+iz,scomp+n) - +tmp(i-ix,j+iy,k+iz,scomp+n) - +tmp(i+ix,j+iy,k+iz,scomp+n)); + dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n) + +tmp(i+ix,j-iy,k-iz,scomp+n) + +tmp(i-ix,j+iy,k-iz,scomp+n) + +tmp(i+ix,j+iy,k-iz,scomp+n) + +tmp(i-ix,j-iy,k+iz,scomp+n) + +tmp(i+ix,j-iy,k+iz,scomp+n) + +tmp(i-ix,j+iy,k+iz,scomp+n) + +tmp(i+ix,j+iy,k+iz,scomp+n)); #else - dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n) - +tmp(i+ix,j-iy,k,scomp+n) - +tmp(i-ix,j+iy,k,scomp+n) - +tmp(i+ix,j+iy,k,scomp+n)); + dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n) + +tmp(i+ix,j-iy,k,scomp+n) + +tmp(i-ix,j+iy,k,scomp+n) + +tmp(i+ix,j+iy,k,scomp+n)); #endif - } } } } } } } -#endif // USE_CUDA + } } + +#endif diff --git a/Source/Utils/WarpXMovingWindow.cpp b/Source/Utils/WarpXMovingWindow.cpp index 908c70573..05e171f22 100644 --- a/Source/Utils/WarpXMovingWindow.cpp +++ b/Source/Utils/WarpXMovingWindow.cpp @@ -189,7 +189,7 @@ WarpX::shiftMF (MultiFab& mf, const Geometry& geom, int num_shift, int dir) AMREX_ALWAYS_ASSERT(ng.min() >= num_shift); - MultiFab tmpmf(ba, dm, nc, ng); + MultiFab tmpmf(ba, dm, nc, ng, MFInfo().SetDeviceFab(false)); MultiFab::Copy(tmpmf, mf, 0, 0, nc, ng); tmpmf.FillBoundary(geom.periodicity()); @@ -217,29 +217,35 @@ WarpX::shiftMF (MultiFab& mf, const Geometry& geom, int num_shift, int dir) } } + IntVect shiftiv(0); + shiftiv[dir] = num_shift; + Dim3 shift = shiftiv.dim3(); + #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif for (MFIter mfi(tmpmf); mfi.isValid(); ++mfi ) { - FArrayBox* dstfab = mf.fabPtr(mfi); - FArrayBox* srcfab = tmpmf.fabPtr(mfi); - Box outbox = mfi.fabbox(); - outbox &= adjBox; - AMREX_LAUNCH_HOST_DEVICE_LAMBDA(outbox, toutbox, - { - srcfab->setVal(0.0, toutbox, 0, nc); - }); - Box dstBox = mf[mfi].box(); - if (num_shift > 0) { - dstBox.growHi(dir, -num_shift); - } else { - dstBox.growLo(dir, num_shift); - } - AMREX_LAUNCH_HOST_DEVICE_LAMBDA(dstBox, tdstBox, - { - dstfab->setVal(0.0, tdstBox, 0, nc); - dstfab->copy(*srcfab, amrex::shift(tdstBox,dir,num_shift), 0, tdstBox, 0, nc); - }); + auto const& dstfab = mf.array(mfi); + auto const& srcfab = tmpmf.array(mfi); + + const Box& outbox = mfi.fabbox() & adjBox; + if (outbox.ok()) { + AMREX_PARALLEL_FOR_4D ( outbox, nc, i, j, k, n, + { + srcfab(i,j,k,n) = 0.0; + }); + } + + Box dstBox = mf[mfi].box(); + if (num_shift > 0) { + dstBox.growHi(dir, -num_shift); + } else { + dstBox.growLo(dir, num_shift); + } + AMREX_PARALLEL_FOR_4D ( dstBox, nc, i, j, k, n, + { + dstfab(i,j,k,n) = srcfab(i+shift.x,j+shift.y,k+shift.z,n); + }); } } |