diff options
author | 2020-09-01 14:11:35 -0700 | |
---|---|---|
committer | 2020-09-01 14:11:35 -0700 | |
commit | 3c2eadfde21745eae87c7d13ebaacb1732a8619d (patch) | |
tree | 730bc2878800058a66fa308a67e4ca53c0715e71 /Source/FieldSolver/SpectralSolver | |
parent | 1652cfa796c9f258832254906ef4148a32768c2b (diff) | |
download | WarpX-3c2eadfde21745eae87c7d13ebaacb1732a8619d.tar.gz WarpX-3c2eadfde21745eae87c7d13ebaacb1732a8619d.tar.zst WarpX-3c2eadfde21745eae87c7d13ebaacb1732a8619d.zip |
Remove ManagedVector from spectral solver (#1270)
Diffstat (limited to 'Source/FieldSolver/SpectralSolver')
8 files changed, 131 insertions, 104 deletions
diff --git a/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.H b/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.H index 9d3ab70f5..e15cc38e8 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.H +++ b/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.H @@ -19,9 +19,9 @@ class SpectralBinomialFilter { public: - // `KFilterArray` holds a one 1D array ("ManagedVector") that + // `KFilterArray` holds a one 1D array ("DeviceVector") that // implements the filter. - using KFilterArray = amrex::Gpu::ManagedVector<amrex::Real>; + using KFilterArray = amrex::Gpu::DeviceVector<amrex::Real>; SpectralBinomialFilter () {}; void InitFilterArray (RealKVector const & kvec, diff --git a/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.cpp b/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.cpp index c4ecd0969..e0ac3e886 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.cpp +++ b/Source/FieldSolver/SpectralSolver/SpectralBinomialFilter.cpp @@ -20,19 +20,21 @@ SpectralBinomialFilter::InitFilterArray (HankelTransform::RealVector const & kve bool const compensation, KFilterArray & filter) { - - filter.resize(kvec.size()); - - for (int i=0 ; i < kvec.size() ; i++) { - amrex::Real const ss = std::sin(0.5_rt*kvec[i]*dels); + const int N = kvec.size(); + filter.resize(N); + amrex::Real* p_filter = filter.data(); + amrex::Real const* p_kvec = kvec.data(); + + amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (int i) noexcept + { + amrex::Real const ss = std::sin(0.5_rt*p_kvec[i]*dels); amrex::Real const ss2 = ss*ss; amrex::Real filt = std::pow(1._rt - ss2, npasses); if (compensation) { filt *= (1._rt + npasses*ss2); } - filter[i] = filt; - } - + p_filter[i] = filt; + }); } /* \brief Initialize the radial and longitudinal filter arrays */ diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/BesselRoots.H b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/BesselRoots.H index c7a816c61..8f569ec4e 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/BesselRoots.H +++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/BesselRoots.H @@ -51,7 +51,7 @@ void SecantRootFinder(int n, int nitmx, amrex::Real tol, amrex::Real *zeroj, int * abramowitz m. & stegun irene a. * handbook of mathematical functions */ -void GetBesselRoots(int n, int nk, HankelTransform::RealVector& roots, amrex::Vector<int> &ier) { +void GetBesselRoots(int n, int nk, amrex::Vector<amrex::Real>& roots, amrex::Vector<int> &ier) { amrex::Real zeroj; int ierror, ik, k; diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H index 46fb28447..5a87031c4 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H +++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H @@ -21,7 +21,7 @@ class HankelTransform { public: - using RealVector = amrex::Gpu::ManagedVector<amrex::Real>; + using RealVector = amrex::Gpu::DeviceVector<amrex::Real>; // Constructor HankelTransform(const int hankel_order, @@ -43,8 +43,8 @@ class HankelTransform RealVector m_kr; - RealVector invM; - RealVector M; + RealVector m_invM; + RealVector m_M; }; #endif diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp index c5249d54f..24e9d7f81 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp +++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp @@ -25,7 +25,7 @@ HankelTransform::HankelTransform (int const hankel_order, AMREX_ALWAYS_ASSERT_WITH_MESSAGE(hankel_order-1 <= azimuthal_mode && azimuthal_mode <= hankel_order+1, "azimuthal_mode must be either hankel_order-1, hankel_order or hankel_order+1"); - RealVector alphas; + amrex::Vector<amrex::Real> alphas; amrex::Vector<int> alpha_errors; GetBesselRoots(azimuthal_mode, m_nk, alphas, alpha_errors); @@ -33,13 +33,13 @@ HankelTransform::HankelTransform (int const hankel_order, AMREX_ALWAYS_ASSERT(std::all_of(alpha_errors.begin(), alpha_errors.end(), [](int i) { return i == 0; })); // Calculate the spectral grid - m_kr.resize(m_nk); + amrex::Vector<amrex::Real> kr(m_nk); for (int ik=0 ; ik < m_nk ; ik++) { - m_kr[ik] = alphas[ik]/rmax; + kr[ik] = alphas[ik]/rmax; } // Calculate the spatial grid (Uniform grid with a half-cell offset) - RealVector rmesh(m_nr); + amrex::Vector<amrex::Real> rmesh(m_nr); amrex::Real dr = rmax/m_nr; for (int ir=0 ; ir < m_nr ; ir++) { rmesh[ir] = dr*(ir + 0.5_rt); @@ -57,22 +57,22 @@ HankelTransform::HankelTransform (int const hankel_order, p_denom = hankel_order; } - RealVector denom(m_nk); + amrex::Vector<amrex::Real> denom(m_nk); for (int ik=0 ; ik < m_nk ; ik++) { const amrex::Real jna = jn(p_denom, alphas[ik]); denom[ik] = MathConst::pi*rmax*rmax*jna*jna; } - RealVector num(m_nk*m_nr); + amrex::Vector<amrex::Real> num(m_nk*m_nr); for (int ir=0 ; ir < m_nr ; ir++) { for (int ik=0 ; ik < m_nk ; ik++) { int const ii = ik + ir*m_nk; - num[ii] = jn(hankel_order, rmesh[ir]*m_kr[ik]); + num[ii] = jn(hankel_order, rmesh[ir]*kr[ik]); } } // Get the inverse matrix - invM.resize(m_nk*m_nr); + amrex::Vector<amrex::Real> invM(m_nk*m_nr); if (azimuthal_mode > 0) { for (int ir=0 ; ir < m_nr ; ir++) { for (int ik=1 ; ik < m_nk ; ik++) { @@ -107,18 +107,20 @@ HankelTransform::HankelTransform (int const hankel_order, } } + amrex::Vector<amrex::Real> M; + // Calculate the matrix M by inverting invM if (azimuthal_mode !=0 && hankel_order != azimuthal_mode-1) { // In this case, invM is singular, thus we calculate the pseudo-inverse. // The Moore-Penrose psuedo-inverse is calculated using the SVD method. M.resize(m_nk*m_nr, 0.); - RealVector invMcopy(invM); - RealVector sdiag(m_nk-1, 0.); - RealVector u((m_nk-1)*(m_nk-1), 0.); - RealVector vt((m_nr)*(m_nr), 0.); - RealVector sp((m_nr)*(m_nk-1), 0.); - RealVector temp((m_nr)*(m_nk-1), 0.); + amrex::Vector<amrex::Real> invMcopy(invM); + amrex::Vector<amrex::Real> sdiag(m_nk-1, 0.); + amrex::Vector<amrex::Real> u((m_nk-1)*(m_nk-1), 0.); + amrex::Vector<amrex::Real> vt((m_nr)*(m_nr), 0.); + amrex::Vector<amrex::Real> sp((m_nr)*(m_nk-1), 0.); + amrex::Vector<amrex::Real> temp((m_nr)*(m_nk-1), 0.); // Calculate the singlular-value-decomposition of invM (leaving out the first row). // invM = u*sdiag*vt @@ -169,6 +171,13 @@ HankelTransform::HankelTransform (int const hankel_order, } + m_kr.resize(kr.size()); + m_invM.resize(invM.size()); + m_M.resize(M.size()); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, kr.begin(), kr.end(), m_kr.begin()); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, invM.begin(), invM.end(), m_invM.begin()); + amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, M.begin(), M.end(), m_M.begin()); + amrex::Gpu::synchronize(); } void @@ -193,7 +202,7 @@ HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_ // Note that M is flagged to be transposed since it has dimensions (m_nr, m_nk) blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans, m_nk, nz, m_nr, 1._rt, - M.dataPtr(), m_nk, + m_M.dataPtr(), m_nk, F.dataPtr(F_icomp)+ngr, nrF, 0._rt, G.dataPtr(G_icomp), m_nk); @@ -202,13 +211,13 @@ HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_ // It is not clear if the GPU gemm wasn't build properly, it is cycling data out and back // in to the device, or if it is because gemm is launching its own threads. - amrex::Real const * M_arr = M.dataPtr(); + amrex::Real const * M_arr = m_M.dataPtr(); amrex::Array4<const amrex::Real> const & F_arr = F.array(); amrex::Array4< amrex::Real> const & G_arr = G.array(); int const nr = m_nr; - ParallelFor(G_box, + amrex::ParallelFor(G_box, [=] AMREX_GPU_DEVICE(int ik, int iz, int inotused) noexcept { G_arr(ik,iz,G_icomp) = 0.; for (int ir=0 ; ir < nr ; ir++) { @@ -240,10 +249,10 @@ HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_ #ifndef AMREX_USE_GPU // On CPU, the blas::gemm is significantly faster - // Note that invM is flagged to be transposed since it has dimensions (m_nk, m_nr) + // Note that m_invM is flagged to be transposed since it has dimensions (m_nk, m_nr) blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans, m_nr, nz, m_nk, 1._rt, - invM.dataPtr(), m_nr, + m_invM.dataPtr(), m_nr, G.dataPtr(G_icomp), m_nk, 0._rt, F.dataPtr(F_icomp)+ngr, nrF); @@ -252,13 +261,13 @@ HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_ // It is not clear if the GPU gemm wasn't build properly, it is cycling data out and back // in to the device, or if it is because gemm is launching its own threads. - amrex::Real const * invM_arr = invM.dataPtr(); + amrex::Real const * invM_arr = m_invM.dataPtr(); amrex::Array4<const amrex::Real> const & G_arr = G.array(); amrex::Array4< amrex::Real> const & F_arr = F.array(); int const nk = m_nk; - ParallelFor(G_box, + amrex::ParallelFor(G_box, [=] AMREX_GPU_DEVICE(int ir, int iz, int inotused) noexcept { F_arr(ir,iz,F_icomp) = 0.; for (int ik=0 ; ik < nk ; ik++) { diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/SpectralHankelTransformer.cpp b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/SpectralHankelTransformer.cpp index 7dd297418..518eea03e 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/SpectralHankelTransformer.cpp +++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/SpectralHankelTransformer.cpp @@ -49,6 +49,7 @@ SpectralHankelTransformer::ExtractKrArray () kr_array[ii] = kr_m_array[ir]; }); } + amrex::Gpu::synchronize(); } /* \brief Converts a scalar field from the physical to the spectral space for all modes */ diff --git a/Source/FieldSolver/SpectralSolver/SpectralKSpace.H b/Source/FieldSolver/SpectralSolver/SpectralKSpace.H index 01bbb4b35..5816a3477 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralKSpace.H +++ b/Source/FieldSolver/SpectralSolver/SpectralKSpace.H @@ -15,12 +15,12 @@ // `KVectorComponent` and `SpectralShiftFactor` hold one 1D array -// ("ManagedVector") for each box ("LayoutData"). The arrays are +// ("DeviceVector") for each box ("LayoutData"). The arrays are // only allocated if the corresponding box is owned by the local MPI rank. -using RealKVector = amrex::Gpu::ManagedVector<amrex::Real>; +using RealKVector = amrex::Gpu::DeviceVector<amrex::Real>; using KVectorComponent = amrex::LayoutData< RealKVector >; using SpectralShiftFactor = amrex::LayoutData< - amrex::Gpu::ManagedVector<Complex> >; + amrex::Gpu::DeviceVector<Complex> >; // Indicate the type of correction "shift" factor to apply // when the FFT is performed from/to a cell-centered grid in real space. diff --git a/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp b/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp index 2c60cdc38..dd12fb2b8 100644 --- a/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp +++ b/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp @@ -75,17 +75,18 @@ SpectralKSpace::getKComponent( const DistributionMapping& dm, const int i_dim, const bool only_positive_k ) const { - // Initialize an empty ManagedVector in each box + // Initialize an empty DeviceVector in each box KVectorComponent k_comp(spectralspace_ba, dm); - // Loop over boxes and allocate the corresponding ManagedVector + // Loop over boxes and allocate the corresponding DeviceVector // for each box owned by the local MPI proc for ( MFIter mfi(spectralspace_ba, dm); mfi.isValid(); ++mfi ){ Box bx = spectralspace_ba[mfi]; - ManagedVector<Real>& k = k_comp[mfi]; + DeviceVector<Real>& k = k_comp[mfi]; // Allocate k to the right size int N = bx.length( i_dim ); k.resize( N ); + Real* pk = k.data(); // Fill the k vector IntVect fft_size = realspace_ba[mfi].length(); @@ -97,21 +98,24 @@ SpectralKSpace::getKComponent( const DistributionMapping& dm, if (only_positive_k){ // Fill the full axis with positive k values // (typically: first axis, in a real-to-complex FFT) - for (int i=0; i<N; i++ ){ - k[i] = i*dk; - } + amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (int i) noexcept + { + pk[i] = i*dk; + }); } else { const int mid_point = (N+1)/2; - // Fill positive values of k - // (FFT conventions: first half is positive) - for (int i=0; i<mid_point; i++ ){ - k[i] = i*dk; - } - // Fill negative values of k - // (FFT conventions: second half is negative) - for (int i=mid_point; i<N; i++){ - k[i] = (i-N)*dk; - } + amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (int i) noexcept + { + if (i < mid_point) { + // Fill positive values of k + // (FFT conventions: first half is positive) + pk[i] = i*dk; + } else { + // Fill negative values of k + // (FFT conventions: second half is negative) + pk[i] = (i-N)*dk; + } + }); } } return k_comp; @@ -131,16 +135,19 @@ SpectralKSpace::getSpectralShiftFactor( const DistributionMapping& dm, const int i_dim, const int shift_type ) const { - // Initialize an empty ManagedVector in each box + // Initialize an empty DeviceVector in each box SpectralShiftFactor shift_factor( spectralspace_ba, dm ); - // Loop over boxes and allocate the corresponding ManagedVector + // Loop over boxes and allocate the corresponding DeviceVector // for each box owned by the local MPI proc for ( MFIter mfi(spectralspace_ba, dm); mfi.isValid(); ++mfi ){ - const ManagedVector<Real>& k = k_vec[i_dim][mfi]; - ManagedVector<Complex>& shift = shift_factor[mfi]; + const DeviceVector<Real>& k = k_vec[i_dim][mfi]; + DeviceVector<Complex>& shift = shift_factor[mfi]; // Allocate shift coefficients - shift.resize( k.size() ); + const int N = k.size(); + shift.resize(N); + Real const* pk = k.data(); + Complex* pshift = shift.data(); // Fill the shift coefficients Real sign = 0; @@ -149,11 +156,10 @@ SpectralKSpace::getSpectralShiftFactor( const DistributionMapping& dm, case ShiftType::TransformToCellCentered: sign = 1.; } const Complex I{0,1}; - int i = 0; - for (auto const& kv : k){ - shift[i] = exp( I*sign*kv*0.5_rt*dx[i_dim]); - i++; - } + amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (int i) noexcept + { + pshift[i] = amrex::exp( I*sign*pk[i]*0.5_rt*dx[i_dim]); + }); } return shift_factor; } @@ -177,72 +183,81 @@ SpectralKSpace::getModifiedKComponent( const DistributionMapping& dm, const int n_order, const bool nodal ) const { - // Initialize an empty ManagedVector in each box + // Initialize an empty DeviceVector in each box KVectorComponent modified_k_comp(spectralspace_ba, dm); if (n_order == -1) { // Infinite-order case for ( MFIter mfi(spectralspace_ba, dm); mfi.isValid(); ++mfi ){ - const ManagedVector<Real>& k = k_vec[i_dim][mfi]; - ManagedVector<Real>& modified_k = modified_k_comp[mfi]; + const DeviceVector<Real>& k = k_vec[i_dim][mfi]; + DeviceVector<Real>& modified_k = modified_k_comp[mfi]; // Allocate modified_k to the same size as k + const int N = k.size(); modified_k.resize( k.size() ); // Fill the modified k vector - int i = 0; - for (auto const& kv : k){ - modified_k[i] = k[i]; // infinite-order case. - i++; - } + Gpu::copyAsync(Gpu::deviceToDevice, k.begin(), k.end(), modified_k.begin()); } } else { // Compute real-space stencil coefficients - Vector<Real> stencil_coef = getFonbergStencilCoefficients(n_order, nodal); + Vector<Real> h_stencil_coef = getFonbergStencilCoefficients(n_order, nodal); + DeviceVector<Real> d_stencil_coef(h_stencil_coef.size()); + Gpu::copyAsync(Gpu::hostToDevice, h_stencil_coef.begin(), h_stencil_coef.end(), + d_stencil_coef.begin()); + Gpu::synchronize(); + const int nstencil = d_stencil_coef.size(); + Real const* p_stencil_coef = d_stencil_coef.data(); - // Loop over boxes and allocate the corresponding ManagedVector + // Loop over boxes and allocate the corresponding DeviceVector // for each box owned by the local MPI proc for ( MFIter mfi(spectralspace_ba, dm); mfi.isValid(); ++mfi ){ Real delta_x = dx[i_dim]; - const ManagedVector<Real>& k = k_vec[i_dim][mfi]; - ManagedVector<Real>& modified_k = modified_k_comp[mfi]; + const DeviceVector<Real>& k = k_vec[i_dim][mfi]; + DeviceVector<Real>& modified_k = modified_k_comp[mfi]; // Allocate modified_k to the same size as k - modified_k.resize( k.size() ); + const int N = k.size(); + modified_k.resize(N); + Real const* p_k = k.data(); + Real * p_modified_k = modified_k.data(); // Fill the modified k vector - int i = 0; - for (auto const& kv : k){ - modified_k[i] = 0; - for (int n=1; n<stencil_coef.size(); n++){ + amrex::ParallelFor(N, [=] AMREX_GPU_DEVICE (int i) noexcept + { + p_modified_k[i] = 0; + for (int n=1; n<nstencil; n++){ if (nodal){ - modified_k[i] += stencil_coef[n]* \ - std::sin( k[i]*n*delta_x )/( n*delta_x ); + p_modified_k[i] += p_stencil_coef[n]* + std::sin( p_k[i]*n*delta_x )/( n*delta_x ); } else { - modified_k[i] += stencil_coef[n]* \ - std::sin( k[i]*(n-0.5)*delta_x )/( (n-0.5)*delta_x ); + p_modified_k[i] += p_stencil_coef[n]* \ + std::sin( p_k[i]*(n-0.5)*delta_x )/( (n-0.5)*delta_x ); } } - i++; - } - // By construction, at finite order and for a nodal grid, - // the *modified* k corresponding to the Nyquist frequency - // (i.e. highest *real* k) is 0. However, the above calculation - // based on stencil coefficients does not give 0 to machine precision. - // Therefore, we need to enforce the fact that the modified k be 0 here. - if (nodal){ - if (i_dim == 0){ - // Because of the real-to-complex FFTs, the first axis (idim=0) - // contains only the positive k, and the Nyquist frequency is - // the last element of the array. - modified_k[k.size()-1] = 0.0_rt; - } else { - // The other axes contains both positive and negative k ; - // the Nyquist frequency is in the middle of the array. - modified_k[k.size()/2] = 0.0_rt; - } - } + // By construction, at finite order and for a nodal grid, + // the *modified* k corresponding to the Nyquist frequency + // (i.e. highest *real* k) is 0. However, the above calculation + // based on stencil coefficients does not give 0 to machine precision. + // Therefore, we need to enforce the fact that the modified k be 0 here. + if (nodal){ + if (i_dim == 0){ + // Because of the real-to-complex FFTs, the first axis (idim=0) + // contains only the positive k, and the Nyquist frequency is + // the last element of the array. + if (i == N-1) { + p_modified_k[i] = 0.0_rt; + } + } else { + // The other axes contains both positive and negative k ; + // the Nyquist frequency is in the middle of the array. + if (i == N/2) { + p_modified_k[i] = 0.0_rt; + } + } + } + }); } } return modified_k_comp; |