Diffstat (limited to 'Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp')
-rw-r--r-- | Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp | 272 |
1 file changed, 272 insertions, 0 deletions
diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
new file mode 100644
index 000000000..c5249d54f
--- /dev/null
+++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
@@ -0,0 +1,272 @@
+/* Copyright 2019 David Grote
+ *
+ * This file is part of WarpX.
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+#include "WarpX.H"
+
+#include "HankelTransform.H"
+#include "BesselRoots.H"
+
+#include <blas.hh>
+#include <lapack.hh>
+
+using amrex::operator""_rt;
+
+HankelTransform::HankelTransform (int const hankel_order,
+                                  int const azimuthal_mode,
+                                  int const nr,
+                                  const amrex::Real rmax)
+: m_nr(nr), m_nk(nr)
+{
+
+    // Check that azimuthal_mode has a valid value
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(hankel_order-1 <= azimuthal_mode && azimuthal_mode <= hankel_order+1,
+        "azimuthal_mode must be either hankel_order-1, hankel_order or hankel_order+1");
+
+    RealVector alphas;
+    amrex::Vector<int> alpha_errors;
+
+    GetBesselRoots(azimuthal_mode, m_nk, alphas, alpha_errors);
+    // Check alpha_errors; all entries should be zero
+    AMREX_ALWAYS_ASSERT(std::all_of(alpha_errors.begin(), alpha_errors.end(), [](int i) { return i == 0; }));
+
+    // Calculate the spectral grid
+    m_kr.resize(m_nk);
+    for (int ik=0 ; ik < m_nk ; ik++) {
+        m_kr[ik] = alphas[ik]/rmax;
+    }
+
+    // Calculate the spatial grid (uniform grid with a half-cell offset)
+    RealVector rmesh(m_nr);
+    amrex::Real dr = rmax/m_nr;
+    for (int ir=0 ; ir < m_nr ; ir++) {
+        rmesh[ir] = dr*(ir + 0.5_rt);
+    }
+
+    // Calculate and store the inverse matrix invM
+    // (imposed by the constraints on the DHT of Bessel modes)
+    // NB: When compared with the FBPIC article, all the matrices here
+    // are calculated in transposed form. This is done so as to use the
+    // `dot` and `gemm` functions in the transform methods.
+    int p_denom;
+    if (hankel_order == azimuthal_mode) {
+        p_denom = hankel_order + 1;
+    } else {
+        p_denom = hankel_order;
+    }
+
+    RealVector denom(m_nk);
+    for (int ik=0 ; ik < m_nk ; ik++) {
+        const amrex::Real jna = jn(p_denom, alphas[ik]);
+        denom[ik] = MathConst::pi*rmax*rmax*jna*jna;
+    }
+
+    RealVector num(m_nk*m_nr);
+    for (int ir=0 ; ir < m_nr ; ir++) {
+        for (int ik=0 ; ik < m_nk ; ik++) {
+            int const ii = ik + ir*m_nk;
+            num[ii] = jn(hankel_order, rmesh[ir]*m_kr[ik]);
+        }
+    }
+
+    // Get the inverse matrix
+    invM.resize(m_nk*m_nr);
+    if (azimuthal_mode > 0) {
+        for (int ir=0 ; ir < m_nr ; ir++) {
+            for (int ik=1 ; ik < m_nk ; ik++) {
+                int const ii = ik + ir*m_nk;
+                invM[ii] = num[ii]/denom[ik];
+            }
+        }
+        // ik = 0
+        // In this case, the functions are represented by Bessel functions
+        // *and* an additional mode (below) which satisfies the same
+        // algebraic relations for curl/div/grad as the regular Bessel modes,
+        // with the value kperp=0.
+        // The normalization of this mode is arbitrary, and is chosen
+        // so that the condition number of invM is close to 1
+        if (hankel_order == azimuthal_mode-1) {
+            for (int ir=0 ; ir < m_nr ; ir++) {
+                int const ii = ir*m_nk;
+                invM[ii] = std::pow(rmesh[ir], (azimuthal_mode-1))/(MathConst::pi*std::pow(rmax, (azimuthal_mode+1)));
+            }
+        } else {
+            for (int ir=0 ; ir < m_nr ; ir++) {
+                int const ii = ir*m_nk;
+                invM[ii] = 0.;
+            }
+        }
+    } else {
+        for (int ir=0 ; ir < m_nr ; ir++) {
+            for (int ik=0 ; ik < m_nk ; ik++) {
+                int const ii = ik + ir*m_nk;
+                invM[ii] = num[ii]/denom[ik];
+            }
+        }
+    }
+
+    // Calculate the matrix M by inverting invM
+    if (azimuthal_mode != 0 && hankel_order != azimuthal_mode-1) {
+        // In this case, invM is singular, so the pseudo-inverse is calculated.
+        // The Moore-Penrose pseudo-inverse is calculated using the SVD method.
+
+        M.resize(m_nk*m_nr, 0.);
+        RealVector invMcopy(invM);
+        RealVector sdiag(m_nk-1, 0.);
+        RealVector u((m_nk-1)*(m_nk-1), 0.);
+        RealVector vt((m_nr)*(m_nr), 0.);
+        RealVector sp((m_nr)*(m_nk-1), 0.);
+        RealVector temp((m_nr)*(m_nk-1), 0.);
+
+        // Calculate the singular value decomposition of invM (leaving out the first row):
+        // invM = u*sdiag*vt
+        // Note that invMcopy.dataPtr()+1 is passed in so that the first ik row is skipped.
+        // A copy is passed in since the matrix is destroyed.
+        lapack::gesvd(lapack::Job::AllVec, lapack::Job::AllVec,
+                      m_nk-1, m_nr, invMcopy.dataPtr()+1, m_nk,
+                      sdiag.dataPtr(), u.dataPtr(), m_nk-1,
+                      vt.dataPtr(), m_nr);
+
+        // Calculate the pseudo-inverse of sdiag, which is trivial since it
+        // only has values on the diagonal.
+        for (int i=0 ; i < m_nk-1 ; i++) {
+            if (sdiag[i] != 0.) {
+                int const j = i + i*m_nk;
+                sp[j] = 1._rt/sdiag[i];
+            }
+        }
+
+        // M can be calculated from the SVD:
+        // M = v*sp*u_transpose
+
+        // Do the second multiply, temp = sp*u_transpose
+        // temp(1:n,1:m) = matmul(transpose(vt(1:n,1:n)), matmul(sp(1:n,1:m), transpose(u(1:m,1:m))))
+        blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans,
+                   m_nr, m_nk-1, m_nk-1, 1._rt,
+                   sp.dataPtr(), m_nr,
+                   u.dataPtr(), m_nk-1, 0._rt,
+                   temp.dataPtr(), m_nr);
+
+        // Do the first multiply, M = vt*temp
+        // Note that M.dataPtr()+m_nr is passed in so that the first ir column is skipped.
+        blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+                   m_nr, m_nk-1, m_nr, 1.,
+                   vt.dataPtr(), m_nr,
+                   temp.dataPtr(), m_nr, 0._rt,
+                   M.dataPtr()+m_nr, m_nr);
+
+    } else {
+        // In this case, invM is invertible; calculate the inverse.
+        // getrf calculates the LU decomposition
+        // getri calculates the inverse from the LU decomposition
+
+        M = invM;
+        amrex::Vector<int64_t> ipiv(m_nr);
+        lapack::getrf(m_nk, m_nr, M.dataPtr(), m_nk, ipiv.dataPtr());
+        lapack::getri(m_nr, M.dataPtr(), m_nr, ipiv.dataPtr());
+
+    }
+
+}
+
+void
+HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_icomp,
+                                         amrex::FArrayBox & G, int const G_icomp)
+{
+    amrex::Box const& F_box = F.box();
+    amrex::Box const& G_box = G.box();
+
+    int const nrF = F_box.length(0);
+    int const nz = F_box.length(1);
+    int const ngr = G_box.smallEnd(0) - F_box.smallEnd(0);
+
+    AMREX_ALWAYS_ASSERT(m_nr == G_box.length(0));
+    AMREX_ALWAYS_ASSERT(nz == G_box.length(1));
+    AMREX_ALWAYS_ASSERT(ngr >= 0);
+    AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
+
+#ifndef AMREX_USE_GPU
+    // On CPU, the blas::gemm is significantly faster.
+
+    // Note that M is flagged to be transposed since it has dimensions (m_nr, m_nk).
+    blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+               m_nk, nz, m_nr, 1._rt,
+               M.dataPtr(), m_nk,
+               F.dataPtr(F_icomp)+ngr, nrF, 0._rt,
+               G.dataPtr(G_icomp), m_nk);
+
+#else
+    // On GPU, the explicit loop is significantly faster.
+    // It is not clear whether the GPU gemm was not built properly, whether it is
+    // cycling data out of and back into the device, or whether gemm is launching
+    // its own threads.
+
+    amrex::Real const * M_arr = M.dataPtr();
+    amrex::Array4<const amrex::Real> const & F_arr = F.array();
+    amrex::Array4<      amrex::Real> const & G_arr = G.array();
+
+    int const nr = m_nr;
+
+    ParallelFor(G_box,
+    [=] AMREX_GPU_DEVICE(int ik, int iz, int inotused) noexcept {
+        G_arr(ik,iz,G_icomp) = 0.;
+        for (int ir=0 ; ir < nr ; ir++) {
+            int const ii = ir + ik*nr;
+            G_arr(ik,iz,G_icomp) += M_arr[ii]*F_arr(ir,iz,F_icomp);
+        }
+    });
+
+#endif
+
+}
+
+void
+HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_icomp,
+                                         amrex::FArrayBox & F, int const F_icomp)
+{
+    amrex::Box const& G_box = G.box();
+    amrex::Box const& F_box = F.box();
+
+    int const nrF = F_box.length(0);
+    int const nz = F_box.length(1);
+    int const ngr = G_box.smallEnd(0) - F_box.smallEnd(0);
+
+    AMREX_ALWAYS_ASSERT(m_nr == G_box.length(0));
+    AMREX_ALWAYS_ASSERT(nz == G_box.length(1));
+    AMREX_ALWAYS_ASSERT(ngr >= 0);
+    AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
+
+#ifndef AMREX_USE_GPU
+    // On CPU, the blas::gemm is significantly faster.
+
+    // Note that invM is flagged to be transposed since it has dimensions (m_nk, m_nr).
+    blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+               m_nr, nz, m_nk, 1._rt,
+               invM.dataPtr(), m_nr,
+               G.dataPtr(G_icomp), m_nk, 0._rt,
+               F.dataPtr(F_icomp)+ngr, nrF);
+
+#else
+    // On GPU, the explicit loop is significantly faster.
+    // It is not clear whether the GPU gemm was not built properly, whether it is
+    // cycling data out of and back into the device, or whether gemm is launching
+    // its own threads.
+
+    amrex::Real const * invM_arr = invM.dataPtr();
+    amrex::Array4<const amrex::Real> const & G_arr = G.array();
+    amrex::Array4<      amrex::Real> const & F_arr = F.array();
+
+    int const nk = m_nk;
+
+    ParallelFor(G_box,
+    [=] AMREX_GPU_DEVICE(int ir, int iz, int inotused) noexcept {
+        F_arr(ir,iz,F_icomp) = 0.;
+        for (int ik=0 ; ik < nk ; ik++) {
+            int const ii = ik + ir*nk;
+            F_arr(ir,iz,F_icomp) += invM_arr[ii]*G_arr(ik,iz,G_icomp);
+        }
+    });
+
+#endif
+
+}
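The sketch below is not part of the commit; it only illustrates how the new class is intended to be driven. It assumes an initialized AMReX environment, that dimension 0 is r and dimension 1 is z (as in the transforms above), and uses illustrative values for nr, nz, rmax, the box layout, and the component indices.

// Hypothetical usage sketch (not part of this commit). Assumes AMReX has been
// initialized; nr, nz, rmax, the box, and the component indices are illustrative.
#include "HankelTransform.H"

#include <AMReX_Box.H>
#include <AMReX_FArrayBox.H>

void ExampleRoundTrip ()
{
    int const nr = 64;
    int const nz = 128;
    amrex::Real const rmax = 1.0;

    // Azimuthal mode m = 0 with hankel_order == azimuthal_mode == 0
    HankelTransform dht(0, 0, nr, rmax);

    // F holds the field on the radial mesh, G the field on the spectral (kr) mesh.
    // Both boxes share the same lower corner, so the offset ngr checked above is 0.
    amrex::Box const rz_box(amrex::IntVect(AMREX_D_DECL(0, 0, 0)),
                            amrex::IntVect(AMREX_D_DECL(nr-1, nz-1, 0)));
    amrex::FArrayBox F(rz_box, 1);
    amrex::FArrayBox G(rz_box, 1);

    // ... fill component 0 of F with field values ...

    dht.HankelForwardTransform(F, 0, G, 0);   // real space -> spectral space
    dht.HankelInverseTransform(G, 0, F, 0);   // spectral space -> real space
}

Note that the constructor precomputes both M and invM once per (hankel_order, azimuthal_mode, nr, rmax) combination, so each subsequent transform reduces to a single gemm call (or the explicit GPU loop).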