aboutsummaryrefslogtreecommitdiff
path: root/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
diff options
context:
space:
mode:
authorGravatar David Grote <grote1@llnl.gov> 2020-04-14 15:51:10 -0700
committerGravatar GitHub <noreply@github.com> 2020-04-14 15:51:10 -0700
commitd61cdd0fbef823f5135f7e9c4251ec09ee586651 (patch)
tree7fc39e980f69013840402c9419c7e2134b3d3f6c /Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
parentb28820fe545779411bfb32ca3c574ad89a2a611a (diff)
downloadWarpX-d61cdd0fbef823f5135f7e9c4251ec09ee586651.tar.gz
WarpX-d61cdd0fbef823f5135f7e9c4251ec09ee586651.tar.zst
WarpX-d61cdd0fbef823f5135f7e9c4251ec09ee586651.zip
Implementation of the RZ spectral solver (#816)
* For diagnostics, added RZ modes of scalars, allowed different centerings * For RZ, generalized the centering of the inverse volume scaling of J and rho * Fixed spacing in ConstructTotalRZScalarField Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Added Python wrapper of charge density arrays * Add assert ensuring that Jr is never node-centered Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Small fixes to Python to better handle particle weights * Implementation of the RZ spectral solver * Removed k-space filtering code * Removed more k-space filter code from RZ spectral solver * For RZ spectral, added _rt for literals and cleaned up namespace use * In RZ spectral solver, cleaned up some member names * Update Docs/source/building/rzgeometry.rst Small fix for clarity. Co-Authored-By: Remi Lehe <remi.lehe@normalesup.org> * Update Source/Evolve/WarpXEvolve.cpp Fix macro indentation Co-Authored-By: Remi Lehe <remi.lehe@normalesup.org> * Update Source/Evolve/WarpXEvolve.cpp Fix more macro indentation Co-Authored-By: Remi Lehe <remi.lehe@normalesup.org> * Update Source/Evolve/WarpXEvolve.cpp Fix another macro indentation Co-Authored-By: Remi Lehe <remi.lehe@normalesup.org> * New diagnostics support RZ (#836) * Start implementation of new averaging with staggering: - face-to-cell-center and edge-to-cell-center replaced so far; - TODO: node-to-cell-center and 1D behavior (AMREX_SPACEDIM=1). * first implementation of Diags base classes * add example, temporarily * Continue implementation of new averaging with staggering: - new function takes reference to single MultiFab (no vector); - TODO: node-to-cell-center still in progress. * Fix small bug and clean up * Fix bug in loop over n=0,...,ncomp-1 and clean up * add more functions * Add Doxygen documentation and clean up * Small clean-up in Doxygen documentation * Compile in single precision: add _rt suffix to avoid unnecessary conversions * Avoid accessing staggering index directly from IntVect in innermost loops * Replace do-while loop with for loop (default ncomp=1) * Remove temporary pointer and pass reference to MultiFab (instead of MultiFab*) * Replace AMREX_LAUNCH_HOST_DEVICE_LAMBDA with ParallelFor * cleaning and initialize output mf * use general average routine * move flush in new class, and implemented the Plotfile derived class * add comments * eol * free memory in destructor * typo * typo * no need to clear MF pointers there * though shalt not break existing tests * FlushRaw doesnt have to be virtual for now * The importance of being constant * Capability to select fields in output files * EOL * revert to old inputs * const in right place * avoid brace initializer there * oops, fix logic error in is_in * user can choose flush interval, same behavior as plot_int * Add option to plot raw fields * eol * replace ter flush with dump to avoid confusion * add options * Diagnostics stores a vector of functors to compute diags on the fly * eol * Field gather from diags to sync particle quantities * New diagnostics handle RZ with same behavior as old ones * cleaning and doc * const ref for string * smarter for loop from Axel and typo fix from Reva * simplify code following Dave's comments * rename mode_avg to convertRZmodes2cartesian * Update CellCenterFunctor.H * fill varnames and vector of functors at the same time * WarpX instance not needed here * add const Co-authored-by: Edoardo Zoni <ezoni@lbl.gov> * Add load balance options documentation (#842) * Add load balance options documentation * Add load balance options documentations * EOL * Replace tilebox by growntilebox (#849) * Updated Profiling information in running_cpp (#776) * Fixed link that was pointing to 404 error page * Added motivation for profiling and TINYPROFILERS explanation * Moved section on NERSC profiling to developers Docs * Update Docs/source/running_cpp/profiling.rst Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Update tiny profilers suggestion Docs/source/running_cpp/profiling.rst Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Fix typo Docs/source/running_cpp/profiling.rst Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Add a few additional diags (divE etc.) (#844) * Start implementation of new averaging with staggering: - face-to-cell-center and edge-to-cell-center replaced so far; - TODO: node-to-cell-center and 1D behavior (AMREX_SPACEDIM=1). * first implementation of Diags base classes * add example, temporarily * Continue implementation of new averaging with staggering: - new function takes reference to single MultiFab (no vector); - TODO: node-to-cell-center still in progress. * Fix small bug and clean up * Fix bug in loop over n=0,...,ncomp-1 and clean up * add more functions * Add Doxygen documentation and clean up * Small clean-up in Doxygen documentation * Compile in single precision: add _rt suffix to avoid unnecessary conversions * Avoid accessing staggering index directly from IntVect in innermost loops * Replace do-while loop with for loop (default ncomp=1) * Remove temporary pointer and pass reference to MultiFab (instead of MultiFab*) * Replace AMREX_LAUNCH_HOST_DEVICE_LAMBDA with ParallelFor * cleaning and initialize output mf * use general average routine * move flush in new class, and implemented the Plotfile derived class * add comments * eol * free memory in destructor * typo * typo * no need to clear MF pointers there * though shalt not break existing tests * FlushRaw doesnt have to be virtual for now * The importance of being constant * Capability to select fields in output files * EOL * revert to old inputs * const in right place * avoid brace initializer there * oops, fix logic error in is_in * user can choose flush interval, same behavior as plot_int * Add option to plot raw fields * eol * replace ter flush with dump to avoid confusion * add options * Diagnostics stores a vector of functors to compute diags on the fly * eol * Field gather from diags to sync particle quantities * New diagnostics handle RZ with same behavior as old ones * cleaning and doc * const ref for string * smarter for loop from Axel and typo fix from Reva * Functors to compute some fields * simplify code following Dave's comments * Create subfolders and add more output options (divE etc.) * eol * rename mode_avg to convertRZmodes2cartesian * Update CellCenterFunctor.H * fill varnames and vector of functors at the same time * output rho_new, not rho_old * WarpX instance not needed here * add const * little bit more of reorganization * Apply suggestions from code review Co-Authored-By: Axel Huebl <axel.huebl@plasma.ninja> * add a bunch of const * make derived classes final Co-authored-by: Edoardo Zoni <ezoni@lbl.gov> Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja> * Add Initial Distribution Test (#735) * Add Histogram * Add normalization * Add doc * Minor * Minor * Fix a bug * Add gaussian distribution test * Fix alert and change amr.plot_int * Add maxwell-boltzmann distribution test * Add maxwell-boltzmann distribution test * Add maxwell-boltzmann distribution test * Add maxwell-juttner * Minor * Typo * Minor * Minor * Add const * Apply suggestions from code review Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Modify based on suggestions. * Add histogram name * Add bin values * Don't add histogram name * Modify read_raw_data.py * Add doc * Change ux,uy,uz units * Change ux,uy,uz units * Change if format * Save some variables * Change more * Minor * Fix a bug on GPU * Fix a bug on GPU * Add wrong species name abort * Minor doc * Change #include format * Apply suggestions from code review Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Add const * Change to member variables * revert * Change units based on changes of PR#727 * merge * Add Gaussian position distribution test * Minor * Change based on suggestions * Use read_raw_data.py * Minor * Change to no normalization * Add more in doc * doc * doc * Use relative error * Don't divide by bin_size * Change based on suggestions Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Tests: Fix Bool Switch Typo OMP (#854) useOMP is 0 (False) or 1 (True) * Costs vector of (pointer to) vector (#829) * [WIP] costs mf --> costs vector * [WIP] costs vector * [WIP] vector costs * formatting * makefile * [WIP] costs vector * [WIP] *= costs * wts do not need to divide by num cells * Tiling safety on CPU * Add tests * EOL * Remove unneeded input * Update Source/WarpX.H costs documentation Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Update timers with times only if user Timers update * warpx.-->WarpX:: * warpx.-->WarpX:: * warpx.-->WarpX:: * warpx.-->WarpX:: * warpx.-->WarpX:: * add dev synch * Update Regression/WarpX-tests.ini Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Delete inputs_loadbalance_costs_heuristic * Update and rename inputs_loadbalancecosts_timers to inputs_loadbalancecosts * Update WarpX-tests.ini Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * [mini-PR] Read species distribution from OPMD file (#847) * Added <species>.profile=external_file and .profile_file * Added description of input parameters to Docs * Changed from profile to injection option for external file * Fix typo in amrex abort message (due to copy paste) * Added the OpenPMD use amrex abort message * Minor fix - not sure how to remove EOL issue * Tried to add AddExternalFileBeam functon to PhysicalParticleContainer * Trued to fix EOL white space issue * Added read/print species name from OPMD file * Update Source/Initialization/PlasmaInjector.cpp Co-Authored-By: Axel Huebl <axel.huebl@plasma.ninja> * Update Source/Particles/PhysicalParticleContainer.cpp Co-Authored-By: Axel Huebl <axel.huebl@plasma.ninja> * Update Source/Particles/PhysicalParticleContainer.cpp Co-Authored-By: Axel Huebl <axel.huebl@plasma.ninja> * Update Source/Particles/PhysicalParticleContainer.H Co-Authored-By: Axel Huebl <axel.huebl@plasma.ninja> * Update Source/Particles/PhysicalParticleContainer.cpp Co-Authored-By: Axel Huebl <axel.huebl@plasma.ninja> * No need to include openPMD header yet * Fix EOL according to @ax3l's recommendation in #845 * Remove commented out AbortMessage Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Removed commented out part initialization (used only in branch for next PR) Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Added warning that this is WIP Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Changed function name to AddPlasmaFromFile * Removed AMReX warning from loop Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja> Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Ignore python build/dist and egg folders (#850) * Travis CI: set max numprocs=2 and do not overwrite (#860) * [mini-PR] Fix bug in Breit-Wheeler engine (#852) * fixed bug in BW engine * fixed bug * fixed bug * fixed bug * fixed bug * fixed bug * eliminate useless variable * updated test * updated inputfile * Updated tests * increase tolerance from .04 to .07 in QED 3D BW test * do plot pos_bw and ele_bw Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Documentation update - towards full SI (#301) * Added blank line after list. Changed characters in link to Q. H. Liu paper so hyoerlink works with sphinx-build 2.1.2. * Added line cut unintentionally. * Removed line added unintentionally. * Same as before. * Same as before, but hopefully successfully * Same as before, but hopefully successfully * Minor changes to description of PWFA example run * Revert "Minor changes to description of PWFA example run" This reverts commit a4d7fa969c906959b018efe683a3e585cbd741f9. * Revert "Profiler wrapper to allow for cudaDeviceSynchronize (#738)" This reverts commit bbefc3dad687f4370afd5bc85386d983201cb321. * Revert "Revert "Minor changes to description of PWFA example run"" This reverts commit 965982d35361ff54d0ad10101ffc31605117e5ac. * Revert "Minor changes to description of PWFA example run" This reverts commit a4d7fa969c906959b018efe683a3e585cbd741f9. * I am making a huge mess with merging * Minor changes to description of PWFA example run * Added explanation PWFA simulation section * Re-structuring. Adding sections for each choice. * Minor fix to note * Minor changes to text * Time step description + fixed line length * Added FDTD and CKC selection * Added max time step calculations * Trying to fix EOL issue * Added mesh refinement and moving window * Fixed minor issues * Fix EOL issues again * Fixed typo - auxiliary Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> Co-authored-by: Diana Amorim <diana@henrivincenti.dhcp.lbl.gov> Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Remove compiler warnings (#843) * Fix compiler warnings with DIM=2 * Fix compiler warnings with USE_RZ=TRUE * Fix compiler warnings with USE_PSATD=TRUE and DIM=2 * Fix compiler warnings with USE_PSATD=TRUE and DIM=3 * Fix bug: discard only return value when calling DefineAndReturnParticleTile * Remove unused variables not triggering warnings * [mini-PR] Fix energy calculation for photons in reduced diagnostics (#861) * fix energy calculation for photons * fixed typo and mada calculation clearer * added photon particles in reduced diags test * Update Source/Diagnostics/ReducedDiags/ParticleEnergy.cpp Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Update Source/Diagnostics/ReducedDiags/ParticleEnergy.cpp Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Port rigid injection to the gpu (#862) * port rigid injection to the gpu * eol * Apply suggestions from code review Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * define csqi Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * Added blocking factor to 2d and RZ geometries (#864) * doc: fix formatting for ascent yaml examples (#865) * [mini-PR] Clarifying ionizable particle charge (#863) * Added documentation note on ionization particle charge * Added correct charge of ion to be ionized * Corrected multiplication symbol Co-Authored-By: MaxThevenet <mthevenet@lbl.gov> * Testing doxygen issue * Charge correction only to ionizable species * Trying to fix doxygen url fetch issue Co-authored-by: MaxThevenet <mthevenet@lbl.gov> * In HankelTransform, added explicit matrix multiply for GPU * In RZ spectral solver, update setval to be on device * Removed CEXE_headers from FieldSolver/SpectralSolver/Make.package * In HankelTransform, added check of the Bessel root finder * Updated includes to RZ spectral solver * Added comments on how Hankel transform matrix is calculated * Added more comments to Hankel transform calculation * For RZ spectral solver, cleaned up naming and add subdirectory for Hankel transform files * Cleaned up code in PsatdAlgorithmRZ.cpp * Updated comment for fields in SpectralFieldDataRZ.cpp * Changed HankelTransformer to MultiSpectralHankelTransformer * Updates comments in RZ spectral solver * Removed code for k-space filtering that will be added in a later PR * For RZ spectral solver, passed lev in * Fixed comment in SpectralFieldDataRZ.cpp Co-authored-by: MaxThevenet <mthevenet@lbl.gov> Co-authored-by: Remi Lehe <remi.lehe@normalesup.org> Co-authored-by: Edoardo Zoni <ezoni@lbl.gov> Co-authored-by: Michael E Rowan <38045958+mrowan137@users.noreply.github.com> Co-authored-by: NeilZaim <49716072+NeilZaim@users.noreply.github.com> Co-authored-by: L. Diana Amorim <LDianaAmorim@lbl.gov> Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja> Co-authored-by: Yinjian Zhao <yinjianzhao@lbl.gov> Co-authored-by: Edoardo Zoni <59625522+EZoni@users.noreply.github.com> Co-authored-by: Luca Fedeli <luca.fedeli@cea.fr> Co-authored-by: Diana Amorim <diana@henrivincenti.dhcp.lbl.gov> Co-authored-by: Andrew Myers <atmyers@lbl.gov> Co-authored-by: Cyrus Harrison <cyrush@llnl.gov>
Diffstat (limited to 'Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp')
-rw-r--r--Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp272
1 files changed, 272 insertions, 0 deletions
diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
new file mode 100644
index 000000000..c5249d54f
--- /dev/null
+++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
@@ -0,0 +1,272 @@
+/* Copyright 2019 David Grote
+ *
+ * This file is part of WarpX.
+ *
+ * License: BSD-3-Clause-LBNL
+ */
+#include "WarpX.H"
+
+#include "HankelTransform.H"
+#include "BesselRoots.H"
+
+#include <blas.hh>
+#include <lapack.hh>
+
+using amrex::operator""_rt;
+
+HankelTransform::HankelTransform (int const hankel_order,
+ int const azimuthal_mode,
+ int const nr,
+ const amrex::Real rmax)
+: m_nr(nr), m_nk(nr)
+{
+
+ // Check that azimuthal_mode has a valid value
+ AMREX_ALWAYS_ASSERT_WITH_MESSAGE(hankel_order-1 <= azimuthal_mode && azimuthal_mode <= hankel_order+1,
+ "azimuthal_mode must be either hankel_order-1, hankel_order or hankel_order+1");
+
+ RealVector alphas;
+ amrex::Vector<int> alpha_errors;
+
+ GetBesselRoots(azimuthal_mode, m_nk, alphas, alpha_errors);
+ // Add of check of alpha_errors, should be all zeros
+ AMREX_ALWAYS_ASSERT(std::all_of(alpha_errors.begin(), alpha_errors.end(), [](int i) { return i == 0; }));
+
+ // Calculate the spectral grid
+ m_kr.resize(m_nk);
+ for (int ik=0 ; ik < m_nk ; ik++) {
+ m_kr[ik] = alphas[ik]/rmax;
+ }
+
+ // Calculate the spatial grid (Uniform grid with a half-cell offset)
+ RealVector rmesh(m_nr);
+ amrex::Real dr = rmax/m_nr;
+ for (int ir=0 ; ir < m_nr ; ir++) {
+ rmesh[ir] = dr*(ir + 0.5_rt);
+ }
+
+ // Calculate and store the inverse matrix invM
+ // (imposed by the constraints on the DHT of Bessel modes)
+ // NB: When compared with the FBPIC article, all the matrices here
+ // are calculated in transposed form. This is done so as to use the
+ // `dot` and `gemm` functions, in the `transform` method.
+ int p_denom;
+ if (hankel_order == azimuthal_mode) {
+ p_denom = hankel_order + 1;
+ } else {
+ p_denom = hankel_order;
+ }
+
+ RealVector denom(m_nk);
+ for (int ik=0 ; ik < m_nk ; ik++) {
+ const amrex::Real jna = jn(p_denom, alphas[ik]);
+ denom[ik] = MathConst::pi*rmax*rmax*jna*jna;
+ }
+
+ RealVector num(m_nk*m_nr);
+ for (int ir=0 ; ir < m_nr ; ir++) {
+ for (int ik=0 ; ik < m_nk ; ik++) {
+ int const ii = ik + ir*m_nk;
+ num[ii] = jn(hankel_order, rmesh[ir]*m_kr[ik]);
+ }
+ }
+
+ // Get the inverse matrix
+ invM.resize(m_nk*m_nr);
+ if (azimuthal_mode > 0) {
+ for (int ir=0 ; ir < m_nr ; ir++) {
+ for (int ik=1 ; ik < m_nk ; ik++) {
+ int const ii = ik + ir*m_nk;
+ invM[ii] = num[ii]/denom[ik];
+ }
+ }
+ // ik = 0
+ // In this case, the functions are represented by Bessel functions
+ // *and* an additional mode (below) which satisfies the same
+ // algebric relations for curl/div/grad as the regular Bessel modes,
+ // with the value kperp=0.
+ // The normalization of this mode is arbitrary, and is chosen
+ // so that the condition number of invM is close to 1
+ if (hankel_order == azimuthal_mode-1) {
+ for (int ir=0 ; ir < m_nr ; ir++) {
+ int const ii = ir*m_nk;
+ invM[ii] = std::pow(rmesh[ir], (azimuthal_mode-1))/(MathConst::pi*std::pow(rmax, (azimuthal_mode+1)));
+ }
+ } else {
+ for (int ir=0 ; ir < m_nr ; ir++) {
+ int const ii = ir*m_nk;
+ invM[ii] = 0.;
+ }
+ }
+ } else {
+ for (int ir=0 ; ir < m_nr ; ir++) {
+ for (int ik=0 ; ik < m_nk ; ik++) {
+ int const ii = ik + ir*m_nk;
+ invM[ii] = num[ii]/denom[ik];
+ }
+ }
+ }
+
+ // Calculate the matrix M by inverting invM
+ if (azimuthal_mode !=0 && hankel_order != azimuthal_mode-1) {
+ // In this case, invM is singular, thus we calculate the pseudo-inverse.
+ // The Moore-Penrose psuedo-inverse is calculated using the SVD method.
+
+ M.resize(m_nk*m_nr, 0.);
+ RealVector invMcopy(invM);
+ RealVector sdiag(m_nk-1, 0.);
+ RealVector u((m_nk-1)*(m_nk-1), 0.);
+ RealVector vt((m_nr)*(m_nr), 0.);
+ RealVector sp((m_nr)*(m_nk-1), 0.);
+ RealVector temp((m_nr)*(m_nk-1), 0.);
+
+ // Calculate the singlular-value-decomposition of invM (leaving out the first row).
+ // invM = u*sdiag*vt
+ // Note that invMcopy.dataPtr()+1 is passed in so that the first ik row is skipped
+ // A copy is passed in since the matrix is destroyed
+ lapack::gesvd(lapack::Job::AllVec, lapack::Job::AllVec,
+ m_nk-1, m_nr, invMcopy.dataPtr()+1, m_nk,
+ sdiag.dataPtr(), u.dataPtr(), m_nk-1,
+ vt.dataPtr(), m_nr);
+
+ // Calculate the pseudo-inverse of sdiag, which is trivial since it
+ // only has values of the diagonal.
+ for (int i=0 ; i < m_nk-1 ; i++) {
+ if (sdiag[i] != 0.) {
+ int const j = i + i*m_nk;
+ sp[j] = 1._rt/sdiag[i];
+ }
+ }
+
+ // M can be calculated from the SVD
+ // M = v*sp*u_transpose
+
+ // Do the second multiply, temp = sp*u_transpose
+ // temp(1:n,1:m) = matmul(transpose(vt(1:n,1:n)), matmul(sp(1:n,1:m), transpose(u(1:m,1:m))))
+ blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans,
+ m_nr, m_nk-1, m_nk-1, 1._rt,
+ sp.dataPtr(), m_nr,
+ u.dataPtr(), m_nk-1, 0._rt,
+ temp.dataPtr(), m_nr);
+
+ // Do the first mutiply, M = vt*temp
+ // Note that M.dataPtr()+m_nr is passed in so that the first ir column is skipped
+ blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+ m_nr, m_nk-1, m_nr, 1.,
+ vt.dataPtr(), m_nr,
+ temp.dataPtr(), m_nr, 0._rt,
+ M.dataPtr()+m_nr, m_nr);
+
+ } else {
+ // In this case, invM is invertible; calculate the inverse.
+ // getrf calculates the LU decomposition
+ // getri calculates the inverse from the LU decomposition
+
+ M = invM;
+ amrex::Vector<int64_t> ipiv(m_nr);
+ lapack::getrf(m_nk, m_nr, M.dataPtr(), m_nk, ipiv.dataPtr());
+ lapack::getri(m_nr, M.dataPtr(), m_nr, ipiv.dataPtr());
+
+ }
+
+}
+
+void
+HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_icomp,
+ amrex::FArrayBox & G, int const G_icomp)
+{
+ amrex::Box const& F_box = F.box();
+ amrex::Box const& G_box = G.box();
+
+ int const nrF = F_box.length(0);
+ int const nz = F_box.length(1);
+ int const ngr = G_box.smallEnd(0) - F_box.smallEnd(0);
+
+ AMREX_ALWAYS_ASSERT(m_nr == G_box.length(0));
+ AMREX_ALWAYS_ASSERT(nz == G_box.length(1));
+ AMREX_ALWAYS_ASSERT(ngr >= 0);
+ AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
+
+#ifndef AMREX_USE_GPU
+ // On CPU, the blas::gemm is significantly faster
+
+ // Note that M is flagged to be transposed since it has dimensions (m_nr, m_nk)
+ blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+ m_nk, nz, m_nr, 1._rt,
+ M.dataPtr(), m_nk,
+ F.dataPtr(F_icomp)+ngr, nrF, 0._rt,
+ G.dataPtr(G_icomp), m_nk);
+
+#else
+ // On GPU, the explicit loop is significantly faster
+ // It is not clear if the GPU gemm wasn't build properly, it is cycling data out and back
+ // in to the device, or if it is because gemm is launching its own threads.
+
+ amrex::Real const * M_arr = M.dataPtr();
+ amrex::Array4<const amrex::Real> const & F_arr = F.array();
+ amrex::Array4< amrex::Real> const & G_arr = G.array();
+
+ int const nr = m_nr;
+
+ ParallelFor(G_box,
+ [=] AMREX_GPU_DEVICE(int ik, int iz, int inotused) noexcept {
+ G_arr(ik,iz,G_icomp) = 0.;
+ for (int ir=0 ; ir < nr ; ir++) {
+ int const ii = ir + ik*nr;
+ G_arr(ik,iz,G_icomp) += M_arr[ii]*F_arr(ir,iz,F_icomp);
+ }
+ });
+
+#endif
+
+}
+
+void
+HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_icomp,
+ amrex::FArrayBox & F, int const F_icomp)
+{
+ amrex::Box const& G_box = G.box();
+ amrex::Box const& F_box = F.box();
+
+ int const nrF = F_box.length(0);
+ int const nz = F_box.length(1);
+ int const ngr = G_box.smallEnd(0) - F_box.smallEnd(0);
+
+ AMREX_ALWAYS_ASSERT(m_nr == G_box.length(0));
+ AMREX_ALWAYS_ASSERT(nz == G_box.length(1));
+ AMREX_ALWAYS_ASSERT(ngr >= 0);
+ AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
+
+#ifndef AMREX_USE_GPU
+ // On CPU, the blas::gemm is significantly faster
+
+ // Note that invM is flagged to be transposed since it has dimensions (m_nk, m_nr)
+ blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+ m_nr, nz, m_nk, 1._rt,
+ invM.dataPtr(), m_nr,
+ G.dataPtr(G_icomp), m_nk, 0._rt,
+ F.dataPtr(F_icomp)+ngr, nrF);
+
+#else
+ // On GPU, the explicit loop is significantly faster
+ // It is not clear if the GPU gemm wasn't build properly, it is cycling data out and back
+ // in to the device, or if it is because gemm is launching its own threads.
+
+ amrex::Real const * invM_arr = invM.dataPtr();
+ amrex::Array4<const amrex::Real> const & G_arr = G.array();
+ amrex::Array4< amrex::Real> const & F_arr = F.array();
+
+ int const nk = m_nk;
+
+ ParallelFor(G_box,
+ [=] AMREX_GPU_DEVICE(int ir, int iz, int inotused) noexcept {
+ F_arr(ir,iz,F_icomp) = 0.;
+ for (int ik=0 ; ik < nk ; ik++) {
+ int const ii = ik + ir*nk;
+ F_arr(ir,iz,F_icomp) += invM_arr[ii]*G_arr(ik,iz,G_icomp);
+ }
+ });
+
+#endif
+
+}