/* Copyright 2019 David Grote
*
* This file is part of WarpX.
*
* License: BSD-3-Clause-LBNL
*/
#include "HankelTransform.H"
#include "BesselRoots.H"
#include "Utils/WarpXConst.H"
#include "WarpX.H"
#include <blas.hh>
#include <lapack.hh>
using amrex::operator""_rt;
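// The constructor precomputes, for the given Hankel order and azimuthal mode,
// the spectral grid m_kr and the two transform matrices:
//   m_M    : used by HankelForwardTransform (spatial -> spectral)
//   m_invM : used by HankelInverseTransform (spectral -> spatial)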
HankelTransform::HankelTransform (int const hankel_order,
int const azimuthal_mode,
int const nr,
const amrex::Real rmax)
: m_nr(nr), m_nk(nr)
{
// Check that azimuthal_mode has a valid value
AMREX_ALWAYS_ASSERT_WITH_MESSAGE(hankel_order-1 <= azimuthal_mode && azimuthal_mode <= hankel_order+1,
"azimuthal_mode must be either hankel_order-1, hankel_order or hankel_order+1");
amrex::Vector<amrex::Real> alphas;
amrex::Vector<int> alpha_errors;
GetBesselRoots(azimuthal_mode, m_nk, alphas, alpha_errors);
// Check alpha_errors; all entries should be zero
AMREX_ALWAYS_ASSERT(std::all_of(alpha_errors.begin(), alpha_errors.end(), [](int i) { return i == 0; }));
// Calculate the spectral grid
amrex::Vector<amrex::Real> kr(m_nk);
for (int ik=0 ; ik < m_nk ; ik++) {
kr[ik] = alphas[ik]/rmax;
}
// Calculate the spatial grid (Uniform grid with a half-cell offset)
amrex::Vector<amrex::Real> rmesh(m_nr);
amrex::Real dr = rmax/m_nr;
for (int ir=0 ; ir < m_nr ; ir++) {
rmesh[ir] = dr*(ir + 0.5_rt);
}
// Calculate and store the inverse matrix invM
// (imposed by the constraints on the DHT of Bessel modes)
// NB: When compared with the FBPIC article, all the matrices here
// are calculated in transposed form. This is done so as to use the
// `dot` and `gemm` functions in the transform methods.
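// Explicitly, with p = hankel_order, the entries built below are
//   invM[ik + ir*m_nk] = J_p(kr[ik]*rmesh[ir]) / (pi * rmax^2 * J_q(alphas[ik])^2)
// where q = p+1 if p == azimuthal_mode and q = p otherwise,
// with a special treatment of the ik = 0 column when azimuthal_mode > 0.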
int p_denom;
if (hankel_order == azimuthal_mode) {
p_denom = hankel_order + 1;
} else {
p_denom = hankel_order;
}
amrex::Vector<amrex::Real> denom(m_nk);
for (int ik=0 ; ik < m_nk ; ik++) {
const amrex::Real jna = jn(p_denom, alphas[ik]);
denom[ik] = MathConst::pi*rmax*rmax*jna*jna;
}
amrex::Vector<amrex::Real> num(m_nk*m_nr);
for (int ir=0 ; ir < m_nr ; ir++) {
for (int ik=0 ; ik < m_nk ; ik++) {
int const ii = ik + ir*m_nk;
num[ii] = jn(hankel_order, rmesh[ir]*kr[ik]);
}
}
// Get the inverse matrix
amrex::Vector<amrex::Real> invM(m_nk*m_nr);
if (azimuthal_mode > 0) {
for (int ir=0 ; ir < m_nr ; ir++) {
for (int ik=1 ; ik < m_nk ; ik++) {
int const ii = ik + ir*m_nk;
invM[ii] = num[ii]/denom[ik];
}
}
// ik = 0
// In this case, the functions are represented by Bessel functions
// *and* an additional mode (below) which satisfies the same
// algebraic relations for curl/div/grad as the regular Bessel modes,
// with the value kperp=0.
// The normalization of this mode is arbitrary, and is chosen
// so that the condition number of invM is close to 1
if (hankel_order == azimuthal_mode-1) {
for (int ir=0 ; ir < m_nr ; ir++) {
int const ii = ir*m_nk;
invM[ii] = std::pow(rmesh[ir], (azimuthal_mode-1))/(MathConst::pi*std::pow(rmax, (azimuthal_mode+1)));
}
} else {
for (int ir=0 ; ir < m_nr ; ir++) {
int const ii = ir*m_nk;
invM[ii] = 0.;
}
}
} else {
for (int ir=0 ; ir < m_nr ; ir++) {
for (int ik=0 ; ik < m_nk ; ik++) {
int const ii = ik + ir*m_nk;
invM[ii] = num[ii]/denom[ik];
}
}
}
amrex::Vector<amrex::Real> M;
// Calculate the matrix M by inverting invM
if (azimuthal_mode !=0 && hankel_order != azimuthal_mode-1) {
// In this case, invM is singular, thus we calculate the pseudo-inverse.
// The Moore-Penrose pseudo-inverse is calculated using the SVD method.
M.resize(m_nk*m_nr, 0.);
amrex::Vector<amrex::Real> invMcopy(invM);
amrex::Vector<amrex::Real> sdiag(m_nk-1, 0.);
amrex::Vector<amrex::Real> u((m_nk-1)*(m_nk-1), 0.);
amrex::Vector<amrex::Real> vt((m_nr)*(m_nr), 0.);
amrex::Vector<amrex::Real> sp((m_nr)*(m_nk-1), 0.);
amrex::Vector<amrex::Real> temp((m_nr)*(m_nk-1), 0.);
// Calculate the singular value decomposition (SVD) of invM (leaving out the first row).
// invM = u*sdiag*vt
// Note that invMcopy.dataPtr()+1 is passed in so that the first ik row is skipped
// A copy is passed in since the matrix is destroyed
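// Here invM (without its ik = 0 row) is treated as an (m_nk-1) x m_nr matrix
// whose leading dimension is m_nk, the column stride of the full matrix.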
lapack::gesvd(lapack::Job::AllVec, lapack::Job::AllVec,
m_nk-1, m_nr, invMcopy.dataPtr()+1, m_nk,
sdiag.dataPtr(), u.dataPtr(), m_nk-1,
vt.dataPtr(), m_nr);
// Calculate the pseudo-inverse of sdiag, which is trivial since it
// only has values on the diagonal.
for (int i=0 ; i < m_nk-1 ; i++) {
if (sdiag[i] != 0.) {
int const j = i + i*m_nk;
sp[j] = 1._rt/sdiag[i];
}
}
// M can be calculated from the SVD: M = v*sp*u_transpose, with v = transpose(vt).
// With n = m_nr and m = m_nk-1, this is
// M(1:n,1:m) = matmul(transpose(vt(1:n,1:n)), matmul(sp(1:n,1:m), transpose(u(1:m,1:m))))
// Do the inner multiply first, temp = sp*u_transpose
blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans,
m_nr, m_nk-1, m_nk-1, 1._rt,
sp.dataPtr(), m_nr,
u.dataPtr(), m_nk-1, 0._rt,
temp.dataPtr(), m_nr);
// Then do the outer multiply, M = transpose(vt)*temp
// Note that M.dataPtr()+m_nr is passed in so that the first column (ik = 0) is skipped
blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
m_nr, m_nk-1, m_nr, 1.,
vt.dataPtr(), m_nr,
temp.dataPtr(), m_nr, 0._rt,
M.dataPtr()+m_nr, m_nr);
} else {
// In this case, invM is invertible; calculate the inverse.
// getrf calculates the LU decomposition
// getri calculates the inverse from the LU decomposition
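// Note that m_nk == m_nr (both are set to nr in the constructor), so invM is square here.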
M = invM;
amrex::Vector<int64_t> ipiv(m_nr);
lapack::getrf(m_nk, m_nr, M.dataPtr(), m_nk, ipiv.dataPtr());
lapack::getri(m_nr, M.dataPtr(), m_nr, ipiv.dataPtr());
}
m_kr.resize(kr.size());
m_invM.resize(invM.size());
m_M.resize(M.size());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, kr.begin(), kr.end(), m_kr.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, invM.begin(), invM.end(), m_invM.begin());
amrex::Gpu::copyAsync(amrex::Gpu::hostToDevice, M.begin(), M.end(), m_M.begin());
amrex::Gpu::synchronize();
}
void
HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_icomp,
amrex::FArrayBox & G, int const G_icomp)
{
amrex::Box const& F_box = F.box();
amrex::Box const& G_box = G.box();
int const nrF = F_box.length(0);
int const nz = F_box.length(1);
int const ngr = G_box.smallEnd(0) - F_box.smallEnd(0);
AMREX_ALWAYS_ASSERT(m_nr == G_box.length(0));
AMREX_ALWAYS_ASSERT(nz == G_box.length(1));
AMREX_ALWAYS_ASSERT(ngr >= 0);
AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
#ifndef AMREX_USE_GPU
// On CPU, the blas::gemm is significantly faster
// Note that M is flagged to be transposed since it has dimensions (m_nr, m_nk)
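// This computes G(ik,iz) = sum over ir of m_M[ir + ik*m_nr] * F(ir,iz),
// the same sum that the explicit GPU loop below writes out.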
blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
m_nk, nz, m_nr, 1._rt,
m_M.dataPtr(), m_nk,
F.dataPtr(F_icomp)+ngr, nrF, 0._rt,
G.dataPtr(G_icomp), m_nk);
#else
// On GPU, the explicit loop is significantly faster
// It is not clear whether the GPU gemm wasn't built properly, whether it is cycling data
// out of and back into the device, or whether gemm is launching its own threads.
amrex::Real const * M_arr = m_M.dataPtr();
amrex::Array4<const amrex::Real> const & F_arr = F.array();
amrex::Array4< amrex::Real> const & G_arr = G.array();
int const nr = m_nr;
amrex::ParallelFor(G_box,
[=] AMREX_GPU_DEVICE(int ik, int iz, int inotused) noexcept {
G_arr(ik,iz,G_icomp) = 0.;
for (int ir=0 ; ir < nr ; ir++) {
int const ii = ir + ik*nr;
G_arr(ik,iz,G_icomp) += M_arr[ii]*F_arr(ir,iz,F_icomp);
}
});
#endif
}
void
HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_icomp,
amrex::FArrayBox & F, int const F_icomp)
{
amrex::Box const& G_box = G.box();
amrex::Box const& F_box = F.box();
int const nrF = F_box.length(0);
int const nz = F_box.length(1);
int const ngr = G_box.smallEnd(0) - F_box.smallEnd(0);
AMREX_ALWAYS_ASSERT(m_nr == G_box.length(0));
AMREX_ALWAYS_ASSERT(nz == G_box.length(1));
AMREX_ALWAYS_ASSERT(ngr >= 0);
AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
#ifndef AMREX_USE_GPU
// On CPU, the blas::gemm is significantly faster
// Note that m_invM is flagged to be transposed since it has dimensions (m_nk, m_nr)
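// This computes F(ir,iz) = sum over ik of m_invM[ik + ir*m_nk] * G(ik,iz),
// the same sum that the explicit GPU loop below writes out.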
blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
m_nr, nz, m_nk, 1._rt,
m_invM.dataPtr(), m_nr,
G.dataPtr(G_icomp), m_nk, 0._rt,
F.dataPtr(F_icomp)+ngr, nrF);
#else
// On GPU, the explicit loop is significantly faster
// It is not clear whether the GPU gemm wasn't built properly, whether it is cycling data
// out of and back into the device, or whether gemm is launching its own threads.
amrex::Real const * invM_arr = m_invM.dataPtr();
amrex::Array4<const amrex::Real> const & G_arr = G.array();
amrex::Array4< amrex::Real> const & F_arr = F.array();
int const nk = m_nk;
amrex::ParallelFor(G_box,
[=] AMREX_GPU_DEVICE(int ir, int iz, int inotused) noexcept {
F_arr(ir,iz,F_icomp) = 0.;
for (int ik=0 ; ik < nk ; ik++) {
int const ii = ik + ir*nk;
F_arr(ir,iz,F_icomp) += invM_arr[ii]*G_arr(ik,iz,G_icomp);
}
});
#endif
}