aboutsummaryrefslogtreecommitdiff
path: root/Source/Parser/GpuParser.H
diff options
context:
space:
mode:
Diffstat (limited to 'Source/Parser/GpuParser.H')
-rw-r--r--Source/Parser/GpuParser.H116
1 files changed, 88 insertions, 28 deletions
diff --git a/Source/Parser/GpuParser.H b/Source/Parser/GpuParser.H
index c6d870800..65db03524 100644
--- a/Source/Parser/GpuParser.H
+++ b/Source/Parser/GpuParser.H
@@ -10,42 +10,36 @@
#include <WarpXParser.H>
#include <AMReX_Gpu.H>
+#include <AMReX_Array.H>
+#include <AMReX_TypeTraits.H>
// When compiled for CPU, wrap WarpXParser and enable threading.
// When compiled for GPU, store one copy of the parser in
// CUDA managed memory for __device__ code, and one copy of the parser
// in CUDA managed memory for __host__ code. This way, the parser can be
// efficiently called from both host and device.
+template <int N>
class GpuParser
{
public:
GpuParser (WarpXParser const& wp);
void clear ();
+ template <typename... Ts>
AMREX_GPU_HOST_DEVICE
- amrex::Real
- operator() (amrex::Real x, amrex::Real y, amrex::Real z, amrex::Real t=0.0) const noexcept
+ std::enable_if_t<sizeof...(Ts) == N
+ and amrex::Same<amrex::Real,Ts...>::value,
+ amrex::Real>
+ operator() (Ts... var) const noexcept
{
#ifdef AMREX_USE_GPU
-
-#ifdef AMREX_DEVICE_COMPILE
+ amrex::GpuArray<amrex::Real,N> l_var{var...};
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
// WarpX compiled for GPU, function compiled for __device__
- // the 3D position of each particle is stored in shared memory.
- amrex::Gpu::SharedMemory<amrex::Real> gsm;
- amrex::Real* p = gsm.dataPtr();
- int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y);
- p[tid*4] = x;
- p[tid*4+1] = y;
- p[tid*4+2] = z;
- p[tid*4+3] = t;
- return wp_ast_eval(m_gpu_parser.ast);
+ return wp_ast_eval(m_gpu_parser.ast, l_var.data());
#else
// WarpX compiled for GPU, function compiled for __host__
- m_var.x = x;
- m_var.y = y;
- m_var.z = z;
- m_t = t;
- return wp_ast_eval(m_cpu_parser.ast);
+ return wp_ast_eval(m_cpu_parser->ast, nullptr);
#endif
#else
@@ -55,11 +49,8 @@ public:
#else
int tid = 0;
#endif
- m_var[tid].x = x;
- m_var[tid].y = y;
- m_var[tid].z = z;
- m_t[tid] = t;
- return wp_ast_eval(m_parser[tid]->ast);
+ m_var[tid] = amrex::GpuArray<amrex::Real,N>{var...};
+ return wp_ast_eval(m_parser[tid]->ast, nullptr);
#endif
}
@@ -70,16 +61,85 @@ private:
// Copy of the parser running on __device__
struct wp_parser m_gpu_parser;
// Copy of the parser running on __host__
- struct wp_parser m_cpu_parser;
- mutable amrex::XDim3 m_var;
- mutable amrex::Real m_t;
+ struct wp_parser* m_cpu_parser;
+ mutable amrex::GpuArray<amrex::Real,N> m_var;
#else
// Only one parser
struct wp_parser** m_parser;
- mutable amrex::XDim3* m_var;
- mutable amrex::Real* m_t;
+ mutable amrex::GpuArray<amrex::Real,N>* m_var;
int nthreads;
#endif
};
+template <int N>
+GpuParser<N>::GpuParser (WarpXParser const& wp)
+{
+#ifdef AMREX_USE_GPU
+
+ struct wp_parser* a_wp = wp.m_parser;
+ // Initialize GPU parser: allocate memory in CUDA managed memory,
+ // copy all data needed on GPU to m_gpu_parser
+ m_gpu_parser.sz_mempool = wp_ast_size(a_wp->ast);
+ m_gpu_parser.p_root = (struct wp_node*)
+ amrex::The_Managed_Arena()->alloc(m_gpu_parser.sz_mempool);
+ m_gpu_parser.p_free = m_gpu_parser.p_root;
+ // 0: don't free the source
+ m_gpu_parser.ast = wp_parser_ast_dup(&m_gpu_parser, a_wp->ast, 0);
+ for (int i = 0; i < N; ++i) {
+ wp_parser_regvar_gpu(&m_gpu_parser, wp.m_varnames[i].c_str(), i);
+ }
+
+ // Initialize CPU parser:
+ m_cpu_parser = wp_parser_dup(a_wp);
+ for (int i = 0; i < N; ++i) {
+ wp_parser_regvar(m_cpu_parser, wp.m_varnames[i].c_str(), &m_var[i]);
+ }
+
+#else // not defined AMREX_USE_GPU
+
+#ifdef _OPENMP
+ nthreads = omp_get_max_threads();
+#else // _OPENMP
+ nthreads = 1;
+#endif // _OPENMP
+
+ m_parser = ::new struct wp_parser*[nthreads];
+ m_var = ::new amrex::GpuArray<amrex::Real,N>[nthreads];
+
+ for (int tid = 0; tid < nthreads; ++tid)
+ {
+#ifdef _OPENMP
+ m_parser[tid] = wp_parser_dup(wp.m_parser[tid]);
+ for (int i = 0; i < N; ++i) {
+ wp_parser_regvar(m_parser[tid], wp.m_varnames[tid][i].c_str(), &(m_var[tid][i]));
+ }
+#else // _OPENMP
+ m_parser[tid] = wp_parser_dup(wp.m_parser);
+ for (int i = 0; i < N; ++i) {
+ wp_parser_regvar(m_parser[tid], wp.m_varnames[i].c_str(), &(m_var[tid][i]));
+ }
+#endif // _OPENMP
+ }
+
+#endif // AMREX_USE_GPU
+}
+
+
+template <int N>
+void
+GpuParser<N>::clear ()
+{
+#ifdef AMREX_USE_GPU
+ amrex::The_Managed_Arena()->free(m_gpu_parser.ast);
+ wp_parser_delete(m_cpu_parser);
+#else
+ for (int tid = 0; tid < nthreads; ++tid)
+ {
+ wp_parser_delete(m_parser[tid]);
+ }
+ ::delete[] m_parser;
+ ::delete[] m_var;
+#endif
+}
+
#endif