diff options
Diffstat (limited to 'Source/Parser/GpuParser.H')
-rw-r--r-- | Source/Parser/GpuParser.H | 116 |
1 files changed, 88 insertions, 28 deletions
diff --git a/Source/Parser/GpuParser.H b/Source/Parser/GpuParser.H index c6d870800..65db03524 100644 --- a/Source/Parser/GpuParser.H +++ b/Source/Parser/GpuParser.H @@ -10,42 +10,36 @@ #include <WarpXParser.H> #include <AMReX_Gpu.H> +#include <AMReX_Array.H> +#include <AMReX_TypeTraits.H> // When compiled for CPU, wrap WarpXParser and enable threading. // When compiled for GPU, store one copy of the parser in // CUDA managed memory for __device__ code, and one copy of the parser // in CUDA managed memory for __host__ code. This way, the parser can be // efficiently called from both host and device. +template <int N> class GpuParser { public: GpuParser (WarpXParser const& wp); void clear (); + template <typename... Ts> AMREX_GPU_HOST_DEVICE - amrex::Real - operator() (amrex::Real x, amrex::Real y, amrex::Real z, amrex::Real t=0.0) const noexcept + std::enable_if_t<sizeof...(Ts) == N + and amrex::Same<amrex::Real,Ts...>::value, + amrex::Real> + operator() (Ts... var) const noexcept { #ifdef AMREX_USE_GPU - -#ifdef AMREX_DEVICE_COMPILE + amrex::GpuArray<amrex::Real,N> l_var{var...}; +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) // WarpX compiled for GPU, function compiled for __device__ - // the 3D position of each particle is stored in shared memory. - amrex::Gpu::SharedMemory<amrex::Real> gsm; - amrex::Real* p = gsm.dataPtr(); - int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y); - p[tid*4] = x; - p[tid*4+1] = y; - p[tid*4+2] = z; - p[tid*4+3] = t; - return wp_ast_eval(m_gpu_parser.ast); + return wp_ast_eval(m_gpu_parser.ast, l_var.data()); #else // WarpX compiled for GPU, function compiled for __host__ - m_var.x = x; - m_var.y = y; - m_var.z = z; - m_t = t; - return wp_ast_eval(m_cpu_parser.ast); + return wp_ast_eval(m_cpu_parser->ast, nullptr); #endif #else @@ -55,11 +49,8 @@ public: #else int tid = 0; #endif - m_var[tid].x = x; - m_var[tid].y = y; - m_var[tid].z = z; - m_t[tid] = t; - return wp_ast_eval(m_parser[tid]->ast); + m_var[tid] = amrex::GpuArray<amrex::Real,N>{var...}; + return wp_ast_eval(m_parser[tid]->ast, nullptr); #endif } @@ -70,16 +61,85 @@ private: // Copy of the parser running on __device__ struct wp_parser m_gpu_parser; // Copy of the parser running on __host__ - struct wp_parser m_cpu_parser; - mutable amrex::XDim3 m_var; - mutable amrex::Real m_t; + struct wp_parser* m_cpu_parser; + mutable amrex::GpuArray<amrex::Real,N> m_var; #else // Only one parser struct wp_parser** m_parser; - mutable amrex::XDim3* m_var; - mutable amrex::Real* m_t; + mutable amrex::GpuArray<amrex::Real,N>* m_var; int nthreads; #endif }; +template <int N> +GpuParser<N>::GpuParser (WarpXParser const& wp) +{ +#ifdef AMREX_USE_GPU + + struct wp_parser* a_wp = wp.m_parser; + // Initialize GPU parser: allocate memory in CUDA managed memory, + // copy all data needed on GPU to m_gpu_parser + m_gpu_parser.sz_mempool = wp_ast_size(a_wp->ast); + m_gpu_parser.p_root = (struct wp_node*) + amrex::The_Managed_Arena()->alloc(m_gpu_parser.sz_mempool); + m_gpu_parser.p_free = m_gpu_parser.p_root; + // 0: don't free the source + m_gpu_parser.ast = wp_parser_ast_dup(&m_gpu_parser, a_wp->ast, 0); + for (int i = 0; i < N; ++i) { + wp_parser_regvar_gpu(&m_gpu_parser, wp.m_varnames[i].c_str(), i); + } + + // Initialize CPU parser: + m_cpu_parser = wp_parser_dup(a_wp); + for (int i = 0; i < N; ++i) { + wp_parser_regvar(m_cpu_parser, wp.m_varnames[i].c_str(), &m_var[i]); + } + +#else // not defined AMREX_USE_GPU + +#ifdef _OPENMP + nthreads = omp_get_max_threads(); +#else // _OPENMP + nthreads = 1; +#endif // _OPENMP + + m_parser = ::new struct wp_parser*[nthreads]; + m_var = ::new amrex::GpuArray<amrex::Real,N>[nthreads]; + + for (int tid = 0; tid < nthreads; ++tid) + { +#ifdef _OPENMP + m_parser[tid] = wp_parser_dup(wp.m_parser[tid]); + for (int i = 0; i < N; ++i) { + wp_parser_regvar(m_parser[tid], wp.m_varnames[tid][i].c_str(), &(m_var[tid][i])); + } +#else // _OPENMP + m_parser[tid] = wp_parser_dup(wp.m_parser); + for (int i = 0; i < N; ++i) { + wp_parser_regvar(m_parser[tid], wp.m_varnames[i].c_str(), &(m_var[tid][i])); + } +#endif // _OPENMP + } + +#endif // AMREX_USE_GPU +} + + +template <int N> +void +GpuParser<N>::clear () +{ +#ifdef AMREX_USE_GPU + amrex::The_Managed_Arena()->free(m_gpu_parser.ast); + wp_parser_delete(m_cpu_parser); +#else + for (int tid = 0; tid < nthreads; ++tid) + { + wp_parser_delete(m_parser[tid]); + } + ::delete[] m_parser; + ::delete[] m_var; +#endif +} + #endif |