8 files changed, 126 insertions, 157 deletions
diff --git a/Source/Parser/GpuParser.H b/Source/Parser/GpuParser.H
index c6d870800..65db03524 100644
--- a/Source/Parser/GpuParser.H
+++ b/Source/Parser/GpuParser.H
@@ -10,42 +10,36 @@
 
 #include <WarpXParser.H>
 #include <AMReX_Gpu.H>
+#include <AMReX_Array.H>
+#include <AMReX_TypeTraits.H>
 
 // When compiled for CPU, wrap WarpXParser and enable threading.
 // When compiled for GPU, store one copy of the parser in
 // CUDA managed memory for __device__ code, and one copy of the parser
 // in CUDA managed memory for __host__ code. This way, the parser can be
 // efficiently called from both host and device.
+template <int N>
 class GpuParser
 {
 public:
     GpuParser (WarpXParser const& wp);
     void clear ();
 
+    template <typename... Ts>
     AMREX_GPU_HOST_DEVICE
-    amrex::Real
-    operator() (amrex::Real x, amrex::Real y, amrex::Real z, amrex::Real t=0.0) const noexcept
+    std::enable_if_t<sizeof...(Ts) == N
+                     and amrex::Same<amrex::Real,Ts...>::value,
+                     amrex::Real>
+    operator() (Ts... var) const noexcept
     {
 #ifdef AMREX_USE_GPU
-
-#ifdef AMREX_DEVICE_COMPILE
+        amrex::GpuArray<amrex::Real,N> l_var{var...};
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
 // WarpX compiled for GPU, function compiled for __device__
-        // the 3D position of each particle is stored in shared memory.
-        amrex::Gpu::SharedMemory<amrex::Real> gsm;
-        amrex::Real* p = gsm.dataPtr();
-        int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y);
-        p[tid*4] = x;
-        p[tid*4+1] = y;
-        p[tid*4+2] = z;
-        p[tid*4+3] = t;
-        return wp_ast_eval(m_gpu_parser.ast);
+        return wp_ast_eval(m_gpu_parser.ast, l_var.data());
 #else
 // WarpX compiled for GPU, function compiled for __host__
-        m_var.x = x;
-        m_var.y = y;
-        m_var.z = z;
-        m_t = t;
-        return wp_ast_eval(m_cpu_parser.ast);
+        return wp_ast_eval(m_cpu_parser->ast, nullptr);
 #endif
 
 #else
@@ -55,11 +49,8 @@ public:
 #else
         int tid = 0;
 #endif
-        m_var[tid].x = x;
-        m_var[tid].y = y;
-        m_var[tid].z = z;
-        m_t[tid] = t;
-        return wp_ast_eval(m_parser[tid]->ast);
+        m_var[tid] = amrex::GpuArray<amrex::Real,N>{var...};
+        return wp_ast_eval(m_parser[tid]->ast, nullptr);
 #endif
     }
 
@@ -70,16 +61,85 @@ private:
     // Copy of the parser running on __device__
     struct wp_parser m_gpu_parser;
     // Copy of the parser running on __host__
-    struct wp_parser m_cpu_parser;
-    mutable amrex::XDim3 m_var;
-    mutable amrex::Real m_t;
+    struct wp_parser* m_cpu_parser;
+    mutable amrex::GpuArray<amrex::Real,N> m_var;
 #else
     // Only one parser
     struct wp_parser** m_parser;
-    mutable amrex::XDim3* m_var;
-    mutable amrex::Real* m_t;
+    mutable amrex::GpuArray<amrex::Real,N>* m_var;
     int nthreads;
 #endif
 };
 
+template <int N>
+GpuParser<N>::GpuParser (WarpXParser const& wp)
+{
+#ifdef AMREX_USE_GPU
+
+    struct wp_parser* a_wp = wp.m_parser;
+    // Initialize GPU parser: allocate memory in CUDA managed memory,
+    // copy all data needed on GPU to m_gpu_parser
+    m_gpu_parser.sz_mempool = wp_ast_size(a_wp->ast);
+    m_gpu_parser.p_root = (struct wp_node*)
+        amrex::The_Managed_Arena()->alloc(m_gpu_parser.sz_mempool);
+    m_gpu_parser.p_free = m_gpu_parser.p_root;
+    // 0: don't free the source
+    m_gpu_parser.ast = wp_parser_ast_dup(&m_gpu_parser, a_wp->ast, 0);
+    for (int i = 0; i < N; ++i) {
+        wp_parser_regvar_gpu(&m_gpu_parser, wp.m_varnames[i].c_str(), i);
+    }
+
+    // Initialize CPU parser:
+    m_cpu_parser = wp_parser_dup(a_wp);
+    for (int i = 0; i < N; ++i) {
+        wp_parser_regvar(m_cpu_parser, wp.m_varnames[i].c_str(), &m_var[i]);
+    }
+
+#else // not defined AMREX_USE_GPU
+
+#ifdef _OPENMP
+    nthreads = omp_get_max_threads();
+#else // _OPENMP
+    nthreads = 1;
+#endif // _OPENMP
+
+    m_parser = ::new struct wp_parser*[nthreads];
+    m_var = ::new amrex::GpuArray<amrex::Real,N>[nthreads];
+
+    for (int tid = 0; tid < nthreads; ++tid)
+    {
+#ifdef _OPENMP
+        m_parser[tid] = wp_parser_dup(wp.m_parser[tid]);
+        for (int i = 0; i < N; ++i) {
+            wp_parser_regvar(m_parser[tid], wp.m_varnames[tid][i].c_str(), &(m_var[tid][i]));
+        }
+#else // _OPENMP
+        m_parser[tid] = wp_parser_dup(wp.m_parser);
+        for (int i = 0; i < N; ++i) {
+            wp_parser_regvar(m_parser[tid], wp.m_varnames[i].c_str(), &(m_var[tid][i]));
+        }
+#endif // _OPENMP
+    }
+
+#endif // AMREX_USE_GPU
+}
+
+
+template <int N>
+void
+GpuParser<N>::clear ()
+{
+#ifdef AMREX_USE_GPU
+    amrex::The_Managed_Arena()->free(m_gpu_parser.ast);
+    wp_parser_delete(m_cpu_parser);
+#else
+    for (int tid = 0; tid < nthreads; ++tid)
+    {
+        wp_parser_delete(m_parser[tid]);
+    }
+    ::delete[] m_parser;
+    ::delete[] m_var;
+#endif
+}
+
 #endif
diff --git a/Source/Parser/GpuParser.cpp b/Source/Parser/GpuParser.cpp
deleted file mode 100644
index 22fab6313..000000000
--- a/Source/Parser/GpuParser.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright 2019-2020 Maxence Thevenet, Revathi Jambunathan, Weiqun Zhang
- *
- *
- * This file is part of WarpX.
- *
- * License: BSD-3-Clause-LBNL
- */
-#include <GpuParser.H>
-
-GpuParser::GpuParser (WarpXParser const& wp)
-{
-#ifdef AMREX_USE_GPU
-
-    struct wp_parser* a_wp = wp.m_parser;
-    // Initialize GPU parser: allocate memory in CUDA managed memory,
-    // copy all data needed on GPU to m_gpu_parser
-    m_gpu_parser.sz_mempool = wp_ast_size(a_wp->ast);
-    m_gpu_parser.p_root = (struct wp_node*)
-        amrex::The_Managed_Arena()->alloc(m_gpu_parser.sz_mempool);
-    m_gpu_parser.p_free = m_gpu_parser.p_root;
-    // 0: don't free the source
-    m_gpu_parser.ast = wp_parser_ast_dup(&m_gpu_parser, a_wp->ast, 0);
-    wp_parser_regvar_gpu(&m_gpu_parser, "x", 0);
-    wp_parser_regvar_gpu(&m_gpu_parser, "y", 1);
-    wp_parser_regvar_gpu(&m_gpu_parser, "z", 2);
-    wp_parser_regvar_gpu(&m_gpu_parser, "t", 3);
-
-    // Initialize CPU parser: allocate memory in CUDA managed memory,
-    // copy all data needed on CPU to m_cpu_parser
-    m_cpu_parser.sz_mempool = wp_ast_size(a_wp->ast);
-    m_cpu_parser.p_root = (struct wp_node*)
-        amrex::The_Managed_Arena()->alloc(m_cpu_parser.sz_mempool);
-    m_cpu_parser.p_free = m_cpu_parser.p_root;
-    // 0: don't free the source
-    m_cpu_parser.ast = wp_parser_ast_dup(&m_cpu_parser, a_wp->ast, 0);
-    wp_parser_regvar(&m_cpu_parser, "x", &(m_var.x));
-    wp_parser_regvar(&m_cpu_parser, "y", &(m_var.y));
-    wp_parser_regvar(&m_cpu_parser, "z", &(m_var.z));
-    wp_parser_regvar(&m_cpu_parser, "t", &(m_t));
-
-#else // not defined AMREX_USE_GPU
-
-#ifdef _OPENMP
-    nthreads = omp_get_max_threads();
-#else // _OPENMP
-    nthreads = 1;
-#endif // _OPENMP
-
-    m_parser = ::new struct wp_parser*[nthreads];
-    m_var = ::new amrex::XDim3[nthreads];
-    m_t = ::new amrex::Real[nthreads];
-
-    for (int tid = 0; tid < nthreads; ++tid)
-    {
-#ifdef _OPENMP
-        m_parser[tid] = wp_parser_dup(wp.m_parser[tid]);
-#else // _OPENMP
-        m_parser[tid] = wp_parser_dup(wp.m_parser);
-#endif // _OPENMP
-        wp_parser_regvar(m_parser[tid], "x", &(m_var[tid].x));
-        wp_parser_regvar(m_parser[tid], "y", &(m_var[tid].y));
-        wp_parser_regvar(m_parser[tid], "z", &(m_var[tid].z));
-        wp_parser_regvar(m_parser[tid], "t", &(m_t[tid]));
-    }
-
-#endif // AMREX_USE_GPU
-}
-
-void
-GpuParser::clear ()
-{
-#ifdef AMREX_USE_GPU
-    amrex::The_Managed_Arena()->free(m_gpu_parser.ast);
-    amrex::The_Managed_Arena()->free(m_cpu_parser.ast);
-#else
-    for (int tid = 0; tid < nthreads; ++tid)
-    {
-        wp_parser_delete(m_parser[tid]);
-    }
-    ::delete[] m_parser;
-    ::delete[] m_var;
-#endif
-}
-
diff --git a/Source/Parser/Make.package b/Source/Parser/Make.package
index 15115c138..be07e3a7d 100644
--- a/Source/Parser/Make.package
+++ b/Source/Parser/Make.package
@@ -4,7 +4,6 @@ cEXE_headers += wp_parser_y.h wp_parser.tab.h wp_parser.lex.h wp_parser_c.h
 CEXE_sources += WarpXParser.cpp
 CEXE_headers += WarpXParser.H
 CEXE_headers += GpuParser.H
-CEXE_sources += GpuParser.cpp
 CEXE_headers += WarpXParserWrapper.H
 
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Parser
diff --git a/Source/Parser/WarpXParser.H b/Source/Parser/WarpXParser.H
index 863b35fb8..703b1effc 100644
--- a/Source/Parser/WarpXParser.H
+++ b/Source/Parser/WarpXParser.H
@@ -21,7 +21,7 @@
 #include <omp.h>
 #endif
 
-class GpuParser;
+template <int N> class GpuParser;
 
 class WarpXParser
 {
@@ -56,7 +56,7 @@ public:
 
     std::set<std::string> symbols () const;
 
-    friend class GpuParser;
+    template <int N> friend class GpuParser;
 
 private:
     void clear ();
@@ -71,9 +71,11 @@ private:
 #ifdef _OPENMP
     std::vector<struct wp_parser*> m_parser;
     mutable std::vector<std::array<amrex::Real,16> > m_variables;
+    mutable std::vector<std::vector<std::string> > m_varnames;
 #else
     struct wp_parser* m_parser = nullptr;
     mutable std::array<amrex::Real,16> m_variables;
+    mutable std::vector<std::string> m_varnames;
 #endif
 };
 
@@ -82,9 +84,9 @@ amrex::Real
 WarpXParser::eval () const noexcept
 {
 #ifdef _OPENMP
-    return wp_ast_eval(m_parser[omp_get_thread_num()]->ast);
+    return wp_ast_eval(m_parser[omp_get_thread_num()]->ast,nullptr);
 #else
-    return wp_ast_eval(m_parser->ast);
+    return wp_ast_eval(m_parser->ast,nullptr);
 #endif
 }
 
diff --git a/Source/Parser/WarpXParser.cpp b/Source/Parser/WarpXParser.cpp
index 8c8be7ecb..dd000792b 100644
--- a/Source/Parser/WarpXParser.cpp
+++ b/Source/Parser/WarpXParser.cpp
@@ -27,6 +27,7 @@ WarpXParser::define (std::string const& func_body)
 
     int nthreads = omp_get_max_threads();
     m_variables.resize(nthreads);
+    m_varnames.resize(nthreads);
     m_parser.resize(nthreads);
     m_parser[0] = wp_c_parser_new(f.c_str());
 #pragma omp parallel
@@ -53,6 +54,7 @@ void
 WarpXParser::clear ()
 {
     m_expression.clear();
+    m_varnames.clear();
 
 #ifdef _OPENMP
 
@@ -80,8 +82,10 @@ WarpXParser::registerVariable (std::string const& name, amrex::Real& var)
     // We assume this is called inside OMP parallel region
 #ifdef _OPENMP
     wp_parser_regvar(m_parser[omp_get_thread_num()], name.c_str(), &var);
+    m_varnames[omp_get_thread_num()].push_back(name);
 #else
     wp_parser_regvar(m_parser, name.c_str(), &var);
+    m_varnames.push_back(name);
 #endif
 }
 
@@ -98,6 +102,7 @@ WarpXParser::registerVariables (std::vector<std::string> const& names)
         auto& v = m_variables[tid];
         for (int j = 0; j < names.size(); ++j) {
             wp_parser_regvar(p, names[j].c_str(), &(v[j]));
+            m_varnames[tid].push_back(names[j]);
         }
     }
 
@@ -105,6 +110,7 @@ WarpXParser::registerVariables (std::vector<std::string> const& names)
 
     for (int j = 0; j < names.size(); ++j) {
         wp_parser_regvar(m_parser, names[j].c_str(), &(m_variables[j]));
+        m_varnames.push_back(names[j]);
     }
 
 #endif
diff --git a/Source/Parser/WarpXParserWrapper.H b/Source/Parser/WarpXParserWrapper.H
index 2c76d97a3..38147aba5 100644
--- a/Source/Parser/WarpXParserWrapper.H
+++ b/Source/Parser/WarpXParserWrapper.H
@@ -18,24 +18,16 @@
  *  in a safe way. The ParserWrapper struct is used to avoid memory leak
  *  in the EB parser functions.
  */
+template <int N>
 struct ParserWrapper
-    : public amrex::Gpu::Managed
+    : public amrex::Gpu::Managed, public GpuParser<N>
 {
-    ParserWrapper (WarpXParser const& a_parser) noexcept
-       : m_parser(a_parser) {}
+    using GpuParser<N>::GpuParser;
 
-    ~ParserWrapper() {
-       m_parser.clear();
-    }
+    ParserWrapper (ParserWrapper<N> const&) = delete;
+    void operator= (ParserWrapper<N> const&) = delete;
 
-    AMREX_GPU_HOST_DEVICE
-    amrex::Real
-    getField (amrex::Real x, amrex::Real y, amrex::Real z, amrex::Real t=0.0) const noexcept
-    {
-        return m_parser(x,y,z,t);
-    }
-
-    GpuParser m_parser;
+    ~ParserWrapper() { GpuParser<N>::clear(); }
 };
 
 #endif
diff --git a/Source/Parser/wp_parser_c.h b/Source/Parser/wp_parser_c.h
index 2cf0e2c00..c9c0d82ac 100644
--- a/Source/Parser/wp_parser_c.h
+++ b/Source/Parser/wp_parser_c.h
@@ -23,16 +23,10 @@ extern "C" {
 
 AMREX_GPU_HOST_DEVICE
 inline amrex_real
-wp_ast_eval (struct wp_node* node)
+wp_ast_eval (struct wp_node* node, amrex_real const* x)
 {
     amrex_real result;
 
-#ifdef AMREX_DEVICE_COMPILE
-    extern __shared__ amrex_real extern_xyz[];
-    int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y);
-    amrex_real* x = extern_xyz + tid*4; // parser assumes 4 independent variables (x,y,z,t)
-#endif
-
     switch (node->type)
     {
     case WP_NUMBER:
@@ -42,7 +36,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_SYMBOL:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i =((struct wp_symbol*)node)->ip.i;
         result = x[i];
 #else
@@ -52,45 +46,45 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_ADD:
     {
-        result = wp_ast_eval(node->l) + wp_ast_eval(node->r);
+        result = wp_ast_eval(node->l,x) + wp_ast_eval(node->r,x);
         break;
     }
     case WP_SUB:
     {
-        result = wp_ast_eval(node->l) - wp_ast_eval(node->r);
+        result = wp_ast_eval(node->l,x) - wp_ast_eval(node->r,x);
         break;
     }
     case WP_MUL:
     {
-        result = wp_ast_eval(node->l) * wp_ast_eval(node->r);
+        result = wp_ast_eval(node->l,x) * wp_ast_eval(node->r,x);
         break;
     }
     case WP_DIV:
     {
-        result = wp_ast_eval(node->l) / wp_ast_eval(node->r);
+        result = wp_ast_eval(node->l,x) / wp_ast_eval(node->r,x);
         break;
     }
     case WP_NEG:
     {
-        result = -wp_ast_eval(node->l);
+        result = -wp_ast_eval(node->l,x);
         break;
     }
     case WP_F1:
     {
         result = wp_call_f1(((struct wp_f1*)node)->ftype,
-                wp_ast_eval(((struct wp_f1*)node)->l));
+                wp_ast_eval(((struct wp_f1*)node)->l,x));
         break;
     }
     case WP_F2:
     {
         result = wp_call_f2(((struct wp_f2*)node)->ftype,
-                wp_ast_eval(((struct wp_f2*)node)->l),
-                wp_ast_eval(((struct wp_f2*)node)->r));
+                wp_ast_eval(((struct wp_f2*)node)->l,x),
+                wp_ast_eval(((struct wp_f2*)node)->r,x));
         break;
     }
     case WP_ADD_VP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->rip.i;
         result = node->lvp.v + x[i];
 #else
@@ -100,7 +94,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_ADD_PP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->lvp.ip.i;
         int j = node->rip.i;
         result = x[i] + x[j];
@@ -111,7 +105,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_SUB_VP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->rip.i;
         result = node->lvp.v - x[i];
 #else
@@ -121,7 +115,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_SUB_PP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->lvp.ip.i;
         int j = node->rip.i;
         result = x[i] - x[j];
@@ -132,7 +126,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_MUL_VP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->rip.i;
         result = node->lvp.v * x[i];
 #else
@@ -142,7 +136,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_MUL_PP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->lvp.ip.i;
         int j = node->rip.i;
         result = x[i] * x[j];
@@ -153,7 +147,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_DIV_VP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->rip.i;
         result = node->lvp.v / x[i];
 #else
@@ -163,7 +157,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_DIV_PP:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->lvp.ip.i;
         int j = node->rip.i;
         result = x[i] / x[j];
@@ -174,7 +168,7 @@ wp_ast_eval (struct wp_node* node)
     }
     case WP_NEG_P:
     {
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
         int i = node->rip.i;
         result = -x[i];
 #else
diff --git a/Source/Parser/wp_parser_y.c b/Source/Parser/wp_parser_y.c
index b71b42638..57293ab87 100644
--- a/Source/Parser/wp_parser_y.c
+++ b/Source/Parser/wp_parser_y.c
@@ -80,7 +80,7 @@ yyerror (char const *s, ...)
 {
     va_list vl;
     va_start(vl, s);
-#ifdef AMREX_DEVICE_COMPILE
+#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
     printf(s,"\n");
     assert(0);
 #else