86 files changed, 3982 insertions, 2765 deletions
diff --git a/.gitignore b/.gitignore
index 13147b8da..1e9fbc336 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,3 @@
-.DS_Store
-plt*
-chk*
 main?d.*.ex
 d/
 f/
@@ -11,3 +8,32 @@ Backtrace.*
 *.vth
 *.sw[opqrs]
 
+####################
+# AMReX data files #
+####################
+plt*
+chk*
+
+##########
+# Python #
+##########
+*.pyc
+__pycache__
+
+#######
+# IDE #
+#######
+.idea/
+cmake-build-*/
+.kdev?/
+*.kdev?
+
+# File-based project format:
+*.iws
+
+######
+# OS #
+######
+.DS_Store
+.AppleDouble
+.LSOverride
diff --git a/Docs/source/building/cori.rst b/Docs/source/building/cori.rst
index ef07aa893..2d8537ae2 100644
--- a/Docs/source/building/cori.rst
+++ b/Docs/source/building/cori.rst
@@ -95,8 +95,10 @@ First, load the appropriate modules:
 
     module swap craype-haswell craype-mic-knl
     module swap PrgEnv-intel PrgEnv-gnu
-    module load cmake/3.11.4
+    module load cmake/3.14.4
     module load cray-hdf5-parallel
+    module load adios/1.13.1 zlib
+    export CRAYPE_LINK_TYPE=dynamic
 
 Then, in the `warpx_directory`, download and build the openPMD API:
 
@@ -105,7 +107,7 @@ Then, in the `warpx_directory`, download and build the openPMD API:
     git clone https://github.com/openPMD/openPMD-api.git
     mkdir openPMD-api-build
     cd openPMD-api-build
-    cmake ../openPMD-api -DopenPMD_USE_PYTHON=OFF -DopenPMD_USE_JSON=OFF -DCMAKE_INSTALL_PREFIX=../openPMD-install/ -DBUILD_SHARED_LIBS=OFF
+    cmake ../openPMD-api -DopenPMD_USE_PYTHON=OFF -DCMAKE_INSTALL_PREFIX=../openPMD-install/ -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON -DCMAKE_INSTALL_RPATH='$ORIGIN'
     cmake --build . --target install
 
 Finally, compile WarpX:
@@ -113,6 +115,7 @@ Finally, compile WarpX:
 ::
 
     cd ../WarpX
-    export OPENPMD_LIB_PATH=../openPMD-install/lib64
-    export OPENPMD_INCLUDE_PATH=../openPMD-install/include
+    export PKG_CONFIG_PATH=$PWD/../openPMD-install/lib64/pkgconfig:$PKG_CONFIG_PATH
     make -j 16 COMP=gnu USE_OPENPMD=TRUE
+
+In order to run WarpX, load the same modules again.
diff --git a/Docs/source/building/openpmd.rst b/Docs/source/building/openpmd.rst
index 053769f67..80edf5c85 100644
--- a/Docs/source/building/openpmd.rst
+++ b/Docs/source/building/openpmd.rst
@@ -7,7 +7,7 @@ therefore we recommend to use `spack <https://
 spack.io>`__ in order to facilitate the installation.
 
 More specifically, we recommend that you try installing the
-`openPMD-api library <https://openpmd-api.readthedocs.io/en/0.8.0-alpha/>`__
+`openPMD-api library 0.9.0a or newer <https://openpmd-api.readthedocs.io/en/0.9.0-alpha/>`__
 using spack (first section below). If this fails, a back-up solution
 is to install parallel HDF5 with spack, and then install the openPMD-api
 library from source.
@@ -30,14 +30,13 @@ First, install the openPMD-api library:
 
 ::
 
-    spack install openpmd-api -shared -json -python ^hdf5+mpi ^openmpi
+    spack install openpmd-api -python +adios1
 
 Then, ``cd`` into the ``WarpX`` folder, and type:
 
 ::
 
-    spack load openmpi
-    spack load hdf5
+    spack load mpi
     spack load openpmd-api
     make -j 4 USE_OPENPMD=TRUE
 
@@ -45,8 +44,7 @@ You will also need to load the same spack environment when running WarpX, for in
 
 ::
 
-    spack load openmpi
-    spack load hdf5
+    spack load mpi
     spack load openpmd-api
 
     mpirun -np 4 ./warpx.exe inputs
@@ -58,9 +56,10 @@ First, install the openPMD-api library, and load it in your environment:
 
 ::
 
-    spack install hdf5+mpi ^openmpi
-    spack load openmpi
-    spack load hdf5
+    spack install hdf5
+    spack install adios
+    spack load -r hdf5
+    spack load -r adios
 
 Then, in the `warpx_directory`, download and build the openPMD API:
 
@@ -69,7 +68,7 @@ Then, in the `warpx_directory`, download and build the openPMD API:
     git clone https://github.com/openPMD/openPMD-api.git
     mkdir openPMD-api-build
     cd openPMD-api-build
-    cmake ../openPMD-api -DopenPMD_USE_PYTHON=OFF -DopenPMD_USE_JSON=OFF -DCMAKE_INSTALL_PREFIX=../openPMD-install/ -DBUILD_SHARED_LIBS=OFF
+    cmake ../openPMD-api -DopenPMD_USE_PYTHON=OFF -DCMAKE_INSTALL_PREFIX=../openPMD-install/ -DCMAKE_INSTALL_RPATH_USE_LINK_PATH=ON -DCMAKE_INSTALL_RPATH='$ORIGIN'
     cmake --build . --target install
 
 Finally, compile WarpX:
@@ -77,8 +76,7 @@ Finally, compile WarpX:
 ::
 
     cd ../WarpX
-    export OPENPMD_LIB_PATH=../openPMD-install/lib
-    export OPENPMD_INCLUDE_PATH=../openPMD-install/include
+    export PKG_CONFIG_PATH=$PWD/../openPMD-install/lib/pkgconfig:$PKG_CONFIG_PATH
     make -j 4 USE_OPENPMD=TRUE
 
 You will also need to load the same spack environment when running WarpX, for instance:
@@ -87,5 +85,6 @@ You will also need to load the same spack environment when running WarpX, for in
 
     spack load openmpi
     spack load hdf5
+    spack load adios
 
     mpirun -np 4 ./warpx.exe inputs
diff --git a/Docs/source/building/summit.rst b/Docs/source/building/summit.rst
index c1061f29e..88588eb72 100644
--- a/Docs/source/building/summit.rst
+++ b/Docs/source/building/summit.rst
@@ -39,7 +39,9 @@ the following text (replace bracketed variables):
 
     omp=1
     export OMP_NUM_THREADS=${omp}
-    jsrun -n <numberOfNodes> -a 6 -g 6 -c 6 --bind=packed:${omp} --smpiargs="-gpu" <warpxExecutable> <inputScript>
+    num_nodes=$(( $(printf '%s\n' ${LSB_HOSTS} | sort -u | wc -l) - 1 ))
+
+    jsrun -n ${num_nodes} -a 6 -g 6 -c 6 --bind=packed:${omp} --smpiargs="-gpu" <warpxExecutable> <inputScript>
 
 
 Then run
diff --git a/Docs/source/conf.py b/Docs/source/conf.py
index 54e533469..7f8ad0785 100644
--- a/Docs/source/conf.py
+++ b/Docs/source/conf.py
@@ -58,7 +58,7 @@ author = 'WarpX collaboration'
 # built documents.
 #
 # The short X.Y version.
-version = '19.05'
+version = '19.08'
 # The full version, including alpha/beta/rc tags.
 release = ''
 
diff --git a/Docs/source/latex_theory/allbibs.bib b/Docs/source/latex_theory/allbibs.bib
index f5d344491..383b785aa 100644
--- a/Docs/source/latex_theory/allbibs.bib
+++ b/Docs/source/latex_theory/allbibs.bib
@@ -461,8 +461,8 @@ year = {2001}
 }
 @misc{Huebl2015,
 author = {Huebl, Axel and Lehe, Remi and Vay, Jean-Luc and Grote, David P. and Sbalzarini, Ivo and Kuschel, Stephan and Bussmann, Michael},
-title = {{The OpenPMD standard 1.0.0}},
-url = {http://dx.doi.org/10.5281/zenodo.33624},
+title = {{openPMD: A meta data standard for particle and mesh based data.}},
+url = {http://dx.doi.org/10.5281/zenodo.591699},
 year = {2015}
 }
 @article{Vayarxiv10_1,
diff --git a/Docs/source/running_cpp/parameters.rst b/Docs/source/running_cpp/parameters.rst
index 7b2d2c0e6..d411cc0a1 100644
--- a/Docs/source/running_cpp/parameters.rst
+++ b/Docs/source/running_cpp/parameters.rst
@@ -219,6 +219,14 @@ Particle initialization
       ``electrons.density_function(x,y,z) = "n0+n0*x**2*1.e12"`` where ``n0`` is a
       user-defined constant, see above.
 
+* ``<species_name>.density_min`` (`float`) optional (default `0.`)
+    Minimum plasma density. No particle is injected where the density is below
+    this value.
+
+* ``<species_name>.density_max`` (`float`) optional (default `infinity`)
+    Maximum plasma density. The density at each point is the minimum between
+    the value given in the profile, and `density_max`.
+
 * ``<species_name>.radially_weighted`` (`bool`) optional (default `true`)
     Whether particle's weight is varied with their radius. This only applies to cylindrical geometry.
     The only valid value is true.
@@ -293,7 +301,7 @@ Particle initialization
       following parameters, in this order: :math:`L_{ramp,up}` :math:`L_{plateau}`
       :math:`L_{ramp,down}` :math:`R_c` :math:`n_0`
 
-* ``<species_name>.do_backward_injection`` (`bool`)
+* ``<species_name>.do_backward_propagation`` (`bool`)
     Inject a backward-propagating beam to reduce the effect of charge-separation
     fields when running in the boosted frame. See examples.
 
@@ -653,6 +661,12 @@ Boundary conditions
     The characteristic depth, in number of cells, over which
     the absorption coefficients of the PML increases.
 
+* ``warpx.do_pml_Lo`` (`2 ints in 2D`, `3 ints in 3D`; default: `1 1 1`)
+    The directions along which one wants a pml boundary condition for lower boundaries on mother grid.
+
+* ``warpx.do_pml_Hi`` (`2 floats in 2D`, `3 floats in 3D`; default: `1 1 1`)
+    The directions along which one wants a pml boundary condition for upper boundaries on mother grid.
+
 Diagnostics and output
 ----------------------
 
@@ -689,6 +703,13 @@ Diagnostics and output
     The time interval inbetween the lab-frame snapshots (where this
     time interval is expressed in the laboratory frame).
 
+* ``warpx.dz_snapshots_lab`` (`float`, in meters)
+    Only used when ``warpx.do_boosted_frame_diagnostic`` is ``1``.
+    Distance between the lab-frame snapshots (expressed in the laboratory
+    frame). ``dt_snapshots_lab`` is then computed by
+    ``dt_snapshots_lab = dz_snapshots_lab/c``. Either `dt_snapshots_lab`
+    or `dz_snapshot_lab` is required.
+
 * ``warpx.do_boosted_frame_fields`` (`0 or 1`)
     Whether to use the **back-transformed diagnostics** for the fields.
 
diff --git a/Examples/Modules/boosted_diags/inputs.2d b/Examples/Modules/boosted_diags/inputs.2d
index 528eb6cd9..6afe6977d 100644
--- a/Examples/Modules/boosted_diags/inputs.2d
+++ b/Examples/Modules/boosted_diags/inputs.2d
@@ -45,8 +45,8 @@ warpx.boost_direction = z
 
 # Diagnostics
 warpx.do_boosted_frame_diagnostic = 1
-warpx.num_snapshots_lab = 20;
-warpx.dt_snapshots_lab = 7.0e-14;
+warpx.num_snapshots_lab = 20
+warpx.dt_snapshots_lab = 7.0e-14
 
 # Species
 particles.nspecies = 2
diff --git a/Examples/Physics_applications/laser_acceleration/inputs.2d.boost b/Examples/Physics_applications/laser_acceleration/inputs.2d.boost
index a6d45426a..d90c75ada 100644
--- a/Examples/Physics_applications/laser_acceleration/inputs.2d.boost
+++ b/Examples/Physics_applications/laser_acceleration/inputs.2d.boost
@@ -19,7 +19,7 @@ geometry.prob_hi     =  128.e-6     0.96e-6
 #################################
 warpx.verbose = 1
 amrex.v = 1
-algo.current_deposition = direct
+algo.current_deposition = esirkepov
 algo.charge_deposition = standard
 algo.field_gathering = standard
 algo.particle_pusher = vay
diff --git a/Examples/Physics_applications/plasma_acceleration/inputs.2d b/Examples/Physics_applications/plasma_acceleration/inputs.2d
index 5a212d9ab..58f517308 100644
--- a/Examples/Physics_applications/plasma_acceleration/inputs.2d
+++ b/Examples/Physics_applications/plasma_acceleration/inputs.2d
@@ -82,8 +82,6 @@ beam.y_m = 0.
 beam.z_m = -105.e-6
 beam.npart = 1000
 beam.q_tot = -1.e-12
-beam.profile = "constant"
-beam.density = 8.e23                   # number of particles per m^3
 beam.momentum_distribution_type = "gaussian"
 beam.ux_m = 0.0
 beam.uy_m = 0.0
diff --git a/Examples/Tests/Langmuir/inputs.multi.rt b/Examples/Tests/Langmuir/inputs.multi.rt
index 46ddf7754..0dd0da3b8 100644
--- a/Examples/Tests/Langmuir/inputs.multi.rt
+++ b/Examples/Tests/Langmuir/inputs.multi.rt
@@ -58,7 +58,7 @@ electrons.ymin = -20.e-6
 electrons.ymax = 20.e-6
 electrons.zmin = -20.e-6
 electrons.zmax = 20.e-6
-electrons.plot_vars = w ux Bz Ey
+electrons.plot_vars = w ux Ey
 
 electrons.profile = constant
 electrons.density = 2.e24   # number of electrons per m^3
@@ -77,7 +77,7 @@ positrons.ymin = -20.e-6
 positrons.ymax = 20.e-6
 positrons.zmin = -20.e-6
 positrons.zmax = 20.e-6
-positrons.plot_vars = w ux Bz Ey
+positrons.plot_vars = w ux Ey
 
 positrons.profile = constant
 positrons.density = 2.e24   # number of positrons per m^3
diff --git a/Examples/Tests/Langmuir/inputs.multi.rz.rt b/Examples/Tests/Langmuir/inputs.multi.rz.rt
index a9096ae2f..873efb9fc 100644
--- a/Examples/Tests/Langmuir/inputs.multi.rz.rt
+++ b/Examples/Tests/Langmuir/inputs.multi.rz.rt
@@ -27,6 +27,8 @@ warpx.verbose = 1
 
 # Algorithms
 algo.field_gathering = standard
+algo.current_deposition = esirkepov
+algo.use_picsar_deposition = 0
 
 # Interpolation
 interpolation.nox = 1
diff --git a/Examples/Tests/Langmuir/langmuir2d_analysis.py b/Examples/Tests/Langmuir/langmuir2d_analysis.py
index 6c24e532a..ce271d224 100755
--- a/Examples/Tests/Langmuir/langmuir2d_analysis.py
+++ b/Examples/Tests/Langmuir/langmuir2d_analysis.py
@@ -23,7 +23,7 @@ t = ds.current_time.to_ndarray().mean() # in order to extract a single scalar
 data = ds.covering_grid( 0, ds.domain_left_edge, ds.domain_dimensions )
 
 # Check the J fields
-assert np.allclose( data['jz'].to_ndarray(), 0, atol=2.e-2 )
+assert np.allclose( data['jz'].to_ndarray(), 0, atol=0.1 )
 assert np.all( data['jy'].to_ndarray() == 0. )
 # Check the Jx field, which oscillates at wp
 j_predicted = -n0*e*c*ux*np.cos( wp*t*39.5/40 ) # 40 timesteps / j at half-timestep
@@ -32,17 +32,17 @@ assert np.allclose( jx[:32,:,0], j_predicted, rtol=0.1 )
 assert np.allclose( jx[32:,:,0], 0, atol=1.e-2 )
 
 # Check the E fields
-assert np.allclose( data['Ez'].to_ndarray(), 0, atol=5.e-5 )
+assert np.allclose( data['Ez'].to_ndarray(), 0, atol=1.e-4 )
 assert np.all( data['Ey'].to_ndarray() == 0. )
 # Check the Ex field, which oscillates at wp
 E_predicted = m_e * wp * ux * c / e * np.sin(wp*t)
 Ex = data['Ex'].to_ndarray()
 assert np.allclose( Ex[:32,:,0], E_predicted, rtol=0.1 )
-assert np.allclose( Ex[32:,:,0], 0, atol=1.e-5 )
+assert np.allclose( Ex[32:,:,0], 0, atol=1.e-4 )
 
 # Check the B fields
 assert np.all( data['Bx'].to_ndarray() == 0. )
-assert np.allclose( data['By'].to_ndarray(), 0, atol=1.e-12 )
+assert np.allclose( data['By'].to_ndarray(), 0, rtol=2. )
 assert np.all( data['Bz'].to_ndarray() == 0. )
 
 # Save an image to be displayed on the website
diff --git a/Examples/Tests/Langmuir/langmuir_PICMI_rt.py b/Examples/Tests/Langmuir/langmuir_PICMI_rt.py
index 648777824..4b4d93007 100644
--- a/Examples/Tests/Langmuir/langmuir_PICMI_rt.py
+++ b/Examples/Tests/Langmuir/langmuir_PICMI_rt.py
@@ -16,7 +16,7 @@ zmax = +20.e-6
 
 uniform_plasma = picmi.UniformDistribution(density = 1.e25,
                                            upper_bound = [0., None, None],
-                                           directed_velocity = [0.1*picmi.c, 0., 0.])
+                                           directed_velocity = [0.1*picmi.constants.c, 0., 0.])
 
 electrons = picmi.Species(particle_type='electron', name='electrons', initial_distribution=uniform_plasma)
 
diff --git a/Examples/Tests/Langmuir/langmuir_multi_analysis.py b/Examples/Tests/Langmuir/langmuir_multi_analysis.py
index fab613e19..890320be8 100755
--- a/Examples/Tests/Langmuir/langmuir_multi_analysis.py
+++ b/Examples/Tests/Langmuir/langmuir_multi_analysis.py
@@ -66,13 +66,13 @@ ds = yt.load(fn)
 for species in ['electrons', 'positrons']:
     for field in ['particle_weight',
                   'particle_momentum_x',
-                  'particle_Bz',
                   'particle_Ey']:
         assert (species, field) in ds.field_list
     for field in ['particle_momentum_y',
                   'particle_momentum_z',
                   'particle_Bx',
                   'particle_By',
+                  'particle_Bz',
                   'particle_Ex',
                   'particle_Ez']:
         assert (species, field) not in ds.field_list
diff --git a/Examples/Tests/PML/analysis_pml.py b/Examples/Tests/PML/analysis_pml_psatd.py
index 6234cd5d2..ff3bf8413 100755
--- a/Examples/Tests/PML/analysis_pml.py
+++ b/Examples/Tests/PML/analysis_pml_psatd.py
@@ -28,7 +28,7 @@ energyB = np.sum(1./scc.mu_0/2*(Bx**2+By**2+Bz**2))
 energy_end = energyE + energyB
 
 Reflectivity = energy_end/energy_start
-Reflectivity_theory = 5.683000058954201e-07
+Reflectivity_theory = 1.3806831258153887e-06
 
 assert( abs(Reflectivity-Reflectivity_theory) < 5./100 * Reflectivity_theory )
     
diff --git a/Examples/Tests/PML/inputs2d b/Examples/Tests/PML/inputs2d
index 5b936a333..c6abe2b88 100644
--- a/Examples/Tests/PML/inputs2d
+++ b/Examples/Tests/PML/inputs2d
@@ -32,8 +32,6 @@ warpx.do_moving_window = 0
 # warpx.moving_window_dir = z
 # warpx.moving_window_v = 1.0 # in units of the speed of light
 
-warpx.maxwell_fdtd_solver = yee
-
 # Laser
 lasers.nlasers      = 1
 lasers.names        = laser1
diff --git a/GNUmakefile b/GNUmakefile
index 1acd53be7..cd5776cf5 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -2,11 +2,11 @@ AMREX_HOME  ?= ../amrex
 PICSAR_HOME ?= ../picsar
 OPENBC_HOME ?= ../openbc_poisson
 
-DEBUG	= FALSE
+DEBUG = FALSE
 #DEBUG	= TRUE
 
 #DIM     = 2
-DIM	= 3
+DIM = 3
 
 COMP = gcc
 #COMP = intel
diff --git a/LICENSE.txt b/LICENSE.txt
index aeeded3a7..346d92dc3 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-WarpX v19.05 Copyright (c) 2018, The Regents of the University of California, through Lawrence Berkeley National Laboratory, and Lawrence Livermore National Security, LLC, for the operation of Lawrence Livermore National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
+WarpX v19.08 Copyright (c) 2018, The Regents of the University of California, through Lawrence Berkeley National Laboratory, and Lawrence Livermore National Security, LLC, for the operation of Lawrence Livermore National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
 
 
 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
diff --git a/Python/pywarpx/picmi.py b/Python/pywarpx/picmi.py
index ac272089c..47f8c562f 100644
--- a/Python/pywarpx/picmi.py
+++ b/Python/pywarpx/picmi.py
@@ -1,19 +1,23 @@
 """Classes following the PICMI standard
 """
+import re
 import picmistandard
 import numpy as np
 import pywarpx
+import periodictable
 
 codename = 'warpx'
 picmistandard.register_codename(codename)
 
-# --- Values from WarpXConst.H
-c = 299792458.
-ep0 = 8.854187817e-12
-mu0 = 1.2566370614359173e-06
-q_e = 1.602176462e-19
-m_e = 9.10938291e-31
-m_p = 1.6726231e-27
+class constants:
+    # --- Put the constants in their own namespace
+    # --- Values from WarpXConst.H
+    c = 299792458.
+    ep0 = 8.854187817e-12
+    mu0 = 1.2566370614359173e-06
+    q_e = 1.602176462e-19
+    m_e = 9.10938291e-31
+    m_p = 1.6726231e-27
 
 
 class Species(picmistandard.PICMI_Species):
@@ -31,9 +35,26 @@ class Species(picmistandard.PICMI_Species):
         elif self.particle_type == 'anti-proton':
             if self.charge is None: self.charge = '-q_e'
             if self.mass is None: self.mass = 'm_p'
-        elif self.particle_type == 'H' and self.charge_state == 1:
-            if self.charge is None: self.charge = 'q_e'
-            if self.mass is None: self.mass = 'm_p'
+        else:
+            if self.charge is None and self.charge_state is not None:
+                self.charge = self.charge_state*constants.q_e
+            # Match a string of the format '#nXx', with the '#n' optional isotope number.
+            m = re.match('(?P<iso>#[\d+])*(?P<sym>[A-Za-z]+)', self.particle_type)
+            if m is not None:
+                element = periodictable.elements.symbol(m['sym'])
+                if m['iso'] is not None:
+                    element = element[m['iso'][1:]]
+                if self.charge_state is not None:
+                    assert self.charge_state <= element.number, Exception('%s charge state not valid'%self.particle_type)
+                    try:
+                        element = element.ion[self.charge_state]
+                    except ValueError:
+                        # Note that not all valid charge states are defined in elements,
+                        # so this value error can be ignored.
+                        pass
+                self.element = element
+                if self.mass is None:
+                    self.mass = element.mass*periodictable.constants.atomic_mass_constant
 
     def initialize_inputs(self, layout):
         self.species_number = pywarpx.particles.nspecies
@@ -51,16 +72,18 @@ class Species(picmistandard.PICMI_Species):
         pywarpx.Particles.particles_list.append(self.species)
 
         if self.initial_distribution is not None:
-            self.initial_distribution.initialize_inputs(self.species_number, layout, self.species)
+            self.initial_distribution.initialize_inputs(self.species_number, layout, self.species, self.density_scale)
 
 
 picmistandard.PICMI_MultiSpecies.Species_class = Species
 class MultiSpecies(picmistandard.PICMI_MultiSpecies):
-    pass
+    def initialize_inputs(self, layout):
+        for species in self.species_instances_list:
+            species.initialize_inputs(layout)
 
 
 class GaussianBunchDistribution(picmistandard.PICMI_GaussianBunchDistribution):
-    def initialize_inputs(self, species_number, layout, species):
+    def initialize_inputs(self, species_number, layout, species, density_scale):
         species.injection_style = "gaussian_beam"
         species.x_m = self.centroid_position[0]
         species.y_m = self.centroid_position[1]
@@ -75,10 +98,12 @@ class GaussianBunchDistribution(picmistandard.PICMI_GaussianBunchDistribution):
         # --- Calculate the total charge. Note that charge might be a string instead of a number.
         charge = species.charge
         if charge == 'q_e' or charge == '+q_e':
-            charge = q_e
+            charge = constants.q_e
         elif charge == '-q_e':
-            charge = -q_e
+            charge = -constants.q_e
         species.q_tot = self.n_physical_particles*charge
+        if density_scale is not None:
+            species.q_tot *= density_scale
 
         # --- These need to be defined even though they are not used
         species.profile = "constant"
@@ -97,26 +122,26 @@ class GaussianBunchDistribution(picmistandard.PICMI_GaussianBunchDistribution):
         # --- Note that WarpX takes gamma*beta as input
         if np.any(np.not_equal(self.velocity_divergence, 0.)):
             species.momentum_distribution_type = "radial_expansion"
-            species.u_over_r = self.velocity_divergence[0]/c
-            #species.u_over_y = self.velocity_divergence[1]/c
-            #species.u_over_z = self.velocity_divergence[2]/c
+            species.u_over_r = self.velocity_divergence[0]/constants.c
+            #species.u_over_y = self.velocity_divergence[1]/constants.c
+            #species.u_over_z = self.velocity_divergence[2]/constants.c
         elif np.any(np.not_equal(self.rms_velocity, 0.)):
             species.momentum_distribution_type = "gaussian"
-            species.ux_m = self.centroid_velocity[0]/c
-            species.uy_m = self.centroid_velocity[1]/c
-            species.uz_m = self.centroid_velocity[2]/c
-            species.ux_th = self.rms_velocity[0]/c
-            species.uy_th = self.rms_velocity[1]/c
-            species.uz_th = self.rms_velocity[2]/c
+            species.ux_m = self.centroid_velocity[0]/constants.c
+            species.uy_m = self.centroid_velocity[1]/constants.c
+            species.uz_m = self.centroid_velocity[2]/constants.c
+            species.ux_th = self.rms_velocity[0]/constants.c
+            species.uy_th = self.rms_velocity[1]/constants.c
+            species.uz_th = self.rms_velocity[2]/constants.c
         else:
             species.momentum_distribution_type = "constant"
-            species.ux = self.centroid_velocity[0]/c
-            species.uy = self.centroid_velocity[1]/c
-            species.uz = self.centroid_velocity[2]/c
+            species.ux = self.centroid_velocity[0]/constants.c
+            species.uy = self.centroid_velocity[1]/constants.c
+            species.uz = self.centroid_velocity[2]/constants.c
 
 
 class UniformDistribution(picmistandard.PICMI_UniformDistribution):
-    def initialize_inputs(self, species_number, layout, species):
+    def initialize_inputs(self, species_number, layout, species, density_scale):
 
         if isinstance(layout, GriddedLayout):
             # --- Note that the grid attribute of GriddedLayout is ignored
@@ -139,28 +164,30 @@ class UniformDistribution(picmistandard.PICMI_UniformDistribution):
         # --- Only constant density is supported at this time
         species.profile = "constant"
         species.density = self.density
+        if density_scale is not None:
+            species.density *= density_scale
 
         # --- Note that WarpX takes gamma*beta as input
         if np.any(np.not_equal(self.rms_velocity, 0.)):
             species.momentum_distribution_type = "gaussian"
-            species.ux_m = self.directed_velocity[0]/c
-            species.uy_m = self.directed_velocity[1]/c
-            species.uz_m = self.directed_velocity[2]/c
-            species.ux_th = self.rms_velocity[0]/c
-            species.uy_th = self.rms_velocity[1]/c
-            species.uz_th = self.rms_velocity[2]/c
+            species.ux_m = self.directed_velocity[0]/constants.c
+            species.uy_m = self.directed_velocity[1]/constants.c
+            species.uz_m = self.directed_velocity[2]/constants.c
+            species.ux_th = self.rms_velocity[0]/constants.c
+            species.uy_th = self.rms_velocity[1]/constants.c
+            species.uz_th = self.rms_velocity[2]/constants.c
         else:
             species.momentum_distribution_type = "constant"
-            species.ux = self.directed_velocity[0]/c
-            species.uy = self.directed_velocity[1]/c
-            species.uz = self.directed_velocity[2]/c
+            species.ux = self.directed_velocity[0]/constants.c
+            species.uy = self.directed_velocity[1]/constants.c
+            species.uz = self.directed_velocity[2]/constants.c
 
         if self.fill_in:
             species.do_continuous_injection = 1
 
 
 class AnalyticDistribution(picmistandard.PICMI_AnalyticDistribution):
-    def initialize_inputs(self, species_number, layout, species):
+    def initialize_inputs(self, species_number, layout, species, density_scale):
 
         if isinstance(layout, GriddedLayout):
             # --- Note that the grid attribute of GriddedLayout is ignored
@@ -182,7 +209,10 @@ class AnalyticDistribution(picmistandard.PICMI_AnalyticDistribution):
 
         # --- Only constant density is supported at this time
         species.profile = "parse_density_function"
-        species.__setattr__('density_function(x,y,z)', self.density_expression)
+        if density_scale is None:
+            species.__setattr__('density_function(x,y,z)', self.density_expression)
+        else:
+            species.__setattr__('density_function(x,y,z)', "{}*({})".format(density_scale, self.density_expression))
 
         for k,v in self.user_defined_kw.items():
             setattr(pywarpx.my_constants, k, v)
@@ -190,17 +220,17 @@ class AnalyticDistribution(picmistandard.PICMI_AnalyticDistribution):
         # --- Note that WarpX takes gamma*beta as input
         if np.any(np.not_equal(self.rms_velocity, 0.)):
             species.momentum_distribution_type = "gaussian"
-            species.ux_m = self.directed_velocity[0]/c
-            species.uy_m = self.directed_velocity[1]/c
-            species.uz_m = self.directed_velocity[2]/c
-            species.ux_th = self.rms_velocity[0]/c
-            species.uy_th = self.rms_velocity[1]/c
-            species.uz_th = self.rms_velocity[2]/c
+            species.ux_m = self.directed_velocity[0]/constants.c
+            species.uy_m = self.directed_velocity[1]/constants.c
+            species.uz_m = self.directed_velocity[2]/constants.c
+            species.ux_th = self.rms_velocity[0]/constants.c
+            species.uy_th = self.rms_velocity[1]/constants.c
+            species.uz_th = self.rms_velocity[2]/constants.c
         else:
             species.momentum_distribution_type = "constant"
-            species.ux = self.directed_velocity[0]/c
-            species.uy = self.directed_velocity[1]/c
-            species.uz = self.directed_velocity[2]/c
+            species.ux = self.directed_velocity[0]/constants.c
+            species.uy = self.directed_velocity[1]/constants.c
+            species.uz = self.directed_velocity[2]/constants.c
 
         if self.fill_in:
             species.do_continuous_injection = 1
@@ -212,12 +242,14 @@ class ParticleListDistribution(picmistandard.PICMI_ParticleListDistribution):
         if len(x) > 1:
             raise Exception('Only a single particle can be loaded')
 
-    def initialize_inputs(self, species_number, layout, species):
+    def initialize_inputs(self, species_number, layout, species, density_scale):
 
         species.injection_style = "singleparticle"
         species.single_particle_pos = [self.x[0], self.y[0], self.z[0]]
-        species.single_particle_vel = [self.ux[0]/c, self.uy[0]/c, self.uz[0]/c]
+        species.single_particle_vel = [self.ux[0]/constants.c, self.uy[0]/constants.c, self.uz[0]/constants.c]
         species.single_particle_weight = self.weight
+        if density_scale is not None:
+            species.single_particle_weight *= density_scale
 
         # --- These need to be defined even though they are not used
         species.profile = "constant"
@@ -236,11 +268,22 @@ class GriddedLayout(picmistandard.PICMI_GriddedLayout):
 class PseudoRandomLayout(picmistandard.PICMI_PseudoRandomLayout):
     def init(self, kw):
         if self.seed is not None:
-            print('Warning: WarpX does not support specifying the random number seed')
+            print('Warning: WarpX does not support specifying the random number seed in PseudoRandomLayout')
 
 
 class BinomialSmoother(picmistandard.PICMI_BinomialSmoother):
-    pass
+    def initialize_inputs(self, solver):
+        pywarpx.warpx.use_filter = 1
+        if self.n_pass is None:
+            # If not specified, do at least one pass in each direction.
+            self.n_pass = 1
+        try:
+            # Check if n_pass is a vector
+            len(self.n_pass)
+        except TypeError:
+            # If not, make it a vector
+            self.n_pass = solver.grid.number_of_dimensions*[self.n_pass]
+        pywarpx.warpx.filter_npass_each_dir = self.n_pass
 
 
 class CylindricalGrid(picmistandard.PICMI_CylindricalGrid):
@@ -273,7 +316,7 @@ class CylindricalGrid(picmistandard.PICMI_CylindricalGrid):
                 raise Exception('In cylindrical coordinates, a moving window in r can not be done')
             if self.moving_window_velocity[1] != 0.:
                 pywarpx.warpx.moving_window_dir = 'z'
-                pywarpx.warpx.moving_window_v = self.moving_window_velocity[1]/c  # in units of the speed of light
+                pywarpx.warpx.moving_window_v = self.moving_window_velocity[1]/constants.c  # in units of the speed of light
 
         if self.refined_regions:
             assert len(self.refined_regions) == 1, Exception('WarpX only supports one refined region.')
@@ -308,10 +351,10 @@ class Cartesian2DGrid(picmistandard.PICMI_Cartesian2DGrid):
             pywarpx.warpx.do_moving_window = 1
             if self.moving_window_velocity[0] != 0.:
                 pywarpx.warpx.moving_window_dir = 'x'
-                pywarpx.warpx.moving_window_v = self.moving_window_velocity[0]/c  # in units of the speed of light
+                pywarpx.warpx.moving_window_v = self.moving_window_velocity[0]/constants.c  # in units of the speed of light
             if self.moving_window_velocity[1] != 0.:
                 pywarpx.warpx.moving_window_dir = 'y'
-                pywarpx.warpx.moving_window_v = self.moving_window_velocity[1]/c  # in units of the speed of light
+                pywarpx.warpx.moving_window_v = self.moving_window_velocity[1]/constants.c  # in units of the speed of light
 
         if self.refined_regions:
             assert len(self.refined_regions) == 1, Exception('WarpX only supports one refined region.')
@@ -348,13 +391,13 @@ class Cartesian3DGrid(picmistandard.PICMI_Cartesian3DGrid):
             pywarpx.warpx.do_moving_window = 1
             if self.moving_window_velocity[0] != 0.:
                 pywarpx.warpx.moving_window_dir = 'x'
-                pywarpx.warpx.moving_window_v = self.moving_window_velocity[0]/c  # in units of the speed of light
+                pywarpx.warpx.moving_window_v = self.moving_window_velocity[0]/constants.c  # in units of the speed of light
             if self.moving_window_velocity[1] != 0.:
                 pywarpx.warpx.moving_window_dir = 'y'
-                pywarpx.warpx.moving_window_v = self.moving_window_velocity[1]/c  # in units of the speed of light
+                pywarpx.warpx.moving_window_v = self.moving_window_velocity[1]/constants.c  # in units of the speed of light
             if self.moving_window_velocity[2] != 0.:
                 pywarpx.warpx.moving_window_dir = 'z'
-                pywarpx.warpx.moving_window_v = self.moving_window_velocity[2]/c  # in units of the speed of light
+                pywarpx.warpx.moving_window_v = self.moving_window_velocity[2]/constants.c  # in units of the speed of light
 
         if self.refined_regions:
             assert len(self.refined_regions) == 1, Exception('WarpX only supports one refined region.')
@@ -386,6 +429,9 @@ class ElectromagneticSolver(picmistandard.PICMI_ElectromagneticSolver):
         if self.cfl is not None:
             pywarpx.warpx.cfl = self.cfl
 
+        if self.source_smoother is not None:
+            self.source_smoother.initialize_inputs(self)
+
 
 class ElectrostaticSolver(picmistandard.PICMI_ElectrostaticSolver):
     def initialize_inputs(self):
@@ -415,7 +461,7 @@ class LaserAntenna(picmistandard.PICMI_LaserAntenna):
         laser.laser.position = self.position  # This point is on the laser plane
         laser.laser.direction = self.normal_vector  # The plane normal direction
         laser.laser.profile_focal_distance = laser.focal_position[2] - self.position[2]  # Focal distance from the antenna (in meters)
-        laser.laser.profile_t_peak = (self.position[2] - laser.centroid_position[2])/c  # The time at which the laser reaches its peak (in seconds)
+        laser.laser.profile_t_peak = (self.position[2] - laser.centroid_position[2])/constants.c  # The time at which the laser reaches its peak (in seconds)
 
 
 class Simulation(picmistandard.PICMI_Simulation):
diff --git a/Python/setup.py b/Python/setup.py
index ecb87190c..76d99fbaa 100644
--- a/Python/setup.py
+++ b/Python/setup.py
@@ -20,10 +20,10 @@ else:
     package_data = {}
 
 setup (name = 'pywarpx',
-       version = '19.05',
+       version = '19.08',
        packages = ['pywarpx'],
        package_dir = {'pywarpx':'pywarpx'},
        description = """Wrapper of WarpX""",
        package_data = package_data,
-       install_requires=['picmistandard']
+       install_requires=['picmistandard', 'periodictable']
        )
diff --git a/Regression/WarpX-tests.ini b/Regression/WarpX-tests.ini
index 1c04081a3..0fcb88e05 100644
--- a/Regression/WarpX-tests.ini
+++ b/Regression/WarpX-tests.ini
@@ -40,7 +40,7 @@ goUpLink = 1
 
 # email
 sendEmailWhenFail = 1
-emailTo = weiqunzhang@lbl.gov, ASAlmgren@lbl.gov, jlvay@lbl.gov, rlehe@lbl.gov, atmyers@lbl.gov, mthevenet@lbl.gov, jaehongpark@lbl.gov, oshapoval@lbl.gov, henri.vincenti@cea.fr, ldianaamorim@lbl.gov, rjambunathan@lbl.gov
+emailTo = weiqunzhang@lbl.gov, jlvay@lbl.gov, rlehe@lbl.gov, atmyers@lbl.gov, mthevenet@lbl.gov, jaehongpark@lbl.gov, oshapoval@lbl.gov, henri.vincenti@cea.fr, ldianaamorim@lbl.gov, rjambunathan@lbl.gov, axelhuebl@lbl.gov
 emailBody = Check https://ccse.lbl.gov/pub/RegressionTesting/WarpX/ for more details.
 
 [AMReX]
@@ -87,6 +87,21 @@ compileTest = 0
 doVis = 0
 analysisRoutine = Examples/Tests/PML/analysis_pml_ckc.py
 
+#[pml_x_psatd]
+#buildDir = .
+#inputFile = Examples/Tests/PML/inputs2d
+#runtime_params = warpx.do_dynamic_scheduling=0
+#dim = 2
+#addToCompileString = USE_PSATD=TRUE
+#restartTest = 0
+#useMPI = 1
+#numprocs = 2
+#useOMP = 1
+#numthreads = 2
+#compileTest = 0
+#doVis = 0
+#analysisRoutine = Examples/Tests/PML/analysis_pml_psatd.py
+
 [nci_corrector]
 buildDir = .
 inputFile = Examples/Modules/nci_corrector/inputs2d
@@ -397,7 +412,7 @@ compareParticles = 0
 [LaserAccelerationMR]
 buildDir = .
 inputFile = Examples/Physics_applications/laser_acceleration/inputs.2d
-runtime_params = amr.max_level=1 max_step=100 warpx.serialize_ics=1
+runtime_params = amr.max_level=1 max_step=200 warpx.serialize_ics=1
 dim = 2
 addToCompileString =
 restartTest = 0
@@ -413,7 +428,7 @@ particleTypes = electrons beam
 [PlasmaAccelerationMR]
 buildDir = .
 inputFile = Examples/Physics_applications/plasma_acceleration/inputs.2d
-runtime_params = amr.max_level=1 amr.n_cell=32 512 max_step=100 plasma_e.zmin=-200.e-6 warpx.serialize_ics=1 warpx.do_dynamic_scheduling=0
+runtime_params = amr.max_level=1 amr.n_cell=32 512 max_step=400 warpx.serialize_ics=1 warpx.do_dynamic_scheduling=0
 dim = 2
 addToCompileString =
 restartTest = 0
diff --git a/Source/.DS_Store b/Source/.DS_Store
deleted file mode 100644
index 01640e062..000000000
--- a/Source/.DS_Store
+++ /dev/null
diff --git a/Source/BoundaryConditions/PML.H b/Source/BoundaryConditions/PML.H
index 0cf367284..b34cbe88b 100644
--- a/Source/BoundaryConditions/PML.H
+++ b/Source/BoundaryConditions/PML.H
@@ -6,6 +6,10 @@
 #include <AMReX_MultiFab.H>
 #include <AMReX_Geometry.H>
 
+#ifdef WARPX_USE_PSATD
+#include <SpectralSolver.H>
+#endif
+
 #if (AMREX_SPACEDIM == 3)
 
 #define WRPX_PML_TO_FORTRAN(x)                              \
@@ -93,7 +97,12 @@ class PML
 public:
     PML (const amrex::BoxArray& ba, const amrex::DistributionMapping& dm,
          const amrex::Geometry* geom, const amrex::Geometry* cgeom,
-         int ncell, int delta, int ref_ratio, int do_dive_cleaning, int do_moving_window);
+         int ncell, int delta, int ref_ratio,
+#ifdef WARPX_USE_PSATD
+         amrex::Real dt, int nox_fft, int noy_fft, int noz_fft, bool do_nodal,
+#endif
+         int do_dive_cleaning, int do_moving_window,
+         const amrex::IntVect do_pml_Lo, const amrex::IntVect do_pml_Hi);
 
     void ComputePMLFactors (amrex::Real dt);
 
@@ -111,6 +120,10 @@ public:
     const MultiSigmaBox& GetMultiSigmaBox_cp () const
         { return *sigba_cp; }
 
+#ifdef WARPX_USE_PSATD
+    void PushPSATD ();
+#endif
+
     void ExchangeB (const std::array<amrex::MultiFab*,3>& B_fp,
                     const std::array<amrex::MultiFab*,3>& B_cp);
     void ExchangeE (const std::array<amrex::MultiFab*,3>& E_fp,
@@ -154,10 +167,23 @@ private:
     std::unique_ptr<MultiSigmaBox> sigba_fp;
     std::unique_ptr<MultiSigmaBox> sigba_cp;
 
+#ifdef WARPX_USE_PSATD
+    std::unique_ptr<SpectralSolver> spectral_solver_fp;
+    std::unique_ptr<SpectralSolver> spectral_solver_cp;
+#endif
+
     static amrex::BoxArray MakeBoxArray (const amrex::Geometry& geom,
-                                         const amrex::BoxArray& grid_ba, int ncell);
+                                         const amrex::BoxArray& grid_ba, int ncell,
+                                         const amrex::IntVect do_pml_Lo,
+                                         const amrex::IntVect do_pml_Hi);
 
     static void Exchange (amrex::MultiFab& pml, amrex::MultiFab& reg, const amrex::Geometry& geom);
 };
 
+#ifdef WARPX_USE_PSATD
+void PushPMLPSATDSinglePatch( SpectralSolver& solver,
+    std::array<std::unique_ptr<amrex::MultiFab>,3>& pml_E,
+    std::array<std::unique_ptr<amrex::MultiFab>,3>& pml_B );
+#endif
+
 #endif
diff --git a/Source/BoundaryConditions/PML.cpp b/Source/BoundaryConditions/PML.cpp
index f780f335c..21d348482 100644
--- a/Source/BoundaryConditions/PML.cpp
+++ b/Source/BoundaryConditions/PML.cpp
@@ -258,14 +258,7 @@ SigmaBox::ComputePMLFactorsB (const Real* dx, Real dt)
     {
         for (int i = 0, N = sigma_star[idim].size(); i < N; ++i)
         {
-            if (sigma_star[idim][i] == 0.0)
-            {
-                sigma_star_fac[idim][i] = 1.0;
-            }
-            else
-            {
-                sigma_star_fac[idim][i] = std::exp(-sigma_star[idim][i]*dt);
-            }
+            sigma_star_fac[idim][i] = std::exp(-sigma_star[idim][i]*dt);
         }
     }
 }
@@ -277,14 +270,7 @@ SigmaBox::ComputePMLFactorsE (const Real* dx, Real dt)
     {
         for (int i = 0, N = sigma[idim].size(); i < N; ++i)
         {
-            if (sigma[idim][i] == 0.0)
-            {
-                sigma_fac[idim][i] = 1.0;
-            }
-            else
-            {
-                sigma_fac[idim][i] = std::exp(-sigma[idim][i]*dt);
-            }
+            sigma_fac[idim][i] = std::exp(-sigma[idim][i]*dt);
         }
     }
 }
@@ -329,11 +315,16 @@ MultiSigmaBox::ComputePMLFactorsE (const Real* dx, Real dt)
 
 PML::PML (const BoxArray& grid_ba, const DistributionMapping& grid_dm,
           const Geometry* geom, const Geometry* cgeom,
-          int ncell, int delta, int ref_ratio, int do_dive_cleaning, int do_moving_window)
+          int ncell, int delta, int ref_ratio,
+#ifdef WARPX_USE_PSATD
+          Real dt, int nox_fft, int noy_fft, int noz_fft, bool do_nodal,
+#endif
+          int do_dive_cleaning, int do_moving_window,
+          const amrex::IntVect do_pml_Lo, const amrex::IntVect do_pml_Hi)
     : m_geom(geom),
       m_cgeom(cgeom)
 {
-    const BoxArray& ba = MakeBoxArray(*geom, grid_ba, ncell);
+    const BoxArray& ba = MakeBoxArray(*geom, grid_ba, ncell, do_pml_Lo, do_pml_Hi);
     if (ba.size() == 0) {
         m_ok = false;
         return;
@@ -343,10 +334,30 @@ PML::PML (const BoxArray& grid_ba, const DistributionMapping& grid_dm,
 
     DistributionMapping dm{ba};
 
-    int nge = 2;
-    int ngb = 2;
-    int ngf = (do_moving_window) ? 2 : 0;
-    if (WarpX::maxwell_fdtd_solver_id == 1) ngf = std::max( ngf, 1 );
+    // Define the number of guard cells in each direction, for E, B, and F
+    IntVect nge = IntVect(AMREX_D_DECL(2, 2, 2));
+    IntVect ngb = IntVect(AMREX_D_DECL(2, 2, 2));
+    int ngf_int = (do_moving_window) ? 2 : 0;
+    if (WarpX::maxwell_fdtd_solver_id == 1) ngf_int = std::max( ngf_int, 1 );
+    IntVect ngf = IntVect(AMREX_D_DECL(ngf_int, ngf_int, ngf_int));
+#ifdef WARPX_USE_PSATD
+    // Increase the number of guard cells, in order to fit the extent
+    // of the stencil for the spectral solver
+    IntVect ngFFT;
+    if (do_nodal) {
+        ngFFT = IntVect(AMREX_D_DECL(nox_fft, noy_fft, noz_fft));
+    } else {
+        ngFFT = IntVect(AMREX_D_DECL(nox_fft/2, noy_fft/2, noz_fft/2));
+    }
+    // Set the number of guard cells to the maximum of each field
+    // (all fields should have the same number of guard cells)
+    ngFFT = ngFFT.max(nge);
+    ngFFT = ngFFT.max(ngb);
+    ngFFT = ngFFT.max(ngf);
+    nge = ngFFT;
+    ngb = ngFFT;
+    ngf = ngFFT;
+ #endif
 
     pml_E_fp[0].reset(new MultiFab(amrex::convert(ba,WarpX::Ex_nodal_flag), dm, 3, nge));
     pml_E_fp[1].reset(new MultiFab(amrex::convert(ba,WarpX::Ey_nodal_flag), dm, 3, nge));
@@ -370,15 +381,26 @@ PML::PML (const BoxArray& grid_ba, const DistributionMapping& grid_dm,
 
     sigba_fp.reset(new MultiSigmaBox(ba, dm, grid_ba, geom->CellSize(), ncell, delta));
 
+#ifdef WARPX_USE_PSATD
+    const bool in_pml = true; // Tells spectral solver to use split-PML equations
+    const RealVect dx{AMREX_D_DECL(geom->CellSize(0), geom->CellSize(1), geom->CellSize(2))};
+    // Get the cell-centered box, with guard cells
+    BoxArray realspace_ba = ba;  // Copy box
+    realspace_ba.enclosedCells().grow(nge); // cell-centered + guard cells
+    spectral_solver_fp.reset( new SpectralSolver( realspace_ba, dm,
+        nox_fft, noy_fft, noz_fft, do_nodal, dx, dt, in_pml ) );
+#endif
+
     if (cgeom)
     {
-
-        nge = 1;
-        ngb = 1;
+#ifndef WARPX_USE_PSATD
+        nge = IntVect(AMREX_D_DECL(1, 1, 1));
+        ngb = IntVect(AMREX_D_DECL(1, 1, 1));
+#endif
 
         BoxArray grid_cba = grid_ba;
         grid_cba.coarsen(ref_ratio);
-        const BoxArray& cba = MakeBoxArray(*cgeom, grid_cba, ncell);
+        const BoxArray& cba = MakeBoxArray(*cgeom, grid_cba, ncell, do_pml_Lo, do_pml_Hi);
 
         DistributionMapping cdm{cba};
 
@@ -403,17 +425,32 @@ PML::PML (const BoxArray& grid_ba, const DistributionMapping& grid_dm,
         }
 
         sigba_cp.reset(new MultiSigmaBox(cba, cdm, grid_cba, cgeom->CellSize(), ncell, delta));
-    }
 
+#ifdef WARPX_USE_PSATD
+        const bool in_pml = true; // Tells spectral solver to use split-PML equations
+        const RealVect cdx{AMREX_D_DECL(cgeom->CellSize(0), cgeom->CellSize(1), cgeom->CellSize(2))};
+        // Get the cell-centered box, with guard cells
+        BoxArray realspace_cba = cba;  // Copy box
+        realspace_cba.enclosedCells().grow(nge); // cell-centered + guard cells
+        spectral_solver_cp.reset( new SpectralSolver( realspace_cba, cdm,
+            nox_fft, noy_fft, noz_fft, do_nodal, cdx, dt, in_pml ) );
+#endif
+    }
 }
 
 BoxArray
-PML::MakeBoxArray (const amrex::Geometry& geom, const amrex::BoxArray& grid_ba, int ncell)
+PML::MakeBoxArray (const amrex::Geometry& geom, const amrex::BoxArray& grid_ba, int ncell,
+                   const amrex::IntVect do_pml_Lo, const amrex::IntVect do_pml_Hi)
 {
     Box domain = geom.Domain();
     for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
         if ( ! geom.isPeriodic(idim) ) {
-            domain.grow(idim, ncell);
+            if (do_pml_Lo[idim]){
+                domain.growLo(idim, ncell);
+            }
+            if (do_pml_Hi[idim]){
+                domain.growHi(idim, ncell);
+            }
         }
     }
 
@@ -753,3 +790,57 @@ PML::Restart (const std::string& dir)
         VisMF::Read(*pml_B_cp[2], dir+"_Bz_cp");
     }
 }
+
+#ifdef WARPX_USE_PSATD
+void
+PML::PushPSATD () {
+
+    // Update the fields on the fine and coarse patch
+    PushPMLPSATDSinglePatch( *spectral_solver_fp, pml_E_fp, pml_B_fp );
+    if (spectral_solver_cp) {
+        PushPMLPSATDSinglePatch( *spectral_solver_cp, pml_E_cp, pml_B_cp );
+    }
+}
+
+void
+PushPMLPSATDSinglePatch (
+    SpectralSolver& solver,
+    std::array<std::unique_ptr<amrex::MultiFab>,3>& pml_E,
+    std::array<std::unique_ptr<amrex::MultiFab>,3>& pml_B ) {
+
+    using Idx = SpectralPMLIndex;
+
+    // Perform forward Fourier transform
+    // Note: the correspondance between the spectral PML index
+    // (Exy, Ezx, etc.) and the component (0 or 1) of the
+    // MultiFabs (e.g. pml_E) is dictated by the
+    // function that damps the PML
+    solver.ForwardTransform(*pml_E[0], Idx::Exy, 0);
+    solver.ForwardTransform(*pml_E[0], Idx::Exz, 1);
+    solver.ForwardTransform(*pml_E[1], Idx::Eyz, 0);
+    solver.ForwardTransform(*pml_E[1], Idx::Eyx, 1);
+    solver.ForwardTransform(*pml_E[2], Idx::Ezx, 0);
+    solver.ForwardTransform(*pml_E[2], Idx::Ezy, 1);
+    solver.ForwardTransform(*pml_B[0], Idx::Bxy, 0);
+    solver.ForwardTransform(*pml_B[0], Idx::Bxz, 1);
+    solver.ForwardTransform(*pml_B[1], Idx::Byz, 0);
+    solver.ForwardTransform(*pml_B[1], Idx::Byx, 1);
+    solver.ForwardTransform(*pml_B[2], Idx::Bzx, 0);
+    solver.ForwardTransform(*pml_B[2], Idx::Bzy, 1);
+    // Advance fields in spectral space
+    solver.pushSpectralFields();
+    // Perform backward Fourier Transform
+    solver.BackwardTransform(*pml_E[0], Idx::Exy, 0);
+    solver.BackwardTransform(*pml_E[0], Idx::Exz, 1);
+    solver.BackwardTransform(*pml_E[1], Idx::Eyz, 0);
+    solver.BackwardTransform(*pml_E[1], Idx::Eyx, 1);
+    solver.BackwardTransform(*pml_E[2], Idx::Ezx, 0);
+    solver.BackwardTransform(*pml_E[2], Idx::Ezy, 1);
+    solver.BackwardTransform(*pml_B[0], Idx::Bxy, 0);
+    solver.BackwardTransform(*pml_B[0], Idx::Bxz, 1);
+    solver.BackwardTransform(*pml_B[1], Idx::Byz, 0);
+    solver.BackwardTransform(*pml_B[1], Idx::Byx, 1);
+    solver.BackwardTransform(*pml_B[2], Idx::Bzx, 0);
+    solver.BackwardTransform(*pml_B[2], Idx::Bzy, 1);
+}
+#endif
diff --git a/Source/Diagnostics/ParticleIO.cpp b/Source/Diagnostics/ParticleIO.cpp
index f2a543ed5..f159e5302 100644
--- a/Source/Diagnostics/ParticleIO.cpp
+++ b/Source/Diagnostics/ParticleIO.cpp
@@ -98,7 +98,7 @@ MultiParticleContainer::WritePlotFile (const std::string& dir) const
             real_names.push_back("By");
             real_names.push_back("Bz");
             
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
             real_names.push_back("theta");
 #endif
             
diff --git a/Source/Evolve/WarpXEvolveEM.cpp b/Source/Evolve/WarpXEvolveEM.cpp
index 32a4747db..16b5905d1 100644
--- a/Source/Evolve/WarpXEvolveEM.cpp
+++ b/Source/Evolve/WarpXEvolveEM.cpp
@@ -299,6 +299,7 @@ WarpX::OneStep_nosub (Real cur_time)
     // (And update guard cells immediately afterwards)
 #ifdef WARPX_USE_PSATD
     PushPSATD(dt[0]);
+    if (do_pml) DampPML();
     FillBoundaryE();
     FillBoundaryB();
 #else
@@ -481,6 +482,19 @@ WarpX::PushParticlesandDepose (int lev, Real cur_time)
                  Efield_cax[lev][0].get(), Efield_cax[lev][1].get(), Efield_cax[lev][2].get(),
                  Bfield_cax[lev][0].get(), Bfield_cax[lev][1].get(), Bfield_cax[lev][2].get(),
                  cur_time, dt[lev]);
+#ifdef WARPX_DIM_RZ
+    // This is called after all particles have deposited their current and charge.
+    ApplyInverseVolumeScalingToCurrentDensity(current_fp[lev][0].get(), current_fp[lev][1].get(), current_fp[lev][2].get(), lev);
+    if (current_buf[lev][0].get()) {
+        ApplyInverseVolumeScalingToCurrentDensity(current_buf[lev][0].get(), current_buf[lev][1].get(), current_buf[lev][2].get(), lev-1);
+    }
+    if (rho_fp[lev].get()) {
+        ApplyInverseVolumeScalingToChargeDensity(rho_fp[lev].get(), lev);
+        if (charge_buf[lev].get()) {
+            ApplyInverseVolumeScalingToChargeDensity(charge_buf[lev].get(), lev-1);
+        }
+    }
+#endif
 }
 
 void
@@ -491,7 +505,7 @@ WarpX::ComputeDt ()
 
     if (maxwell_fdtd_solver_id == 0) {
         // CFL time step Yee solver
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
         // Derived semi-analytically by R. Lehe
         deltat  = cfl * 1./( std::sqrt((1+0.2105)/(dx[0]*dx[0]) + 1./(dx[1]*dx[1])) * PhysConst::c );
 #else
@@ -536,10 +550,7 @@ WarpX::computeMaxStepBoostAccelerator(amrex::Geometry a_geom){
         WarpX::moving_window_dir == AMREX_SPACEDIM-1,
         "Can use zmax_plasma_to_compute_max_step only if " +
         "moving window along z. TODO: all directions.");
-    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-        maxLevel() == 0,
-        "Can use zmax_plasma_to_compute_max_step only if " +
-        "max level = 0.");
+
     AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
         (WarpX::boost_direction[0]-0)*(WarpX::boost_direction[0]-0) +
         (WarpX::boost_direction[1]-0)*(WarpX::boost_direction[1]-0) +
@@ -560,7 +571,12 @@ WarpX::computeMaxStepBoostAccelerator(amrex::Geometry a_geom){
     const Real interaction_time_boost = (len_plasma_boost-zmin_domain_boost)/
         (moving_window_v-v_plasma_boost);
     // Divide by dt, and update value of max_step.
-    const int computed_max_step = interaction_time_boost/dt[0];
+    int computed_max_step;
+    if (do_subcycling){
+        computed_max_step = interaction_time_boost/dt[0];
+    } else {
+        computed_max_step = interaction_time_boost/dt[maxLevel()];
+    }
     max_step = computed_max_step;
     Print()<<"max_step computed in computeMaxStepBoostAccelerator: "
            <<computed_max_step<<std::endl;
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/Make.package b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/Make.package
index c62c21f44..ee8376865 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/Make.package
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/Make.package
@@ -1,6 +1,8 @@
 CEXE_headers += SpectralBaseAlgorithm.H
 CEXE_headers += PsatdAlgorithm.H
 CEXE_sources += PsatdAlgorithm.cpp
+CEXE_headers += PMLPsatdAlgorithm.H
+CEXE_sources += PMLPsatdAlgorithm.cpp
 
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/FieldSolver/SpectralSolver/SpectralAlgorithms
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/FieldSolver/SpectralSolver/SpectralAlgorithms
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PMLPsatdAlgorithm.H b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PMLPsatdAlgorithm.H
new file mode 100644
index 000000000..a2511b6b7
--- /dev/null
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PMLPsatdAlgorithm.H
@@ -0,0 +1,34 @@
+#ifndef WARPX_PML_PSATD_ALGORITHM_H_
+#define WARPX_PML_PSATD_ALGORITHM_H_
+
+#include <SpectralBaseAlgorithm.H>
+
+/* \brief Class that updates the field in spectral space
+ * and stores the coefficients of the corresponding update equation.
+ */
+class PMLPsatdAlgorithm : public SpectralBaseAlgorithm
+{
+    public:
+        PMLPsatdAlgorithm(const SpectralKSpace& spectral_kspace,
+                         const amrex::DistributionMapping& dm,
+                         const int norder_x, const int norder_y,
+                         const int norder_z, const bool nodal,
+                         const amrex::Real dt);
+
+        void InitializeSpectralCoefficients(
+	    const SpectralKSpace& spectral_kspace,
+	    const amrex::DistributionMapping& dm, 
+	    const amrex::Real dt);
+
+        // Redefine functions from base class
+        virtual void pushSpectralFields(SpectralFieldData& f) const override final;
+        virtual int getRequiredNumberOfFields() const override final {
+            return SpectralPMLIndex::n_fields;
+        }
+
+    private:
+        SpectralCoefficients C_coef, S_ck_coef;
+
+};
+
+#endif // WARPX_PML_PSATD_ALGORITHM_H_
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PMLPsatdAlgorithm.cpp b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PMLPsatdAlgorithm.cpp
new file mode 100644
index 000000000..d76259d4c
--- /dev/null
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PMLPsatdAlgorithm.cpp
@@ -0,0 +1,146 @@
+#include <PMLPsatdAlgorithm.H>
+#include <WarpXConst.H>
+#include <cmath>
+
+using namespace amrex;
+
+/* \brief Initialize coefficients for the update equation */
+PMLPsatdAlgorithm::PMLPsatdAlgorithm(
+                         const SpectralKSpace& spectral_kspace,
+                         const DistributionMapping& dm,
+                         const int norder_x, const int norder_y,
+                         const int norder_z, const bool nodal, const Real dt)
+     // Initialize members of base class
+     : SpectralBaseAlgorithm( spectral_kspace, dm,
+                              norder_x, norder_y, norder_z, nodal )
+{
+    const BoxArray& ba = spectral_kspace.spectralspace_ba;
+
+    // Allocate the arrays of coefficients
+    C_coef = SpectralCoefficients(ba, dm, 1, 0);
+    S_ck_coef = SpectralCoefficients(ba, dm, 1, 0);
+
+    InitializeSpectralCoefficients(spectral_kspace, dm, dt);
+}
+
+/* Advance the E and B field in spectral space (stored in `f`)
+ * over one time step */
+void
+PMLPsatdAlgorithm::pushSpectralFields(SpectralFieldData& f) const{
+
+    // Loop over boxes
+    for (MFIter mfi(f.fields); mfi.isValid(); ++mfi){
+
+        const Box& bx = f.fields[mfi].box();
+
+        // Extract arrays for the fields to be updated
+        Array4<Complex> fields = f.fields[mfi].array();
+        // Extract arrays for the coefficients
+        Array4<const Real> C_arr = C_coef[mfi].array();
+        Array4<const Real> S_ck_arr = S_ck_coef[mfi].array();
+        // Extract pointers for the k vectors
+        const Real* modified_kx_arr = modified_kx_vec[mfi].dataPtr();
+#if (AMREX_SPACEDIM==3)
+        const Real* modified_ky_arr = modified_ky_vec[mfi].dataPtr();
+#endif
+        const Real* modified_kz_arr = modified_kz_vec[mfi].dataPtr();
+
+        // Loop over indices within one box
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            // Record old values of the fields to be updated
+            using Idx = SpectralPMLIndex;
+            const Complex Ex_old = fields(i,j,k,Idx::Exy) \
+                                 + fields(i,j,k,Idx::Exz);
+            const Complex Ey_old = fields(i,j,k,Idx::Eyx) \
+                                 + fields(i,j,k,Idx::Eyz);
+            const Complex Ez_old = fields(i,j,k,Idx::Ezx) \
+                                 + fields(i,j,k,Idx::Ezy);
+            const Complex Bx_old = fields(i,j,k,Idx::Bxy) \
+                                 + fields(i,j,k,Idx::Bxz);
+            const Complex By_old = fields(i,j,k,Idx::Byx) \
+                                 + fields(i,j,k,Idx::Byz);
+            const Complex Bz_old = fields(i,j,k,Idx::Bzx) \
+                                 + fields(i,j,k,Idx::Bzy);
+            // k vector values, and coefficients
+            const Real kx = modified_kx_arr[i];
+#if (AMREX_SPACEDIM==3)
+            const Real ky = modified_ky_arr[j];
+            const Real kz = modified_kz_arr[k];
+#else
+            constexpr Real ky = 0;
+            const Real kz = modified_kz_arr[j];
+#endif
+            constexpr Real c2 = PhysConst::c*PhysConst::c;
+            const Complex I = Complex{0,1};
+            const Real C = C_arr(i,j,k);
+            const Real S_ck = S_ck_arr(i,j,k);
+
+            // Update E
+            fields(i,j,k,Idx::Exy) = C*fields(i,j,k,Idx::Exy) + S_ck*c2*I*ky*Bz_old;
+            fields(i,j,k,Idx::Exz) = C*fields(i,j,k,Idx::Exz) - S_ck*c2*I*kz*By_old;
+            fields(i,j,k,Idx::Eyz) = C*fields(i,j,k,Idx::Eyz) + S_ck*c2*I*kz*Bx_old;
+            fields(i,j,k,Idx::Eyx) = C*fields(i,j,k,Idx::Eyx) - S_ck*c2*I*kx*Bz_old;
+            fields(i,j,k,Idx::Ezx) = C*fields(i,j,k,Idx::Ezx) + S_ck*c2*I*kx*By_old;
+            fields(i,j,k,Idx::Ezy) = C*fields(i,j,k,Idx::Ezy) - S_ck*c2*I*ky*Bx_old;
+            // Update B
+            fields(i,j,k,Idx::Bxy) = C*fields(i,j,k,Idx::Bxy) - S_ck*I*ky*Ez_old;
+            fields(i,j,k,Idx::Bxz) = C*fields(i,j,k,Idx::Bxz) + S_ck*I*kz*Ey_old;
+            fields(i,j,k,Idx::Byz) = C*fields(i,j,k,Idx::Byz) - S_ck*I*kz*Ex_old;
+            fields(i,j,k,Idx::Byx) = C*fields(i,j,k,Idx::Byx) + S_ck*I*kx*Ez_old;
+            fields(i,j,k,Idx::Bzx) = C*fields(i,j,k,Idx::Bzx) - S_ck*I*kx*Ey_old;
+            fields(i,j,k,Idx::Bzy) = C*fields(i,j,k,Idx::Bzy) + S_ck*I*ky*Ex_old;
+        });
+    }
+};
+
+void PMLPsatdAlgorithm::InitializeSpectralCoefficients (
+    const SpectralKSpace& spectral_kspace,
+    const amrex::DistributionMapping& dm,
+    const amrex::Real dt)
+{
+    const BoxArray& ba = spectral_kspace.spectralspace_ba;
+    // Fill them with the right values:
+    // Loop over boxes and allocate the corresponding coefficients
+    // for each box owned by the local MPI proc
+    for (MFIter mfi(ba, dm); mfi.isValid(); ++mfi){
+
+        const Box& bx = ba[mfi];
+
+        // Extract pointers for the k vectors
+        const Real* modified_kx = modified_kx_vec[mfi].dataPtr();
+#if (AMREX_SPACEDIM==3)
+        const Real* modified_ky = modified_ky_vec[mfi].dataPtr();
+#endif
+        const Real* modified_kz = modified_kz_vec[mfi].dataPtr();
+        // Extract arrays for the coefficients
+        Array4<Real> C = C_coef[mfi].array();
+        Array4<Real> S_ck = S_ck_coef[mfi].array();
+
+        // Loop over indices within one box
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            // Calculate norm of vector
+            const Real k_norm = std::sqrt(
+                std::pow(modified_kx[i], 2) +
+#if (AMREX_SPACEDIM==3)
+                std::pow(modified_ky[j], 2) +
+                std::pow(modified_kz[k], 2));
+#else
+                std::pow(modified_kz[j], 2));
+#endif
+
+            // Calculate coefficients
+            constexpr Real c = PhysConst::c;
+            if (k_norm != 0){
+                C(i,j,k) = std::cos(c*k_norm*dt);
+                S_ck(i,j,k) = std::sin(c*k_norm*dt)/(c*k_norm);
+            } else { // Handle k_norm = 0, by using the analytical limit
+                C(i,j,k) = 1.;
+                S_ck(i,j,k) = dt;
+            }
+        });
+    }
+};
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
index 12718e38b..825d04dc2 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
@@ -13,14 +13,18 @@ class PsatdAlgorithm : public SpectralBaseAlgorithm
         PsatdAlgorithm(const SpectralKSpace& spectral_kspace,
                          const amrex::DistributionMapping& dm,
                          const int norder_x, const int norder_y,
-                         const int norder_z, const bool nodal, const amrex::Real dt);
-
+                         const int norder_z, const bool nodal,
+                         const amrex::Real dt);
+        // Redefine functions from base class
+        virtual void pushSpectralFields(SpectralFieldData& f) const override final;
+        virtual int getRequiredNumberOfFields() const override final {
+            return SpectralFieldIndex::n_fields;
+        }
+        
         void InitializeSpectralCoefficients(const SpectralKSpace& spectral_kspace,
-                                    const amrex::DistributionMapping& dm, 
+                                    const amrex::DistributionMapping& dm,
                                     const amrex::Real dt);
 
-        void pushSpectralFields(SpectralFieldData& f) const override final;
-
     private:
         SpectralCoefficients C_coef, S_ck_coef, X1_coef, X2_coef, X3_coef;
 };
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/SpectralBaseAlgorithm.H b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/SpectralBaseAlgorithm.H
index 602eb2473..5d5e376c1 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/SpectralBaseAlgorithm.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/SpectralBaseAlgorithm.H
@@ -14,9 +14,9 @@
 class SpectralBaseAlgorithm
 {
     public:
-        // Member function that updates the fields in spectral space ;
-        // meant to be overridden in subclasses
+        // Virtual member function ; meant to be overridden in subclasses
         virtual void pushSpectralFields(SpectralFieldData& f) const = 0;
+        virtual int getRequiredNumberOfFields() const = 0;
         // The destructor should also be a virtual function, so that
         // a pointer to subclass of `SpectraBaseAlgorithm` actually
         // calls the subclass's destructor.
diff --git a/Source/FieldSolver/SpectralSolver/SpectralFieldData.H b/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
index 7954414b8..6a2446981 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
@@ -8,18 +8,24 @@
 // Declare type for spectral fields
 using SpectralField = amrex::FabArray< amrex::BaseFab <Complex> >;
 
-/* Index for the fields that will be stored in spectral space */
+/* Index for the regular fields, when stored in spectral space */
 struct SpectralFieldIndex {
   enum { Ex=0, Ey, Ez, Bx, By, Bz, Jx, Jy, Jz, rho_old, rho_new, n_fields };
   // n_fields is automatically the total number of fields
 };
 
+/* Index for the PML fields, when stored in spectral space */
+struct SpectralPMLIndex {
+  enum { Exy=0, Exz, Eyx, Eyz, Ezx, Ezy,
+           Bxy, Bxz, Byx, Byz, Bzx, Bzy, n_fields };
+  // n_fields is automatically the total number of fields
+};
+
 /* \brief Class that stores the fields in spectral space, and performs the
  *  Fourier transforms between real space and spectral space
  */
 class SpectralFieldData
 {
-    friend class PsatdAlgorithm;
 
     // Define the FFTplans type, which holds one fft plan per box
     // (plans are only initialized for the boxes that are owned by
@@ -32,8 +38,9 @@ class SpectralFieldData
 
     public:
         SpectralFieldData( const amrex::BoxArray& realspace_ba,
-                      const SpectralKSpace& k_space,
-                      const amrex::DistributionMapping& dm );
+                           const SpectralKSpace& k_space,
+                           const amrex::DistributionMapping& dm,
+                           const int n_field_required );
         SpectralFieldData() = default; // Default constructor
         SpectralFieldData& operator=(SpectralFieldData&& field_data) = default;
         ~SpectralFieldData();
@@ -41,10 +48,10 @@ class SpectralFieldData
                                const int field_index, const int i_comp);
         void BackwardTransform( amrex::MultiFab& mf,
                                const int field_index, const int i_comp);
-
-    private:
         // `fields` stores fields in spectral space, as multicomponent FabArray
         SpectralField fields;
+
+    private:
         // tmpRealField and tmpSpectralField store fields
         // right before/after the Fourier transform
         SpectralField tmpSpectralField; // contains Complexs
diff --git a/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp b/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
index 948baf0a6..8f0853484 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
@@ -5,14 +5,14 @@ using namespace amrex;
 /* \brief Initialize fields in spectral space, and FFT plans */
 SpectralFieldData::SpectralFieldData( const BoxArray& realspace_ba,
                             const SpectralKSpace& k_space,
-                            const DistributionMapping& dm )
+                            const DistributionMapping& dm,
+                            const int n_field_required )
 {
     const BoxArray& spectralspace_ba = k_space.spectralspace_ba;
 
     // Allocate the arrays that contain the fields in spectral space
     // (one component per field)
-    fields = SpectralField(spectralspace_ba, dm,
-                            SpectralFieldIndex::n_fields, 0);
+    fields = SpectralField(spectralspace_ba, dm, n_field_required, 0);
 
     // Allocate temporary arrays - in real space and spectral space
     // These arrays will store the data just before/after the FFT
diff --git a/Source/FieldSolver/SpectralSolver/SpectralSolver.H b/Source/FieldSolver/SpectralSolver/SpectralSolver.H
index d4019a9a3..c570b017b 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralSolver.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralSolver.H
@@ -23,7 +23,8 @@ class SpectralSolver
                         const amrex::DistributionMapping& dm,
                         const int norder_x, const int norder_y,
                         const int norder_z, const bool nodal,
-                        const amrex::RealVect dx, const amrex::Real dt );
+                        const amrex::RealVect dx, const amrex::Real dt,
+                        const bool pml=false );
 
         /* \brief Transform the component `i_comp` of MultiFab `mf`
          *  to spectral space, and store the corresponding result internally
diff --git a/Source/FieldSolver/SpectralSolver/SpectralSolver.cpp b/Source/FieldSolver/SpectralSolver/SpectralSolver.cpp
index c21c3cfb1..4b9def013 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralSolver.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralSolver.cpp
@@ -1,19 +1,29 @@
 #include <SpectralKSpace.H>
 #include <SpectralSolver.H>
 #include <PsatdAlgorithm.H>
+#include <PMLPsatdAlgorithm.H>
 
 /* \brief Initialize the spectral Maxwell solver
  *
  * This function selects the spectral algorithm to be used, allocates the
  * corresponding coefficients for the discretized field update equation,
  * and prepares the structures that store the fields in spectral space.
+ *
+ * \param norder_x Order of accuracy of the spatial derivatives along x
+ * \param norder_y Order of accuracy of the spatial derivatives along y
+ * \param norder_z Order of accuracy of the spatial derivatives along z
+ * \param nodal    Whether the solver is applied to a nodal or staggered grid
+ * \param dx       Cell size along each dimension
+ * \param dt       Time step
+ * \param pml      Whether the boxes in which the solver is applied are PML boxes
  */
 SpectralSolver::SpectralSolver(
                 const amrex::BoxArray& realspace_ba,
                 const amrex::DistributionMapping& dm,
                 const int norder_x, const int norder_y,
                 const int norder_z, const bool nodal,
-                const amrex::RealVect dx, const amrex::Real dt ) {
+                const amrex::RealVect dx, const amrex::Real dt,
+                const bool pml ) {
 
     // Initialize all structures using the same distribution mapping dm
 
@@ -24,12 +34,16 @@ SpectralSolver::SpectralSolver(
 
     // - Select the algorithm depending on the input parameters
     //   Initialize the corresponding coefficients over k space
-    // TODO: Add more algorithms + selection depending on input parameters
-    //       For the moment, this only uses the standard PsatdAlgorithm
-    algorithm = std::unique_ptr<PsatdAlgorithm>( new PsatdAlgorithm(
+    if (pml) {
+        algorithm = std::unique_ptr<PMLPsatdAlgorithm>( new PMLPsatdAlgorithm(
+            k_space, dm, norder_x, norder_y, norder_z, nodal, dt ) );
+    } else {
+        algorithm = std::unique_ptr<PsatdAlgorithm>( new PsatdAlgorithm(
             k_space, dm, norder_x, norder_y, norder_z, nodal, dt ) );
+    }
 
     // - Initialize arrays for fields in spectral space + FFT plans
-    field_data = SpectralFieldData( realspace_ba, k_space, dm );
+    field_data = SpectralFieldData( realspace_ba, k_space, dm,
+            algorithm->getRequiredNumberOfFields() );
 
 };
diff --git a/Source/FieldSolver/WarpXPushFieldsEM.cpp b/Source/FieldSolver/WarpXPushFieldsEM.cpp
index 4fce4717b..1df05bc0f 100644
--- a/Source/FieldSolver/WarpXPushFieldsEM.cpp
+++ b/Source/FieldSolver/WarpXPushFieldsEM.cpp
@@ -18,6 +18,40 @@
 using namespace amrex;
 
 #ifdef WARPX_USE_PSATD
+namespace {
+    void
+    PushPSATDSinglePatch (
+        SpectralSolver& solver,
+        std::array<std::unique_ptr<amrex::MultiFab>,3>& Efield,
+        std::array<std::unique_ptr<amrex::MultiFab>,3>& Bfield,
+        std::array<std::unique_ptr<amrex::MultiFab>,3>& current,
+        std::unique_ptr<amrex::MultiFab>& rho ) {
+
+        using Idx = SpectralFieldIndex;
+
+        // Perform forward Fourier transform
+        solver.ForwardTransform(*Efield[0], Idx::Ex);
+        solver.ForwardTransform(*Efield[1], Idx::Ey);
+        solver.ForwardTransform(*Efield[2], Idx::Ez);
+        solver.ForwardTransform(*Bfield[0], Idx::Bx);
+        solver.ForwardTransform(*Bfield[1], Idx::By);
+        solver.ForwardTransform(*Bfield[2], Idx::Bz);
+        solver.ForwardTransform(*current[0], Idx::Jx);
+        solver.ForwardTransform(*current[1], Idx::Jy);
+        solver.ForwardTransform(*current[2], Idx::Jz);
+        solver.ForwardTransform(*rho, Idx::rho_old, 0);
+        solver.ForwardTransform(*rho, Idx::rho_new, 1);
+        // Advance fields in spectral space
+        solver.pushSpectralFields();
+        // Perform backward Fourier Transform
+        solver.BackwardTransform(*Efield[0], Idx::Ex);
+        solver.BackwardTransform(*Efield[1], Idx::Ey);
+        solver.BackwardTransform(*Efield[2], Idx::Ez);
+        solver.BackwardTransform(*Bfield[0], Idx::Bx);
+        solver.BackwardTransform(*Bfield[1], Idx::By);
+        solver.BackwardTransform(*Bfield[2], Idx::Bz);
+    }
+}
 
 void
 WarpX::PushPSATD (amrex::Real a_dt)
@@ -31,38 +65,25 @@ WarpX::PushPSATD (amrex::Real a_dt)
         } else {
             PushPSATD_localFFT(lev, a_dt);
         }
+
+        // Evolve the fields in the PML boxes
+        if (do_pml && pml[lev]->ok()) {
+            pml[lev]->PushPSATD();
+        }
     }
 }
 
-void WarpX::PushPSATD_localFFT (int lev, amrex::Real /* dt */)
+void
+WarpX::PushPSATD_localFFT (int lev, amrex::Real /* dt */)
 {
-    auto& solver = *spectral_solver_fp[lev];
-
-    // Perform forward Fourier transform
-    solver.ForwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
-    solver.ForwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
-    solver.ForwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
-    solver.ForwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
-    solver.ForwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
-    solver.ForwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
-    solver.ForwardTransform(*current_fp[lev][0], SpectralFieldIndex::Jx);
-    solver.ForwardTransform(*current_fp[lev][1], SpectralFieldIndex::Jy);
-    solver.ForwardTransform(*current_fp[lev][2], SpectralFieldIndex::Jz);
-    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_old, 0);
-    solver.ForwardTransform(*rho_fp[lev], SpectralFieldIndex::rho_new, 1);
-
-    // Advance fields in spectral space
-    solver.pushSpectralFields();
-
-    // Perform backward Fourier Transform
-    solver.BackwardTransform(*Efield_fp[lev][0], SpectralFieldIndex::Ex);
-    solver.BackwardTransform(*Efield_fp[lev][1], SpectralFieldIndex::Ey);
-    solver.BackwardTransform(*Efield_fp[lev][2], SpectralFieldIndex::Ez);
-    solver.BackwardTransform(*Bfield_fp[lev][0], SpectralFieldIndex::Bx);
-    solver.BackwardTransform(*Bfield_fp[lev][1], SpectralFieldIndex::By);
-    solver.BackwardTransform(*Bfield_fp[lev][2], SpectralFieldIndex::Bz);
+    // Update the fields on the fine and coarse patch
+    PushPSATDSinglePatch( *spectral_solver_fp[lev],
+        Efield_fp[lev], Bfield_fp[lev], current_fp[lev], rho_fp[lev] );
+    if (spectral_solver_cp[lev]) {
+        PushPSATDSinglePatch( *spectral_solver_cp[lev],
+             Efield_cp[lev], Bfield_cp[lev], current_cp[lev], rho_cp[lev] );
+    }
 }
-
 #endif
 
 void
@@ -560,3 +581,143 @@ WarpX::EvolveF (int lev, PatchType patch_type, Real a_dt, DtType a_dt_type)
     }
 }
 
+#ifdef WARPX_DIM_RZ
+// This scales the current by the inverse volume and wraps around the depostion at negative radius.
+// It is faster to apply this on the grid than to do it particle by particle.
+// It is put here since there isn't another nice place for it.
+void
+WarpX::ApplyInverseVolumeScalingToCurrentDensity (MultiFab* Jx, MultiFab* Jy, MultiFab* Jz, int lev)
+{
+    const long ngJ = Jx->nGrow();
+    const std::array<Real,3>& dx = WarpX::CellSize(lev);
+    const Real dr = dx[0];
+
+    Box tilebox;
+
+    for ( MFIter mfi(*Jx, TilingIfNotGPU()); mfi.isValid(); ++mfi )
+    {
+
+        Array4<Real> const& Jr_arr = Jx->array(mfi);
+        Array4<Real> const& Jt_arr = Jy->array(mfi);
+        Array4<Real> const& Jz_arr = Jz->array(mfi);
+
+        tilebox = mfi.tilebox();
+        Box tbr = convert(tilebox, WarpX::jx_nodal_flag);
+        Box tbt = convert(tilebox, WarpX::jy_nodal_flag);
+        Box tbz = convert(tilebox, WarpX::jz_nodal_flag);
+
+        // Lower corner of tile box physical domain
+        // Note that this is done before the tilebox.grow so that
+        // these do not include the guard cells.
+        const std::array<Real, 3>& xyzmin = WarpX::LowerCorner(tilebox, lev);
+        const Dim3 lo = lbound(tilebox);
+        const Real rmin = xyzmin[0];
+        const int irmin = lo.x;
+
+        // Rescale current in r-z mode since the inverse volume factor was not
+        // included in the current deposition.
+        amrex::ParallelFor(tbr,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            // Wrap the current density deposited in the guard cells around
+            // to the cells above the axis.
+            // Note that Jr(i==0) is at 1/2 dr.
+            if (rmin == 0. && 0 <= i && i < ngJ) {
+                Jr_arr(i,j,0) -= Jr_arr(-1-i,j,0);
+            }
+            // Apply the inverse volume scaling
+            // Since Jr is not cell centered in r, no need for distinction
+            // between on axis and off-axis factors
+            const amrex::Real r = std::abs(rmin + (i - irmin + 0.5)*dr);
+            Jr_arr(i,j,0) /= (2.*MathConst::pi*r);
+        });
+        amrex::ParallelFor(tbt,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            // Wrap the current density deposited in the guard cells around
+            // to the cells above the axis.
+            // Jt is located on the boundary
+            if (rmin == 0. && 0 < i && i <= ngJ) {
+                Jt_arr(i,j,0) += Jt_arr(-i,j,0);
+            }
+
+            // Apply the inverse volume scaling
+            // Jt is forced to zero on axis.
+            const amrex::Real r = std::abs(rmin + (i - irmin)*dr);
+            if (r == 0.) {
+                Jt_arr(i,j,0) = 0.;
+            } else {
+                Jt_arr(i,j,0) /= (2.*MathConst::pi*r);
+            }
+        });
+        amrex::ParallelFor(tbz,
+        [=] AMREX_GPU_DEVICE (int i, int j, int k)
+        {
+            // Wrap the current density deposited in the guard cells around
+            // to the cells above the axis.
+            // Jz is located on the boundary
+            if (rmin == 0. && 0 < i && i <= ngJ) {
+                Jz_arr(i,j,0) += Jz_arr(-i,j,0);
+            }
+
+            // Apply the inverse volume scaling
+            const amrex::Real r = std::abs(rmin + (i - irmin)*dr);
+            if (r == 0.) {
+                // Verboncoeur JCP 164, 421-427 (2001) : corrected volume on axis
+                Jz_arr(i,j,0) /= (MathConst::pi*dr/3.);
+            } else {
+                Jz_arr(i,j,0) /= (2.*MathConst::pi*r);
+            }
+        });
+    }
+}
+
+void
+WarpX::ApplyInverseVolumeScalingToChargeDensity (MultiFab* Rho, int lev)
+{
+    const long ngRho = Rho->nGrow();
+    const std::array<Real,3>& dx = WarpX::CellSize(lev);
+    const Real dr = dx[0];
+
+    Box tilebox;
+
+    for ( MFIter mfi(*Rho, TilingIfNotGPU()); mfi.isValid(); ++mfi )
+    {
+
+        Array4<Real> const& Rho_arr = Rho->array(mfi);
+
+        tilebox = mfi.tilebox();
+        Box tb = convert(tilebox, IntVect::TheUnitVector());
+
+        // Lower corner of tile box physical domain
+        // Note that this is done before the tilebox.grow so that
+        // these do not include the guard cells.
+        const std::array<Real, 3>& xyzmin = WarpX::LowerCorner(tilebox, lev);
+        const Dim3 lo = lbound(tilebox);
+        const Real rmin = xyzmin[0];
+        const int irmin = lo.x;
+
+        // Rescale charge in r-z mode since the inverse volume factor was not
+        // included in the charge deposition.
+        amrex::ParallelFor(tb, Rho->nComp(),
+        [=] AMREX_GPU_DEVICE (int i, int j, int k, int icomp)
+        {
+            // Wrap the charge density deposited in the guard cells around
+            // to the cells above the axis.
+            // Rho is located on the boundary
+            if (rmin == 0. && 0 < i && i <= ngRho) {
+                Rho_arr(i,j,0,icomp) += Rho_arr(-i,j,0,icomp);
+            }
+
+            // Apply the inverse volume scaling
+            const amrex::Real r = std::abs(rmin + (i - irmin)*dr);
+            if (r == 0.) {
+                // Verboncoeur JCP 164, 421-427 (2001) : corrected volume on axis
+                Rho_arr(i,j,0,icomp) /= (MathConst::pi*dr/3.);
+            } else {
+                Rho_arr(i,j,0,icomp) /= (2.*MathConst::pi*r);
+            }
+        });
+    }
+}
+#endif
diff --git a/Source/FortranInterface/WarpX_f.H b/Source/FortranInterface/WarpX_f.H
index 0440148eb..aac23f781 100644
--- a/Source/FortranInterface/WarpX_f.H
+++ b/Source/FortranInterface/WarpX_f.H
@@ -62,7 +62,7 @@
 #define WRPX_PUSH_LEAPFROG               warpx_push_leapfrog_2d
 #define WRPX_PUSH_LEAPFROG_POSITIONS     warpx_push_leapfrog_positions_2d
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
 #define WRPX_COMPUTE_DIVE                warpx_compute_dive_rz
 #else
 #define WRPX_COMPUTE_DIVE                warpx_compute_dive_2d
@@ -75,22 +75,6 @@ extern "C"
 {
 #endif
 
-	// Charge deposition
-	void warpx_charge_deposition(amrex::Real* rho,
-            const long* np,	const amrex::Real* xp, const amrex::Real* yp, const amrex::Real* zp,	const amrex::Real* w,
-			const amrex::Real* q, const amrex::Real* xmin, const amrex::Real* ymin, const amrex::Real* zmin,
-			const amrex::Real* dx, const amrex::Real* dy, const amrex::Real* dz,
-			const long* nx, const long* ny, const long* nz,
-			const long* nxguard, const long* nyguard, const long* nzguard,
-			const long* nox, const long* noy,const long* noz,
-			const long* lvect, const long* charge_depo_algo);
-
-        // Charge deposition finalize for RZ
-        void warpx_charge_deposition_rz_volume_scaling(
-			amrex::Real* rho, const long* rho_ng, const int* rho_ntot,
-			const amrex::Real* rmin,
-			const amrex::Real* dr);
-
 	// Current deposition
 	void warpx_current_deposition(
 			amrex::Real* jx, const long* jx_ng, const int* jx_ntot,
@@ -106,34 +90,6 @@ extern "C"
 			const long* nox, const long* noy,const long* noz,
                         const int* l_nodal, const long* lvect, const long* current_depo_algo);
 
-        // Current deposition finalize for RZ
-        void warpx_current_deposition_rz_volume_scaling(
-			amrex::Real* jx, const long* jx_ng, const int* jx_ntot,
-			amrex::Real* jy, const long* jy_ng, const int* jy_ntot,
-			amrex::Real* jz, const long* jz_ng, const int* jz_ntot,
-			const amrex::Real* rmin,
-			const amrex::Real* dr);
-
-	// Field gathering
-
-	void warpx_geteb_energy_conserving(const long* np,
-			const amrex::Real* xp, const amrex::Real* yp, const amrex::Real* zp,
-			amrex::Real* exp, amrex::Real* eyp, amrex::Real* ezp,
-                        amrex::Real* bxp, amrex::Real* byp, amrex::Real* bzp,
-                        const int* ixyzmin,
-                        const amrex::Real* xmin, const amrex::Real* ymin, const amrex::Real* zmin,
-                        const amrex::Real* dx, const amrex::Real* dy, const amrex::Real* dz,
-			const long* nox, const long* noy, const long* noz,
-			const amrex::Real* exg, const int* exg_lo, const int* exg_hi,
-			const amrex::Real* eyg, const int* eyg_lo, const int* eyg_hi,
-			const amrex::Real* ezg, const int* ezg_lo, const int* ezg_hi,
-			const amrex::Real* bxg, const int* bxg_lo, const int* bxg_hi,
-			const amrex::Real* byg, const int* byg_lo, const int* byg_hi,
-			const amrex::Real* bzg, const int* bzg_lo, const int* bzg_hi,
-			const int* ll4symtry, const int* l_lower_order_in_v,
-                        const int* l_nodal, const long* lvect,
-			const long* field_gathe_algo);
-
 	// Particle pusher (velocity and position)
 
 	void warpx_particle_pusher(const long* np,
@@ -342,7 +298,7 @@ extern "C"
                              const BL_FORT_FAB_ARG_ANYD(ey),
                              const BL_FORT_FAB_ARG_ANYD(ez),
                              const amrex::Real* dx
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
                              ,const amrex::Real* rmin
 #endif
                              );
diff --git a/Source/FortranInterface/WarpX_picsar.F90 b/Source/FortranInterface/WarpX_picsar.F90
index dc47245dd..34084d753 100644
--- a/Source/FortranInterface/WarpX_picsar.F90
+++ b/Source/FortranInterface/WarpX_picsar.F90
@@ -1,20 +1,15 @@
 #if (AMREX_SPACEDIM == 3)
 
-#define WRPX_PXR_GETEB_ENERGY_CONSERVING  geteb3d_energy_conserving_generic
 #define WRPX_PXR_CURRENT_DEPOSITION       depose_jxjyjz_generic
 
 #elif (AMREX_SPACEDIM == 2)
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
 
-#define WRPX_PXR_GETEB_ENERGY_CONSERVING  geteb2drz_energy_conserving_generic
 #define WRPX_PXR_CURRENT_DEPOSITION       depose_jrjtjz_generic_rz
-#define WRPX_PXR_RZ_VOLUME_SCALING_RHO    apply_rz_volume_scaling_rho
-#define WRPX_PXR_RZ_VOLUME_SCALING_J      apply_rz_volume_scaling_j
 
 #else
 
-#define WRPX_PXR_GETEB_ENERGY_CONSERVING  geteb2dxz_energy_conserving_generic
 #define WRPX_PXR_CURRENT_DEPOSITION       depose_jxjyjz_generic_2d
 
 #endif
@@ -56,227 +51,6 @@ contains
   ! _________________________________________________________________
   !>
   !> @brief
-  !> Main subroutine for the field gathering process
-  !>
-  !> @param[in] np number of particles
-  !> @param[in] xp,yp,zp particle position arrays
-  !> @param[in] ex,ey,ez particle electric fields in each direction
-  !> @param[in] bx,by,bz particle magnetic fields in each direction
-  !> @param[in] ixyzmin tile grid minimum index
-  !> @param[in] xmin,ymin,zmin tile grid minimum position
-  !> @param[in] dx,dy,dz space discretization steps
-  !> @param[in] xyzmin grid minimum position
-  !> @param[in] dxyz space discretization steps
-  !> @param[in] nox,noy,noz interpolation order
-  !> @param[in] exg,eyg,ezg electric field grid arrays
-  !> @param[in] bxg,byg,bzg electric field grid arrays
-  !> @param[in] lvect vector length
-  !>
-  subroutine warpx_geteb_energy_conserving(np,xp,yp,zp, &
-       ex,ey,ez,bx,by,bz,ixyzmin,xmin,ymin,zmin,dx,dy,dz,nox,noy,noz, &
-       exg,exg_lo,exg_hi,eyg,eyg_lo,eyg_hi,ezg,ezg_lo,ezg_hi, &
-       bxg,bxg_lo,bxg_hi,byg,byg_lo,byg_hi,bzg,bzg_lo,bzg_hi, &
-       ll4symtry,l_lower_order_in_v, l_nodal,&
-       lvect,field_gathe_algo) &
-       bind(C, name="warpx_geteb_energy_conserving")
-
-    integer, intent(in) :: exg_lo(AMREX_SPACEDIM), eyg_lo(AMREX_SPACEDIM), ezg_lo(AMREX_SPACEDIM), &
-                           bxg_lo(AMREX_SPACEDIM), byg_lo(AMREX_SPACEDIM), bzg_lo(AMREX_SPACEDIM)
-    integer, intent(in) :: exg_hi(AMREX_SPACEDIM), eyg_hi(AMREX_SPACEDIM), ezg_hi(AMREX_SPACEDIM), &
-                           bxg_hi(AMREX_SPACEDIM), byg_hi(AMREX_SPACEDIM), bzg_hi(AMREX_SPACEDIM)
-    integer, intent(in) :: ixyzmin(AMREX_SPACEDIM)
-    real(amrex_real), intent(in) :: xmin,ymin,zmin,dx,dy,dz
-    integer(c_long), intent(in) :: field_gathe_algo
-    integer(c_long), intent(in) :: np,nox,noy,noz
-    integer(c_int), intent(in)  :: ll4symtry,l_lower_order_in_v, l_nodal
-    integer(c_long),intent(in)   :: lvect
-    real(amrex_real), intent(in), dimension(np) :: xp,yp,zp
-    real(amrex_real), intent(out), dimension(np) :: ex,ey,ez,bx,by,bz
-    real(amrex_real),intent(in):: exg(*), eyg(*), ezg(*), bxg(*), byg(*), bzg(*)
-    logical(pxr_logical) :: pxr_ll4symtry, pxr_l_lower_order_in_v, pxr_l_nodal
-
-    ! Compute the number of valid cells and guard cells
-    integer(c_long) :: exg_nvalid(AMREX_SPACEDIM), eyg_nvalid(AMREX_SPACEDIM), ezg_nvalid(AMREX_SPACEDIM),    &
-                       bxg_nvalid(AMREX_SPACEDIM), byg_nvalid(AMREX_SPACEDIM), bzg_nvalid(AMREX_SPACEDIM),    &
-                       exg_nguards(AMREX_SPACEDIM), eyg_nguards(AMREX_SPACEDIM), ezg_nguards(AMREX_SPACEDIM), &
-                       bxg_nguards(AMREX_SPACEDIM), byg_nguards(AMREX_SPACEDIM), bzg_nguards(AMREX_SPACEDIM)
-
-    pxr_ll4symtry = ll4symtry .eq. 1
-    pxr_l_lower_order_in_v = l_lower_order_in_v .eq. 1
-    pxr_l_nodal = l_nodal .eq. 1
-
-    exg_nguards = ixyzmin - exg_lo
-    eyg_nguards = ixyzmin - eyg_lo
-    ezg_nguards = ixyzmin - ezg_lo
-    bxg_nguards = ixyzmin - bxg_lo
-    byg_nguards = ixyzmin - byg_lo
-    bzg_nguards = ixyzmin - bzg_lo
-    exg_nvalid = exg_lo + exg_hi - 2_c_long*ixyzmin + 1_c_long
-    eyg_nvalid = eyg_lo + eyg_hi - 2_c_long*ixyzmin + 1_c_long
-    ezg_nvalid = ezg_lo + ezg_hi - 2_c_long*ixyzmin + 1_c_long
-    bxg_nvalid = bxg_lo + bxg_hi - 2_c_long*ixyzmin + 1_c_long
-    byg_nvalid = byg_lo + byg_hi - 2_c_long*ixyzmin + 1_c_long
-    bzg_nvalid = bzg_lo + bzg_hi - 2_c_long*ixyzmin + 1_c_long
-
-    CALL WRPX_PXR_GETEB_ENERGY_CONSERVING(np,xp,yp,zp, &
-         ex,ey,ez,bx,by,bz,xmin,ymin,zmin,dx,dy,dz,nox,noy,noz, &
-         exg,exg_nguards,exg_nvalid,&
-         eyg,eyg_nguards,eyg_nvalid,&
-         ezg,ezg_nguards,ezg_nvalid,&
-         bxg,bxg_nguards,bxg_nvalid,&
-         byg,byg_nguards,byg_nvalid,&
-         bzg,bzg_nguards,bzg_nvalid,&
-	 pxr_ll4symtry, pxr_l_lower_order_in_v, pxr_l_nodal, &
-	 lvect, field_gathe_algo )
-
-  end subroutine warpx_geteb_energy_conserving
-
-! _________________________________________________________________
-!>
-!> @brief
-!> Main subroutine for the charge deposition
-!>
-!> @details
-!> This subroutines enable to controle the interpolation order
-!> via the parameters nox,noy,noz and the type of algorithm via
-!> the parameter charge_depo_algo
-!
-!> @param[inout] rho charge array
-!> @param[in] np number of particles
-!> @param[in] xp,yp,zp particle position arrays
-!> @param[in] w particle weight arrays
-!> @param[in] q particle species charge
-!> @param[in] xmin,ymin,zmin tile grid minimum position
-!> @param[in] dx,dy,dz space discretization steps
-!> @param[in] nx,ny,nz number of cells
-!> @param[in] nxguard,nyguard,nzguard number of guard cells
-!> @param[in] nox,noy,noz interpolation order
-!> @param[in] lvect vector length
-!> @param[in] charge_depo_algo algorithm choice for the charge deposition
-!>
-subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-   nxguard,nyguard,nzguard,nox,noy,noz,lvect,charge_depo_algo) &
-  bind(C, name="warpx_charge_deposition")
-
-  integer(c_long), intent(IN)                   :: np
-  integer(c_long), intent(IN)                   :: nx,ny,nz
-  integer(c_long), intent(IN)                   :: nxguard,nyguard,nzguard
-  integer(c_long), intent(IN)                   :: nox,noy,noz
-  real(amrex_real), intent(IN OUT)              :: rho(*)
-  real(amrex_real), intent(IN)                  :: q
-  real(amrex_real), intent(IN)                  :: dx,dy,dz
-  real(amrex_real), intent(IN)                  :: xmin,ymin,zmin
-  real(amrex_real), intent(IN),  dimension(np)  :: xp,yp,zp,w
-  integer(c_long), intent(IN)                   :: lvect
-  integer(c_long), intent(IN)                   :: charge_depo_algo
-
-
-  ! Dimension 3
-#if (AMREX_SPACEDIM==3)
-
-  SELECT CASE(charge_depo_algo)
-
-  ! Scalar classical charge deposition subroutines
-  CASE(1)
-    IF ((nox.eq.1).and.(noy.eq.1).and.(noz.eq.1)) THEN
-
-      CALL depose_rho_scalar_1_1_1(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-    nxguard,nyguard,nzguard,lvect)
-
-    ELSE IF ((nox.eq.2).and.(noy.eq.2).and.(noz.eq.2)) THEN
-
-      CALL depose_rho_scalar_2_2_2(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-    nxguard,nyguard,nzguard,lvect)
-
-    ELSE IF ((nox.eq.3).and.(noy.eq.3).and.(noz.eq.3)) THEN
-
-      CALL depose_rho_scalar_3_3_3(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-    nxguard,nyguard,nzguard,lvect)
-
-    ELSE
-      CALL pxr_depose_rho_n(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-                  nxguard,nyguard,nzguard,nox,noy,noz, &
-                  .TRUE._c_long,.FALSE._c_long)
-    ENDIF
-
-  ! Optimized subroutines
-  CASE DEFAULT
-
-    IF ((nox.eq.1).and.(noy.eq.1).and.(noz.eq.1)) THEN
-      CALL depose_rho_vecHVv2_1_1_1(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-               nxguard,nyguard,nzguard,lvect)
-
-    ELSE IF ((nox.eq.2).and.(noy.eq.2).and.(noz.eq.2)) THEN
-      CALL depose_rho_vecHVv2_2_2_2(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-                 nxguard,nyguard,nzguard,lvect)
-
-    ELSE
-      CALL pxr_depose_rho_n(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
-                  nxguard,nyguard,nzguard,nox,noy,noz, &
-                  .TRUE._c_long,.FALSE._c_long)
-    ENDIF
-  END SELECT
-
-  ! Dimension 2
-#elif (AMREX_SPACEDIM==2)
-
-#ifdef WARPX_RZ
-  logical(pxr_logical) :: l_2drz = .TRUE._c_long
-#else
-  logical(pxr_logical) :: l_2drz = .FALSE._c_long
-#endif
-
-  CALL pxr_depose_rho_n_2dxz(rho,np,xp,yp,zp,w,q,xmin,zmin,dx,dz,nx,nz,&
-       nxguard,nzguard,nox,noz, &
-       .TRUE._c_long, .FALSE._c_long, l_2drz, 0_c_long)
-
-#endif
-
- end subroutine warpx_charge_deposition
-
-  ! _________________________________________________________________
-  !>
-  !> @brief
-  !> Applies the inverse volume scaling for RZ charge deposition
-  !>
-  !> @details
-  !> The scaling is done for both single mode (FDTD) and
-  !> multi mode (spectral) (todo)
-  !
-  !> @param[inout] rho charge array
-  !> @param[in] rmin tile grid minimum radius
-  !> @param[in] dr radial space discretization steps
-  !> @param[in] nx,ny,nz number of cells
-  !> @param[in] nxguard,nyguard,nzguard number of guard cells
-  !>
-  subroutine warpx_charge_deposition_rz_volume_scaling(rho,rho_ng,rho_ntot,rmin,dr) &
-    bind(C, name="warpx_charge_deposition_rz_volume_scaling")
-
-    integer, intent(in) :: rho_ntot(AMREX_SPACEDIM)
-    integer(c_long), intent(in) :: rho_ng
-    real(amrex_real), intent(IN OUT):: rho(*)
-    real(amrex_real), intent(IN) :: rmin, dr
-
-#ifdef WARPX_RZ
-    integer(c_long) :: type_rz_depose = 1
-#endif
-
-    ! Compute the number of valid cells and guard cells
-    integer(c_long) :: rho_nvalid(AMREX_SPACEDIM), rho_nguards(AMREX_SPACEDIM)
-    rho_nvalid = rho_ntot - 2*rho_ng
-    rho_nguards = rho_ng
-
-#ifdef WARPX_RZ
-    CALL WRPX_PXR_RZ_VOLUME_SCALING_RHO(   &
-                 rho,rho_nguards,rho_nvalid, &
-                 rmin,dr,type_rz_depose)
-#endif
-
-  end subroutine warpx_charge_deposition_rz_volume_scaling
-
-  ! _________________________________________________________________
-  !>
-  !> @brief
   !> Main subroutine for the current deposition
   !>
   !> @details
@@ -355,165 +129,4 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
 
   end subroutine warpx_current_deposition
 
-  ! _________________________________________________________________
-  !>
-  !> @brief
-  !> Applies the inverse volume scaling for RZ current deposition
-  !>
-  !> @details
-  !> The scaling is done for single mode only
-  !
-  !> @param[inout] jx,jy,jz current arrays
-  !> @param[in] jx_ntot,jy_ntot,jz_ntot vectors with total number of
-  !>            cells (including guard cells) along each axis for each current
-  !> @param[in] jx_ng,jy_ng,jz_ng vectors with number of guard cells along each
-  !>            axis for each current
-  !> @param[in] rmin tile grid minimum radius
-  !> @param[in] dr radial space discretization steps
-  !>
-  subroutine warpx_current_deposition_rz_volume_scaling( &
-    jx,jx_ng,jx_ntot,jy,jy_ng,jy_ntot,jz,jz_ng,jz_ntot, &
-    rmin,dr) &
-    bind(C, name="warpx_current_deposition_rz_volume_scaling")
-
-    integer, intent(in) :: jx_ntot(AMREX_SPACEDIM), jy_ntot(AMREX_SPACEDIM), jz_ntot(AMREX_SPACEDIM)
-    integer(c_long), intent(in) :: jx_ng, jy_ng, jz_ng
-    real(amrex_real), intent(IN OUT):: jx(*), jy(*), jz(*)
-    real(amrex_real), intent(IN) :: rmin, dr
-
-#ifdef WARPX_RZ
-    integer(c_long) :: type_rz_depose = 1
-#endif
-    ! Compute the number of valid cells and guard cells
-    integer(c_long) :: jx_nvalid(AMREX_SPACEDIM), jy_nvalid(AMREX_SPACEDIM), jz_nvalid(AMREX_SPACEDIM), &
-                       jx_nguards(AMREX_SPACEDIM), jy_nguards(AMREX_SPACEDIM), jz_nguards(AMREX_SPACEDIM)
-    jx_nvalid = jx_ntot - 2*jx_ng
-    jy_nvalid = jy_ntot - 2*jy_ng
-    jz_nvalid = jz_ntot - 2*jz_ng
-    jx_nguards = jx_ng
-    jy_nguards = jy_ng
-    jz_nguards = jz_ng
-
-#ifdef WARPX_RZ
-    CALL WRPX_PXR_RZ_VOLUME_SCALING_J(   &
-                 jx,jx_nguards,jx_nvalid, &
-                 jy,jy_nguards,jy_nvalid, &
-                 jz,jz_nguards,jz_nvalid, &
-                 rmin,dr,type_rz_depose)
-#endif
-
-  end subroutine warpx_current_deposition_rz_volume_scaling
-
-  ! _________________________________________________________________
-  !>
-  !> @brief
-  !> Main subroutine for the particle pusher (velocity and position)
-  !>
-  !> @param[in] np number of super-particles
-  !> @param[in] xp,yp,zp particle position arrays
-  !> @param[in] uxp,uyp,uzp normalized momentum in each direction
-  !> @param[in] gaminv particle Lorentz factors
-  !> @param[in] ex,ey,ez particle electric fields in each direction
-  !> @param[in] bx,by,bz particle magnetic fields in each direction
-  !> @param[in] q charge
-  !> @param[in] m masse
-  !> @param[in] dt time step
-  !> @param[in] particle_pusher_algo Particle pusher algorithm
-  subroutine warpx_particle_pusher(np,xp,yp,zp,uxp,uyp,uzp, &
-                                  gaminv,&
-                                  ex,ey,ez,bx,by,bz,q,m,dt, &
-                                  particle_pusher_algo) &
-       bind(C, name="warpx_particle_pusher")
-
-    INTEGER(c_long), INTENT(IN)   :: np
-    REAL(amrex_real),INTENT(INOUT)    :: gaminv(np)
-    REAL(amrex_real),INTENT(INOUT)    :: xp(np),yp(np),zp(np)
-    REAL(amrex_real),INTENT(INOUT)    :: uxp(np),uyp(np),uzp(np)
-    REAL(amrex_real),INTENT(IN)       :: ex(np),ey(np),ez(np)
-    REAL(amrex_real),INTENT(IN)       :: bx(np),by(np),bz(np)
-    REAL(amrex_real),INTENT(IN)       :: q,m,dt
-    INTEGER(c_long), INTENT(IN)   :: particle_pusher_algo
-
-    SELECT CASE (particle_pusher_algo)
-
-    !! Vay pusher -- Full push
-    CASE (1_c_long)
-      CALL pxr_set_gamma(np,uxp,uyp,uzp,gaminv)
-
-      CALL pxr_ebcancelpush3d(np,uxp,uyp,uzp,gaminv, &
-                                 ex,ey,ez,  &
-                                 bx,by,bz,q,m,dt,0_c_long)
-    CASE DEFAULT
-
-      ! Momentum pusher in a single loop
-      CALL pxr_boris_push_u_3d(np,uxp,uyp,uzp,&
-                                     gaminv, &
-                                     ex,ey,ez, &
-                                     bx,by,bz, &
-                                     q,m,dt)
-
-    END SELECT
-
-    !!!! --- push particle species positions a time step
-#if (AMREX_SPACEDIM == 3) || (defined WARPX_RZ)
-    CALL pxr_pushxyz(np,xp,yp,zp,uxp,uyp,uzp,gaminv,dt)
-#elif (AMREX_SPACEDIM == 2)
-    CALL pxr_pushxz(np,xp,zp,uxp,uzp,gaminv,dt)
-#endif
-
-  end subroutine warpx_particle_pusher
-
-
-  ! _________________________________________________________________
-  !>
-  !> @brief
-  !> Main subroutine for the particle pusher (velocity)
-  !>
-  !> @param[in] np number of super-particles
-  !> @param[in] xp,yp,zp particle position arrays
-  !> @param[in] uxp,uyp,uzp normalized momentum in each direction
-  !> @param[in] gaminv particle Lorentz factors
-  !> @param[in] ex,ey,ez particle electric fields in each direction
-  !> @param[in] bx,by,bz particle magnetic fields in each direction
-  !> @param[in] q charge
-  !> @param[in] m masse
-  !> @param[in] dt time step
-  !> @param[in] particle_pusher_algo Particle pusher algorithm
-  subroutine warpx_particle_pusher_momenta(np,xp,yp,zp,uxp,uyp,uzp, &
-                                  gaminv,&
-                                  ex,ey,ez,bx,by,bz,q,m,dt, &
-                                  particle_pusher_algo) &
-       bind(C, name="warpx_particle_pusher_momenta")
-
-    INTEGER(c_long), INTENT(IN)   :: np
-    REAL(amrex_real),INTENT(INOUT)    :: gaminv(np)
-    REAL(amrex_real),INTENT(IN)       :: xp(np),yp(np),zp(np)
-    REAL(amrex_real),INTENT(INOUT)    :: uxp(np),uyp(np),uzp(np)
-    REAL(amrex_real),INTENT(IN)       :: ex(np),ey(np),ez(np)
-    REAL(amrex_real),INTENT(IN)       :: bx(np),by(np),bz(np)
-    REAL(amrex_real),INTENT(IN)       :: q,m,dt
-    INTEGER(c_long), INTENT(IN)   :: particle_pusher_algo
-
-    SELECT CASE (particle_pusher_algo)
-
-    !! Vay pusher -- Full push
-    CASE (1_c_long)
-      CALL pxr_set_gamma(np,uxp,uyp,uzp,gaminv)
-
-      CALL pxr_ebcancelpush3d(np,uxp,uyp,uzp,gaminv, &
-                                 ex,ey,ez,  &
-                                 bx,by,bz,q,m,dt,0_c_long)
-    CASE DEFAULT
-
-      ! Momentum pusher in a single loop
-      CALL pxr_boris_push_u_3d(np,uxp,uyp,uzp,&
-                                     gaminv, &
-                                     ex,ey,ez, &
-                                     bx,by,bz, &
-                                     q,m,dt)
-
-    END SELECT
-
-  end subroutine warpx_particle_pusher_momenta
-
 end module warpx_to_pxr_module
diff --git a/Source/Initialization/CustomDensityProb.H b/Source/Initialization/CustomDensityProb.H
new file mode 100644
index 000000000..b00830e6c
--- /dev/null
+++ b/Source/Initialization/CustomDensityProb.H
@@ -0,0 +1,49 @@
+#ifndef CUSTOM_DENSITY_PROB_H_
+#define CUSTOM_DENSITY_PROB_H_
+
+#include <AMReX_ParmParse.H>
+#include <AMReX_Arena.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_Dim3.H>
+
+// An example of Custom Density Profile
+
+// struct whose getDensity returns density at a given position computed from
+// a custom function, with runtime input parameters.
+struct InjectorDensityCustom
+{
+    InjectorDensityCustom (std::string const& species_name)
+        : p(nullptr)
+    {
+        // Read parameters for custom density profile from file, and
+        // store them in managed memory.
+        amrex::ParmParse pp(species_name);
+        std::vector<amrex::Real> v;
+        pp.getarr("custom_profile_params", v);
+        p = static_cast<amrex::Real*>
+            (amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real)*v.size()));
+        for (int i = 0; i < static_cast<int>(v.size()); ++i) {
+            p[i] = v[i];
+        }
+    }
+
+    // Return density at given position, using user-defined parameters 
+    // stored in p.
+    AMREX_GPU_HOST_DEVICE
+    amrex::Real
+    getDensity (amrex::Real, amrex::Real, amrex::Real) const noexcept
+    {
+        return p[0];
+    }
+
+    // Note that we are not allowed to have non-trivial destructor.
+    // So we rely on clear() to free memory.
+    void clear () {
+        amrex::The_Managed_Arena()->free(p);
+    }
+
+private:
+    amrex::Real* p;
+};
+
+#endif
diff --git a/Source/Initialization/CustomDensityProb.cpp b/Source/Initialization/CustomDensityProb.cpp
deleted file mode 100644
index 3efcb13c5..000000000
--- a/Source/Initialization/CustomDensityProb.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <PlasmaInjector.H>
-
-#include <iostream>
-
-using namespace amrex;
-
-///
-/// This "custom" density profile just does constant
-///
-Real CustomDensityProfile::getDensity(Real x, Real y, Real z) const {
-  return params[0];
-}
diff --git a/Source/Initialization/CustomMomentumProb.H b/Source/Initialization/CustomMomentumProb.H
new file mode 100644
index 000000000..f8bc29a05
--- /dev/null
+++ b/Source/Initialization/CustomMomentumProb.H
@@ -0,0 +1,30 @@
+#ifndef CUSTOM_MOMENTUM_PROB_H
+#define CUSTOM_MOMENTUM_PROB_H
+
+#include <AMReX_ParmParse.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_Arena.H>
+#include <AMReX_Dim3.H>
+
+// An example of Custom Momentum Profile
+
+// struct whose getDensity returns momentum at a given position computed from
+// a custom function.
+struct InjectorMomentumCustom
+{
+    InjectorMomentumCustom (std::string const& /*a_species_name*/) {}
+
+    // Return momentum at given position (illustration: momentum=0).
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getMomentum (amrex::Real, amrex::Real, amrex::Real) const noexcept
+    {
+        return {0., 0., 0.};
+    }
+
+    // Note that we are not allowed to have non-trivial destructor.
+    // So we rely on clear() to free memory if needed.
+    void clear () { }
+};
+
+#endif
diff --git a/Source/Initialization/CustomMomentumProb.cpp b/Source/Initialization/CustomMomentumProb.cpp
deleted file mode 100644
index fa21252d0..000000000
--- a/Source/Initialization/CustomMomentumProb.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-#include <PlasmaInjector.H>
-
-#include <iostream>
-
-using namespace amrex;
-
-///
-/// This "custom" momentum distribution just does 0 momentum
-///
-void CustomMomentumDistribution::getMomentum(vec3& u, Real x, Real y, Real z) {
-  u[0] = 0;
-  u[1] = 0;
-  u[2] = 0;
-}
diff --git a/Source/Initialization/InjectorDensity.H b/Source/Initialization/InjectorDensity.H
new file mode 100644
index 000000000..b7f5c26eb
--- /dev/null
+++ b/Source/Initialization/InjectorDensity.H
@@ -0,0 +1,202 @@
+#ifndef INJECTOR_DENSITY_H_
+#define INJECTOR_DENSITY_H_
+
+#include <AMReX_Gpu.H>
+#include <AMReX_Dim3.H>
+#include <GpuParser.H>
+#include <CustomDensityProb.H>
+#include <WarpXConst.H>
+
+// struct whose getDensity returns constant density.
+struct InjectorDensityConstant
+{
+    InjectorDensityConstant (amrex::Real a_rho) noexcept : m_rho(a_rho) {}
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::Real
+    getDensity (amrex::Real, amrex::Real, amrex::Real) const noexcept
+    {
+        return m_rho;
+    }
+
+private:
+    amrex::Real m_rho;
+};
+
+// struct whose getDensity returns local density computed from parser.
+struct InjectorDensityParser
+{
+    InjectorDensityParser (WarpXParser const& a_parser) noexcept
+        : m_parser(a_parser) {}
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::Real
+    getDensity (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        return m_parser(x,y,z);
+    }
+
+    // InjectorDensityParser constructs this GpuParser from WarpXParser.
+    GpuParser m_parser;
+};
+
+// struct whose getDensity returns local density computed from predefined profile.
+struct InjectorDensityPredefined
+{
+    InjectorDensityPredefined (std::string const& a_species_name) noexcept;
+
+    void clear ();
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::Real
+    getDensity (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        // Choices for profile are:
+        // - parabolic_channel
+        switch (profile)
+        {
+        case Profile::parabolic_channel:
+        {
+            amrex::Real z_start   = p[0];
+            amrex::Real ramp_up   = p[1];
+            amrex::Real plateau   = p[2];
+            amrex::Real ramp_down = p[3];
+            amrex::Real rc        = p[4];
+            amrex::Real n0        = p[5];
+            amrex::Real n;
+            amrex::Real kp = PhysConst::q_e/PhysConst::c
+                *std::sqrt( n0/(PhysConst::m_e*PhysConst::ep0) );
+
+            if        ((z-z_start)>=0               and
+                       (z-z_start)<ramp_up ) {
+                n = (z-z_start)/ramp_up;
+            } else if ((z-z_start)>=ramp_up         and
+                       (z-z_start)< ramp_up+plateau ) {
+                n = 1.;
+            } else if ((z-z_start)>=ramp_up+plateau and
+                       (z-z_start)< ramp_up+plateau+ramp_down) {
+                n = 1.-((z-z_start)-ramp_up-plateau)/ramp_down;
+            } else {
+                n = 0.;
+            }
+            n *= n0*(1.+4.*(x*x+y*y)/(kp*kp*rc*rc*rc*rc));
+            return n;
+        }
+        default:
+            amrex::Abort("InjectorDensityPredefined: how did we get here?");
+            return 0.0;
+        }
+    }
+
+private:
+    enum struct Profile { null, parabolic_channel };
+    Profile profile;
+    amrex::Real* p;
+};
+
+// Base struct for density injector. 
+// InjectorDensity contains a union (called Object) that holds any one 
+// instance of: 
+// - InjectorDensityConstant  : to generate constant density;
+// - InjectorDensityParser    : to generate density from parser;
+// - InjectorDensityCustom    : to generate density from custom profile;
+// - InjectorDensityPredefined: to generate density from predefined profile;
+// The choice is made at runtime, depending in the constructor called.
+// This mimics virtual functions, except the struct is stored in managed memory
+// and member functions are made __host__ __device__ to run on CPU and GPU.
+// This struct inherits from amrex::Gpu::Managed to provide new and delete
+// operators in managed memory when running on GPU. Nothing special on CPU.
+struct InjectorDensity
+    : public amrex::Gpu::Managed
+{
+    // This constructor stores a InjectorDensityConstant in union object.
+    InjectorDensity (InjectorDensityConstant* t, amrex::Real a_rho)
+        : type(Type::constant),
+          object(t,a_rho)
+    { }
+
+    // This constructor stores a InjectorDensityParser in union object.
+    InjectorDensity (InjectorDensityParser* t, WarpXParser const& a_parser)
+        : type(Type::parser),
+          object(t,a_parser)
+    { }
+
+    // This constructor stores a InjectorDensityCustom in union object.
+    InjectorDensity (InjectorDensityCustom* t, std::string const& a_species_name)
+        : type(Type::custom),
+          object(t,a_species_name)
+    { }
+
+    // This constructor stores a InjectorDensityPredefined in union object.
+    InjectorDensity (InjectorDensityPredefined* t, std::string const& a_species_name)
+        : type(Type::predefined),
+          object(t,a_species_name)
+    { }
+
+    // Explicitly prevent the compiler from generating copy constructors
+    // and copy assignment operators.
+    InjectorDensity (InjectorDensity const&) = delete;
+    InjectorDensity (InjectorDensity&&) = delete;
+    void operator= (InjectorDensity const&) = delete;
+    void operator= (InjectorDensity &&) = delete;
+
+    ~InjectorDensity ();
+
+    std::size_t sharedMemoryNeeded () const noexcept;
+
+    // call getDensity from the object stored in the union
+    // (the union is called Object, and the instance is called object).
+    AMREX_GPU_HOST_DEVICE
+    amrex::Real
+    getDensity (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        switch (type)
+        {
+        case Type::parser:
+        {
+            return object.parser.getDensity(x,y,z);
+        }
+        case Type::constant:
+        {
+            return object.constant.getDensity(x,y,z);
+        }
+        case Type::custom:
+        {
+            return object.custom.getDensity(x,y,z);
+        }
+        case Type::predefined:
+        {
+            return object.predefined.getDensity(x,y,z);
+        }
+        default:
+        {
+            amrex::Abort("InjectorDensity: unknown type");
+            return 0.0;
+        }
+        }
+    }
+
+private:
+    enum struct Type { constant, custom, predefined, parser };
+    Type type;
+
+    // An instance of union Object constructs and stores any one of
+    // the objects declared (constant or parser or custom or predefined).
+    union Object {
+        Object (InjectorDensityConstant*, amrex::Real a_rho) noexcept
+            : constant(a_rho) {}
+        Object (InjectorDensityParser*, WarpXParser const& a_parser) noexcept
+            : parser(a_parser) {}
+        Object (InjectorDensityCustom*, std::string const& a_species_name) noexcept
+            : custom(a_species_name) {}
+        Object (InjectorDensityPredefined*, std::string const& a_species_name) noexcept
+            : predefined(a_species_name) {}
+        InjectorDensityConstant   constant;
+        InjectorDensityParser     parser;
+        InjectorDensityCustom     custom;
+        InjectorDensityPredefined predefined;
+    };
+    Object object;
+};
+
+#endif
diff --git a/Source/Initialization/InjectorDensity.cpp b/Source/Initialization/InjectorDensity.cpp
new file mode 100644
index 000000000..54df4b14d
--- /dev/null
+++ b/Source/Initialization/InjectorDensity.cpp
@@ -0,0 +1,77 @@
+#include <PlasmaInjector.H>
+
+using namespace amrex;
+
+InjectorDensity::~InjectorDensity ()
+{
+    switch (type)
+    {
+    case Type::parser:
+    {
+        object.parser.m_parser.clear();
+        break;
+    }
+    case Type::custom:
+    {
+        object.custom.clear();
+        break;
+    }
+    case Type::predefined:
+    {
+        object.predefined.clear();
+        break;
+    }
+    }
+}
+
+// Compute the amount of memory needed in GPU Shared Memory.
+std::size_t
+InjectorDensity::sharedMemoryNeeded () const noexcept
+{
+    switch (type)
+    {
+    case Type::parser:
+    {
+        // For parser injector, the 3D position of each particle
+        // is stored in shared memory.
+        return amrex::Gpu::numThreadsPerBlockParallelFor() * sizeof(double) * 3;
+    }
+    default:
+        return 0;
+    }
+}
+
+InjectorDensityPredefined::InjectorDensityPredefined (
+    std::string const& a_species_name) noexcept
+    : profile(Profile::null)
+{
+    ParmParse pp(a_species_name);
+
+    std::vector<amrex::Real> v;
+    // Read parameters for the predefined plasma profile, 
+    // and store them in managed memory
+    pp.getarr("predefined_profile_params", v);
+    p = static_cast<amrex::Real*>
+        (amrex::The_Managed_Arena()->alloc(sizeof(amrex::Real)*v.size()));
+    for (int i = 0; i < static_cast<int>(v.size()); ++i) {
+        p[i] = v[i];
+    }
+
+    // Parse predefined profile name, and update member variable profile.
+    std::string which_profile_s;
+    pp.query("predefined_profile_name", which_profile_s);
+    std::transform(which_profile_s.begin(), which_profile_s.end(),
+                   which_profile_s.begin(), ::tolower);
+    if (which_profile_s == "parabolic_channel"){
+        profile = Profile::parabolic_channel;
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(v.size() > 5,
+            "InjectorDensityPredefined::parabolic_channel: not enough parameters");
+    }
+}
+
+// Note that we are not allowed to have non-trivial destructor.
+// So we rely on clear() to free memory.
+void InjectorDensityPredefined::clear ()
+{
+    amrex::The_Managed_Arena()->free(p);
+}
diff --git a/Source/Initialization/InjectorMomentum.H b/Source/Initialization/InjectorMomentum.H
new file mode 100644
index 000000000..399ee7759
--- /dev/null
+++ b/Source/Initialization/InjectorMomentum.H
@@ -0,0 +1,223 @@
+#ifndef INJECTOR_MOMENTUM_H_
+#define INJECTOR_MOMENTUM_H_
+
+#include <AMReX_Gpu.H>
+#include <AMReX_Dim3.H>
+#include <GpuParser.H>
+#include <CustomMomentumProb.H>
+
+// struct whose getMomentum returns constant momentum.
+struct InjectorMomentumConstant
+{
+    InjectorMomentumConstant (amrex::Real a_ux, amrex::Real a_uy, amrex::Real a_uz) noexcept
+        : m_ux(a_ux), m_uy(a_uy), m_uz(a_uz) {}
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getMomentum (amrex::Real, amrex::Real, amrex::Real) const noexcept
+    {
+        return amrex::XDim3{m_ux,m_uy,m_uz};
+    }
+private:
+    amrex::Real m_ux, m_uy, m_uz;
+};
+
+// struct whose getMomentum returns momentum for 1 particle, from random 
+// gaussian distribution.
+struct InjectorMomentumGaussian
+{
+    InjectorMomentumGaussian (amrex::Real a_ux_m, amrex::Real a_uy_m,
+                              amrex::Real a_uz_m, amrex::Real a_ux_th,
+                              amrex::Real a_uy_th, amrex::Real a_uz_th) noexcept
+        : m_ux_m(a_ux_m), m_uy_m(a_uy_m), m_uz_m(a_uz_m),
+          m_ux_th(a_ux_th), m_uy_th(a_uy_th), m_uz_th(a_uz_th)
+        {}
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getMomentum (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        return amrex::XDim3{amrex::RandomNormal(m_ux_m, m_ux_th),
+                            amrex::RandomNormal(m_uy_m, m_uy_th),
+                            amrex::RandomNormal(m_uz_m, m_uz_th)};
+    }
+private:
+    amrex::Real m_ux_m, m_uy_m, m_uz_m;
+    amrex::Real m_ux_th, m_uy_th, m_uz_th;
+};
+
+// struct whose getMomentum returns momentum for 1 particle, for
+// radial expansion
+struct InjectorMomentumRadialExpansion
+{
+    InjectorMomentumRadialExpansion (amrex::Real a_u_over_r) noexcept
+        : u_over_r(a_u_over_r)
+        {}
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getMomentum (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        return {x*u_over_r, y*u_over_r, z*u_over_r};
+    }
+
+private:
+    amrex::Real u_over_r;
+};
+
+// struct whose getMomentumm returns local momentum computed from parser.
+struct InjectorMomentumParser
+{
+    InjectorMomentumParser (WarpXParser const& a_ux_parser,
+                            WarpXParser const& a_uy_parser,
+                            WarpXParser const& a_uz_parser) noexcept
+        : m_ux_parser(a_ux_parser), m_uy_parser(a_uy_parser),
+          m_uz_parser(a_uz_parser) {}
+
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getMomentum (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        return amrex::XDim3{m_ux_parser(x,y,z),m_uy_parser(x,y,z),m_uz_parser(x,y,z)};
+    }
+
+    GpuParser m_ux_parser, m_uy_parser, m_uz_parser;
+};
+
+// Base struct for momentum injector. 
+// InjectorMomentum contains a union (called Object) that holds any one 
+// instance of: 
+// - InjectorMomentumConstant       : to generate constant density;
+// - InjectorMomentumGaussian       : to generate gaussian distribution;
+// - InjectorMomentumRadialExpansion: to generate radial expansion;
+// - InjectorMomentumParser         : to generate momentum from parser;
+// The choice is made at runtime, depending in the constructor called.
+// This mimics virtual functions, except the struct is stored in managed memory
+// and member functions are made __host__ __device__ to run on CPU and GPU.
+// This struct inherits from amrex::Gpu::Managed to provide new and delete
+// operators in managed memory when running on GPU. Nothing special on CPU.
+struct InjectorMomentum
+    : public amrex::Gpu::Managed
+{
+    // This constructor stores a InjectorMomentumConstant in union object.
+    InjectorMomentum (InjectorMomentumConstant* t,
+                      amrex::Real a_ux, amrex::Real a_uy, amrex::Real a_uz)
+        : type(Type::constant),
+          object(t, a_ux, a_uy, a_uz)
+    { }
+
+    // This constructor stores a InjectorMomentumParser in union object.
+    InjectorMomentum (InjectorMomentumParser* t,
+                      WarpXParser const& a_ux_parser,
+                      WarpXParser const& a_uy_parser,
+                      WarpXParser const& a_uz_parser)
+        : type(Type::parser),
+          object(t, a_ux_parser, a_uy_parser, a_uz_parser)
+    { }
+
+    // This constructor stores a InjectorMomentumGaussian in union object.
+    InjectorMomentum (InjectorMomentumGaussian* t,
+                      amrex::Real a_ux_m, amrex::Real a_uy_m, amrex::Real a_uz_m,
+                      amrex::Real a_ux_th, amrex::Real a_uy_th, amrex::Real a_uz_th)
+        : type(Type::gaussian),
+          object(t,a_ux_m,a_uy_m,a_uz_m,a_ux_th,a_uy_th,a_uz_th)
+    { }
+
+    // This constructor stores a InjectorMomentumCustom in union object.
+    InjectorMomentum (InjectorMomentumCustom* t,
+                      std::string const& a_species_name)
+        : type(Type::custom),
+          object(t, a_species_name)
+    { }
+
+    // This constructor stores a InjectorMomentumRadialExpansion in union object.
+    InjectorMomentum (InjectorMomentumRadialExpansion* t,
+                      amrex::Real u_over_r)
+        : type(Type::radial_expansion),
+          object(t, u_over_r)
+    { }
+
+    // Explicitly prevent the compiler from generating copy constructors
+    // and copy assignment operators.
+    InjectorMomentum (InjectorMomentum const&) = delete;
+    InjectorMomentum (InjectorMomentum&&) = delete;
+    void operator= (InjectorMomentum const&) = delete;
+    void operator= (InjectorMomentum &&) = delete;
+
+    ~InjectorMomentum ();
+
+    std::size_t sharedMemoryNeeded () const noexcept;
+
+    // call getMomentum from the object stored in the union
+    // (the union is called Object, and the instance is called object).
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getMomentum (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        switch (type)
+        {
+        case Type::parser:
+        {
+            return object.parser.getMomentum(x,y,z);
+        }
+        case Type::gaussian:
+        {
+            return object.gaussian.getMomentum(x,y,z);
+        }
+        case Type::constant:
+        {
+            return object.constant.getMomentum(x,y,z);
+        }
+        case Type::radial_expansion:
+        {
+            return object.radial_expansion.getMomentum(x,y,z);
+        }
+        case Type::custom:
+        {
+            return object.custom.getMomentum(x,y,z);
+        }
+        default:
+        {
+            amrex::Abort("InjectorMomentum: unknown type");
+            return {0.0,0.0,0.0};
+        }
+        }
+    }
+
+private:
+    enum struct Type { constant, custom, gaussian, radial_expansion, parser };
+    Type type;
+
+    // An instance of union Object constructs and stores any one of
+    // the objects declared (constant or custom or gaussian or 
+    // radial_expansion or parser).
+    union Object {
+        Object (InjectorMomentumConstant*,
+                amrex::Real a_ux, amrex::Real a_uy, amrex::Real a_uz) noexcept
+            : constant(a_ux,a_uy,a_uz) {}
+        Object (InjectorMomentumCustom*,
+                std::string const& a_species_name) noexcept
+            : custom(a_species_name) {}
+        Object (InjectorMomentumGaussian*,
+                amrex::Real a_ux_m, amrex::Real a_uy_m,
+                amrex::Real a_uz_m, amrex::Real a_ux_th,
+                amrex::Real a_uy_th, amrex::Real a_uz_th) noexcept
+            : gaussian(a_ux_m,a_uy_m,a_uz_m,a_ux_th,a_uy_th,a_uz_th) {}
+        Object (InjectorMomentumRadialExpansion*,
+                amrex::Real u_over_r) noexcept
+            : radial_expansion(u_over_r) {}
+        Object (InjectorMomentumParser*,
+                WarpXParser const& a_ux_parser,
+                WarpXParser const& a_uy_parser,
+                WarpXParser const& a_uz_parser) noexcept
+            : parser(a_ux_parser, a_uy_parser, a_uz_parser) {}
+        InjectorMomentumConstant constant;
+        InjectorMomentumCustom   custom;
+        InjectorMomentumGaussian gaussian;
+        InjectorMomentumRadialExpansion radial_expansion;
+        InjectorMomentumParser   parser;
+    };
+    Object object;
+};
+
+#endif
diff --git a/Source/Initialization/InjectorMomentum.cpp b/Source/Initialization/InjectorMomentum.cpp
new file mode 100644
index 000000000..a197b5bef
--- /dev/null
+++ b/Source/Initialization/InjectorMomentum.cpp
@@ -0,0 +1,40 @@
+#include <PlasmaInjector.H>
+
+using namespace amrex;
+
+InjectorMomentum::~InjectorMomentum ()
+{
+    switch (type)
+    {
+    case Type::parser:
+    {
+        object.parser.m_ux_parser.clear();
+        object.parser.m_uy_parser.clear();
+        object.parser.m_uz_parser.clear();
+        break;
+    }
+    case Type::custom:
+    {
+        object.custom.clear();
+        break;
+    }
+    }
+}
+
+// Compute the amount of memory needed in GPU Shared Memory.
+std::size_t
+InjectorMomentum::sharedMemoryNeeded () const noexcept
+{
+    switch (type)
+    {
+    case Type::parser:
+    {
+        // For parser injector, the 3D position of each particle
+        // is stored in shared memory.
+        return amrex::Gpu::numThreadsPerBlockParallelFor() * sizeof(double) * 3;
+    }
+    default:
+        return 0;
+    }
+}
+
diff --git a/Source/Initialization/InjectorPosition.H b/Source/Initialization/InjectorPosition.H
new file mode 100644
index 000000000..19bb092dd
--- /dev/null
+++ b/Source/Initialization/InjectorPosition.H
@@ -0,0 +1,146 @@
+#ifndef INJECTOR_POSITION_H_
+#define INJECTOR_POSITION_H_
+
+#include <AMReX_Gpu.H>
+#include <AMReX_Dim3.H>
+#include <AMReX_Utility.H>
+
+// struct whose getPositionUnitBox returns x, y and z for a particle with
+// random distribution inside a unit cell.
+struct InjectorPositionRandom
+{
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getPositionUnitBox (int i_part, int ref_fac=1) const noexcept
+    {
+        return amrex::XDim3{amrex::Random(), amrex::Random(), amrex::Random()};
+    }
+};
+
+// struct whose getPositionUnitBox returns x, y and z for a particle with
+// regular distribution inside a unit cell.
+struct InjectorPositionRegular
+{
+    InjectorPositionRegular (amrex::Dim3 const& a_ppc) noexcept : ppc(a_ppc) {}
+
+    // i_part: particle number within the cell, required to evenly space
+    // particles within the cell.
+    // ref_fac: the number of particles evenly-spaced within a cell 
+    // is a_ppc*(ref_fac**AMREX_SPACEDIM).
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getPositionUnitBox (int i_part, int ref_fac=1) const noexcept
+    {
+        int nx = ref_fac*ppc.x;
+        int ny = ref_fac*ppc.y;
+#if (AMREX_SPACEDIM == 3)
+        int nz = ref_fac*ppc.z;
+#else
+        int nz = 1;
+#endif
+        int ix_part = i_part/(ny*nz);  // written this way backward compatibility
+        int iz_part = (i_part-ix_part*(ny*nz)) / ny;
+        int iy_part = (i_part-ix_part*(ny*nz)) - ny*iz_part;
+        return amrex::XDim3{(0.5+ix_part)/nx, (0.5+iy_part)/ny, (0.5+iz_part) / nz};
+    }
+private:
+    amrex::Dim3 ppc;
+};
+
+// Base struct for position injector. 
+// InjectorPosition contains a union (called Object) that holds any one 
+// instance of: 
+// - InjectorPositionRandom : to generate random distribution;
+// - InjectorPositionRegular: to generate regular distribution.
+// The choice is made at runtime, depending in the constructor called.
+// This mimics virtual functions, except the struct is stored in managed memory
+// and member functions are made __host__ __device__ to run on CPU and GPU.
+// This struct inherits from amrex::Gpu::Managed to provide new and delete
+// operators in managed memory when running on GPU. Nothing special on CPU.
+struct InjectorPosition
+    : public amrex::Gpu::Managed
+{
+    // This constructor stores a InjectorPositionRandom in union object.
+    InjectorPosition (InjectorPositionRandom* t,
+                      amrex::Real a_xmin, amrex::Real a_xmax,
+                      amrex::Real a_ymin, amrex::Real a_ymax,
+                      amrex::Real a_zmin, amrex::Real a_zmax)
+        : type(Type::random),
+          object(t),
+          xmin(a_xmin), xmax(a_xmax),
+          ymin(a_ymin), ymax(a_ymax),
+          zmin(a_zmin), zmax(a_zmax)
+    { }
+
+    // This constructor stores a InjectorPositionRegular in union object.
+    InjectorPosition (InjectorPositionRegular* t,
+                      amrex::Real a_xmin, amrex::Real a_xmax,
+                      amrex::Real a_ymin, amrex::Real a_ymax,
+                      amrex::Real a_zmin, amrex::Real a_zmax,
+                      amrex::Dim3 const& a_ppc)
+        : type(Type::regular),
+          object(t, a_ppc),
+          xmin(a_xmin), xmax(a_xmax),
+          ymin(a_ymin), ymax(a_ymax),
+          zmin(a_zmin), zmax(a_zmax)
+    { }
+
+    // Explicitly prevent the compiler from generating copy constructors
+    // and copy assignment operators.
+    InjectorPosition (InjectorPosition const&) = delete;
+    InjectorPosition (InjectorPosition&&) = delete;
+    void operator= (InjectorPosition const&) = delete;
+    void operator= (InjectorPosition &&) = delete;
+
+    std::size_t sharedMemoryNeeded () const noexcept { return 0; }
+
+    // call getPositionUnitBox from the object stored in the union
+    // (the union is called Object, and the instance is called object).
+    AMREX_GPU_HOST_DEVICE
+    amrex::XDim3
+    getPositionUnitBox (int i_part, int ref_fac=1) const noexcept
+    {
+        switch (type)
+        {
+        case Type::regular:
+        {
+            return object.regular.getPositionUnitBox(i_part, ref_fac);
+        }
+        default:
+        {
+            return object.random.getPositionUnitBox(i_part, ref_fac);
+        }
+        };
+    }
+
+    // bool: whether position specified is within bounds.
+    AMREX_GPU_HOST_DEVICE
+    bool
+    insideBounds (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept
+    {
+        return (x < xmax and x >= xmin and
+                y < ymax and y >= ymin and
+                z < zmax and z >= zmin);
+    }
+
+private:
+    enum struct Type { random, regular };
+    Type type;
+
+    // An instance of union Object constructs and stores any one of
+    // the objects declared (random or regular).
+    union Object {
+        Object (InjectorPositionRandom*) noexcept : random() {}
+        Object (InjectorPositionRegular*, amrex::Dim3 const& a_ppc) noexcept
+            : regular(a_ppc) {}
+        InjectorPositionRandom random;
+        InjectorPositionRegular regular;
+    };
+    Object object;
+
+    amrex::Real xmin, xmax;
+    amrex::Real ymin, ymax;
+    amrex::Real zmin, zmax;
+};
+
+#endif
diff --git a/Source/Initialization/Make.package b/Source/Initialization/Make.package
index edcf402c9..2c6458b6d 100644
--- a/Source/Initialization/Make.package
+++ b/Source/Initialization/Make.package
@@ -1,9 +1,18 @@
-CEXE_sources += CustomDensityProb.cpp
-CEXE_sources += PlasmaProfiles.cpp
 CEXE_sources += WarpXInitData.cpp
-CEXE_sources += CustomMomentumProb.cpp
+
 CEXE_sources += PlasmaInjector.cpp
 CEXE_headers += PlasmaInjector.H
 
+CEXE_headers += InjectorPosition.H
+
+CEXE_headers += InjectorDensity.H
+CEXE_sources += InjectorDensity.cpp
+
+CEXE_headers += InjectorMomentum.H
+CEXE_sources += InjectorMomentum.cpp
+
+CEXE_headers += CustomDensityProb.H
+CEXE_headers += CustomMomentumProb.H
+
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Initialization
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Initialization
diff --git a/Source/Initialization/PlasmaInjector.H b/Source/Initialization/PlasmaInjector.H
index f998e217e..f7e86bff5 100644
--- a/Source/Initialization/PlasmaInjector.H
+++ b/Source/Initialization/PlasmaInjector.H
@@ -1,250 +1,16 @@
 #ifndef PLASMA_INJECTOR_H_
 #define PLASMA_INJECTOR_H_
 
-#include <array>
+#include <InjectorPosition.H>
+#include <InjectorDensity.H>
+#include <InjectorMomentum.H>
 
-#include "AMReX_REAL.H"
+#include <array>
 #include <AMReX_Vector.H>
 #include <WarpXConst.H>
 #include <WarpXParser.H>
-#include "AMReX_ParmParse.H"
-#include "AMReX_Utility.H"
-
-enum class predefined_profile_flag { Null, parabolic_channel };
-
-///
-/// PlasmaDensityProfile describes how the charge density
-/// is set in particle initialization. Subclasses must define a
-/// getDensity function that describes the charge density as a
-/// function of x, y, and z.
-///
-class PlasmaDensityProfile
-{
-public:
-    virtual ~PlasmaDensityProfile() {};
-    virtual amrex::Real getDensity(amrex::Real x,
-                                   amrex::Real y,
-                                   amrex::Real z) const = 0;
-protected:
-    std::string _species_name;
-};
-
-///
-/// This describes a constant density distribution.
-///
-class ConstantDensityProfile : public PlasmaDensityProfile
-{
-public:
-    ConstantDensityProfile(amrex::Real _density);
-    virtual amrex::Real getDensity(amrex::Real x,
-                                   amrex::Real y,
-                                   amrex::Real z) const override;
-
-private:
-    amrex::Real _density;
-};
-
-///
-/// This describes a custom density distribution. Users can supply
-/// in their problem directory.
-///
-///
-class CustomDensityProfile : public PlasmaDensityProfile
-{
-public:
-    CustomDensityProfile(const std::string& species_name);
-    virtual amrex::Real getDensity(amrex::Real x,
-                                   amrex::Real y,
-                                   amrex::Real z) const override;
-private:
-    amrex::Vector<amrex::Real> params;
-};
-
-///
-/// This describes predefined density distributions.
-///
-class PredefinedDensityProfile : public PlasmaDensityProfile
-{
-public:
-    PredefinedDensityProfile(const std::string& species_name);
-    virtual amrex::Real getDensity(amrex::Real x,
-                                   amrex::Real y,
-                                   amrex::Real z) const override;
-    amrex::Real ParabolicChannel(amrex::Real x,
-                                 amrex::Real y,
-                                 amrex::Real z) const;
-private:
-    predefined_profile_flag which_profile = predefined_profile_flag::Null;
-    amrex::Vector<amrex::Real> params;
-};
-
-///
-/// This describes a density function parsed in the input file. 
-///
-class ParseDensityProfile : public PlasmaDensityProfile
-{
-public:
-    ParseDensityProfile(const std::string _parse_density_function);
-    virtual amrex::Real getDensity(amrex::Real x,
-                                   amrex::Real y,
-                                   amrex::Real z) const override;
-private:
-    std::string _parse_density_function;
-    WarpXParser parser_density;
-};
-
-///
-/// PlasmaMomentumDistribution describes how the particle momenta
-/// are set. Subclasses must define a getMomentum method that fills
-/// a u with the 3 components of the particle momentum
-///
-class PlasmaMomentumDistribution
-{
-public:
-    using vec3 = std::array<amrex::Real, 3>;
-    virtual ~PlasmaMomentumDistribution() {};
-    virtual void getMomentum(vec3& u, amrex::Real x, amrex::Real y, amrex::Real z) = 0;
-};
-
-///
-/// This is a constant momentum distribution - all particles will
-/// have the same ux, uy, and uz
-///
-class ConstantMomentumDistribution : public PlasmaMomentumDistribution
-{
-public:
-    ConstantMomentumDistribution(amrex::Real ux,
-                                 amrex::Real uy,
-                                 amrex::Real uz);
-    virtual void getMomentum(vec3& u, amrex::Real x, amrex::Real y, amrex::Real z) override;
-
-private:
-    amrex::Real _ux;
-    amrex::Real _uy;
-    amrex::Real _uz;
-};
-
-///
-/// This describes a custom momentum distribution. Users can supply
-/// in their problem directory.
-///
-///
-class CustomMomentumDistribution : public PlasmaMomentumDistribution
-{
-public:
-    CustomMomentumDistribution(const std::string& species_name);
-    virtual void getMomentum(vec3& u, amrex::Real x, amrex::Real y, amrex::Real z) override;
-
-private:
-    amrex::Vector<amrex::Real> params;
-};
-
-
-///
-/// This is a Gaussian Random momentum distribution.
-/// Particles will get random momenta, drawn from a normal.
-/// ux_m, ux_y, and ux_z describe the mean components in the x, y, and z
-/// directions, while u_th is the standard deviation of the random
-/// component.
-///
-class GaussianRandomMomentumDistribution : public PlasmaMomentumDistribution
-{
-public:
-    GaussianRandomMomentumDistribution(amrex::Real ux_m,
-                                       amrex::Real uy_m,
-                                       amrex::Real uz_m,
-                                       amrex::Real ux_th,
-                                       amrex::Real uy_th,
-                                       amrex::Real uz_th);
-    virtual void getMomentum(vec3& u, amrex::Real x, amrex::Real y, amrex::Real z) override;
-private:
-    amrex::Real _ux_m;
-    amrex::Real _uy_m;
-    amrex::Real _uz_m;
-    amrex::Real _ux_th;
-    amrex::Real _uy_th;
-    amrex::Real _uz_th;
-};
-
-///
-/// This is a radially expanding momentum distribution
-/// Particles will have a radial momentum proportional to their 
-/// radius, with proportionality constant u_over_r
-class RadialExpansionMomentumDistribution : public PlasmaMomentumDistribution
-{
-public:
-  RadialExpansionMomentumDistribution( amrex::Real u_over_r );
-  virtual void getMomentum(vec3& u, amrex::Real x, amrex::Real y, amrex::Real z) override;
-private:
-    amrex::Real _u_over_r;
-};
-
-///
-/// This describes a momentum distribution function parsed in the input file. 
-///
-class ParseMomentumFunction : public PlasmaMomentumDistribution
-{
-public:
-    ParseMomentumFunction(const std::string _parse_momentum_function_ux,
-                          const std::string _parse_momentum_function_uy,
-                          const std::string _parse_momentum_function_uz);
-    virtual void getMomentum(vec3& u, 
-                             amrex::Real x,
-                             amrex::Real y,
-                             amrex::Real z) override;
-private:
-    std::string _parse_momentum_function_ux;
-    std::string _parse_momentum_function_uy;
-    std::string _parse_momentum_function_uz;
-    WarpXParser parser_ux;
-    WarpXParser parser_uy;
-    WarpXParser parser_uz;
-};
-
-
-///
-/// PlasmaParticlePosition describes how particles are initialized
-/// into each cell box. Subclasses must define a
-/// getPositionUnitBox function that returns the position of
-/// particle number i_part in a unitary box.
-///
-class PlasmaParticlePosition{
-public:
-  using vec3 = std::array<amrex::Real, 3>;
-  virtual ~PlasmaParticlePosition() {};
-    virtual void getPositionUnitBox(vec3& r, int i_part, int ref_fac=1) = 0;
-};
-
-///
-/// Particles are initialized with a random uniform
-/// distribution inside each cell
-///
-class RandomPosition : public PlasmaParticlePosition{
-public:
-    RandomPosition(int num_particles_per_cell);
-    virtual void getPositionUnitBox(vec3& r, int i_part, int ref_fac=1) override;
-private:
-    amrex::Real _x;
-    amrex::Real _y;
-    amrex::Real _z;
-    int _num_particles_per_cell;
-};
-
-///
-/// Particles are regularly distributed inside each cell. The user provides
-/// a 3d (resp. 2d) vector num_particles_per_cell_each_dim that contains
-/// the number of particles per cell along each dimension.
-///
-class RegularPosition : public PlasmaParticlePosition{
-public:
-  RegularPosition(const amrex::Vector<int>& num_particles_per_cell_each_dim);
-    virtual void getPositionUnitBox(vec3& r, int i_part, int ref_fac=1) override;
-private:
-  amrex::Real _x;
-  amrex::Real _y;
-  amrex::Real _z;
-  amrex::Vector<int> _num_particles_per_cell_each_dim;
-};
+#include <AMReX_ParmParse.H>
+#include <AMReX_Utility.H>
 
 ///
 /// The PlasmaInjector class parses and stores information about the plasma
@@ -256,28 +22,23 @@ class PlasmaInjector
 
 public:
 
-    using vec3 = std::array<amrex::Real, 3>;
-
-    PlasmaInjector();
-
-    PlasmaInjector(int ispecies, const std::string& name);
+    PlasmaInjector ();
 
-    amrex::Real getDensity(amrex::Real x, amrex::Real y, amrex::Real z);
+    PlasmaInjector (int ispecies, const std::string& name);
 
-    bool insideBounds(amrex::Real x, amrex::Real y, amrex::Real z);
+    bool insideBounds (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept;
 
     int num_particles_per_cell;
 
     amrex::Vector<int> num_particles_per_cell_each_dim;
 
-    void getMomentum(vec3& u, amrex::Real x, amrex::Real y, amrex::Real z);
+    // gamma * beta
+    amrex::XDim3 getMomentum (amrex::Real x, amrex::Real y, amrex::Real z) const noexcept;
 
-    void getPositionUnitBox(vec3& r, int i_part, int ref_fac=1);
+    amrex::Real getCharge () {return charge;}
+    amrex::Real getMass () {return mass;}
 
-    amrex::Real getCharge() {return charge;}
-    amrex::Real getMass() {return mass;}
-
-    bool doInjection() { return part_pos != NULL;}
+    bool doInjection () const noexcept { return inj_pos != NULL;}
 
     bool add_single_particle = false;
     amrex::Vector<amrex::Real> single_particle_pos;
@@ -305,6 +66,21 @@ public:
     amrex::Real xmin, xmax;
     amrex::Real ymin, ymax;
     amrex::Real zmin, zmax;
+    amrex::Real density_min = 0;
+    amrex::Real density_max = std::numeric_limits<amrex::Real>::max();
+
+    InjectorPosition* getInjectorPosition ();
+    InjectorDensity*  getInjectorDensity ();
+    InjectorMomentum* getInjectorMomentum ();
+
+    // When running on GPU, injector for position, momentum and density store
+    // particle 3D positions in shared memory IF using the parser.
+    std::size_t
+    sharedMemoryNeeded () const noexcept {
+        return amrex::max(inj_pos->sharedMemoryNeeded(),
+                          inj_rho->sharedMemoryNeeded(),
+                          inj_mom->sharedMemoryNeeded());
+    }
 
 protected:
 
@@ -315,13 +91,12 @@ protected:
     int species_id;
     std::string species_name;
 
-    std::unique_ptr<PlasmaDensityProfile> rho_prof;
-    std::unique_ptr<PlasmaMomentumDistribution> mom_dist;
-    std::unique_ptr<PlasmaParticlePosition> part_pos;
-    
-    void parseDensity(amrex::ParmParse pp);
-    void parseMomentum(amrex::ParmParse pp);
+    std::unique_ptr<InjectorPosition> inj_pos;
+    std::unique_ptr<InjectorDensity > inj_rho;
+    std::unique_ptr<InjectorMomentum> inj_mom;
     
+    void parseDensity (amrex::ParmParse& pp);
+    void parseMomentum (amrex::ParmParse& pp);    
 };
 
 #endif
diff --git a/Source/Initialization/PlasmaInjector.cpp b/Source/Initialization/PlasmaInjector.cpp
index f9642d1b6..541999789 100644
--- a/Source/Initialization/PlasmaInjector.cpp
+++ b/Source/Initialization/PlasmaInjector.cpp
@@ -55,192 +55,34 @@ namespace {
     }
 }
 
-ConstantDensityProfile::ConstantDensityProfile(Real density)
-    : _density(density)
-{}
+PlasmaInjector::PlasmaInjector () {}
 
-Real ConstantDensityProfile::getDensity(Real x, Real y, Real z) const
-{
-    return _density;
-}
-
-CustomDensityProfile::CustomDensityProfile(const std::string& species_name)
-{
-    ParmParse pp(species_name);
-    pp.getarr("custom_profile_params", params);
-}
-
-PredefinedDensityProfile::PredefinedDensityProfile(const std::string& species_name)
+PlasmaInjector::PlasmaInjector (int ispecies, const std::string& name)
+    : species_id(ispecies), species_name(name)
 {
     ParmParse pp(species_name);
-    std::string which_profile_s;
-    pp.getarr("predefined_profile_params", params);
-    pp.query("predefined_profile_name", which_profile_s);
-    if (which_profile_s == "parabolic_channel"){
-        which_profile = predefined_profile_flag::parabolic_channel;
-    }
-}
-
-ParseDensityProfile::ParseDensityProfile(std::string parse_density_function)
-    : _parse_density_function(parse_density_function)
-{
-    parser_density.define(parse_density_function);
-    parser_density.registerVariables({"x","y","z"});
-
-    ParmParse pp("my_constants");
-    std::set<std::string> symbols = parser_density.symbols();
-    symbols.erase("x");
-    symbols.erase("y");
-    symbols.erase("z"); // after removing variables, we are left with constants
-    for (auto it = symbols.begin(); it != symbols.end(); ) {
-        Real v;
-        if (pp.query(it->c_str(), v)) {
-            parser_density.setConstant(*it, v);
-            it = symbols.erase(it);
-        } else {
-            ++it;
-        }
-    }
-    for (auto const& s : symbols) { // make sure there no unknown symbols
-        amrex::Abort("ParseDensityProfile: Unknown symbol "+s);
-    }
-}
-
-Real ParseDensityProfile::getDensity(Real x, Real y, Real z) const
-{
-    return parser_density.eval(x,y,z);
-}
-
-ConstantMomentumDistribution::ConstantMomentumDistribution(Real ux,
-                                                           Real uy,
-                                                           Real uz)
-    : _ux(ux), _uy(uy), _uz(uz)
-{}
-
-void ConstantMomentumDistribution::getMomentum(vec3& u, Real x, Real y, Real z) {
-    u[0] = _ux;
-    u[1] = _uy;
-    u[2] = _uz;
-}
 
-CustomMomentumDistribution::CustomMomentumDistribution(const std::string& species_name)
-{
-  ParmParse pp(species_name);
-  pp.getarr("custom_momentum_params", params);
-}
-
-GaussianRandomMomentumDistribution::GaussianRandomMomentumDistribution(Real ux_m,
-                                                                       Real uy_m,
-                                                                       Real uz_m,
-                                                                       Real ux_th,
-                                                                       Real uy_th,
-                                                                       Real uz_th)
-    : _ux_m(ux_m), _uy_m(uy_m), _uz_m(uz_m), _ux_th(ux_th), _uy_th(uy_th), _uz_th(uz_th)
-{
-}
-
-void GaussianRandomMomentumDistribution::getMomentum(vec3& u, Real x, Real y, Real z) {
-    Real ux_th = amrex::RandomNormal(0.0, _ux_th);
-    Real uy_th = amrex::RandomNormal(0.0, _uy_th);
-    Real uz_th = amrex::RandomNormal(0.0, _uz_th);
-
-    u[0] = _ux_m + ux_th;
-    u[1] = _uy_m + uy_th;
-    u[2] = _uz_m + uz_th;
-}
-RadialExpansionMomentumDistribution::RadialExpansionMomentumDistribution(Real u_over_r) : _u_over_r( u_over_r )
-{
-}
-
-void RadialExpansionMomentumDistribution::getMomentum(vec3& u, Real x, Real y, Real z) {
-  u[0] = _u_over_r * x;
-  u[1] = _u_over_r * y;
-  u[2] = _u_over_r * z;
-}
-
-ParseMomentumFunction::ParseMomentumFunction(std::string parse_momentum_function_ux,
-                                             std::string parse_momentum_function_uy,
-                                             std::string parse_momentum_function_uz)
-    : _parse_momentum_function_ux(parse_momentum_function_ux),
-      _parse_momentum_function_uy(parse_momentum_function_uy),
-      _parse_momentum_function_uz(parse_momentum_function_uz)
-{
-    parser_ux.define(parse_momentum_function_ux);
-    parser_uy.define(parse_momentum_function_uy);
-    parser_uz.define(parse_momentum_function_uz);
-
-    amrex::Array<std::reference_wrapper<WarpXParser>,3> parsers{parser_ux, parser_uy, parser_uz};
-    ParmParse pp("my_constants");
-    for (auto& p : parsers) {
-        auto& parser = p.get();
-        parser.registerVariables({"x","y","z"});
-        std::set<std::string> symbols = parser.symbols();
-        symbols.erase("x");
-        symbols.erase("y");
-        symbols.erase("z"); // after removing variables, we are left with constants
-        for (auto it = symbols.begin(); it != symbols.end(); ) {
-            Real v;
-            if (pp.query(it->c_str(), v)) {
-                parser.setConstant(*it, v);
-                it = symbols.erase(it);
-            } else {
-                ++it;
-            }
-        }
-        for (auto const& s : symbols) { // make sure there no unknown symbols
-            amrex::Abort("ParseMomentumFunction: Unknown symbol "+s);
-        }
-    }
-}
-
-void ParseMomentumFunction::getMomentum(vec3& u, Real x, Real y, Real z)
-{
-    u[0] = parser_ux.eval(x,y,z);
-    u[1] = parser_uy.eval(x,y,z);
-    u[2] = parser_uz.eval(x,y,z);
-}
-
-RandomPosition::RandomPosition(int num_particles_per_cell):
-  _num_particles_per_cell(num_particles_per_cell)
-{}
-
-void RandomPosition::getPositionUnitBox(vec3& r, int i_part, int ref_fac){
-    r[0] = amrex::Random();
-    r[1] = amrex::Random();
-    r[2] = amrex::Random();
-}
-
-RegularPosition::RegularPosition(const amrex::Vector<int>& num_particles_per_cell_each_dim)
-    : _num_particles_per_cell_each_dim(num_particles_per_cell_each_dim)
-{}
+    pp.query("radially_weighted", radially_weighted);
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(radially_weighted, "ERROR: Only radially_weighted=true is supported");
 
-void RegularPosition::getPositionUnitBox(vec3& r, int i_part, int ref_fac)
-{
-  int nx = ref_fac*_num_particles_per_cell_each_dim[0];
-  int ny = ref_fac*_num_particles_per_cell_each_dim[1];
-#if AMREX_SPACEDIM == 3
-  int nz = ref_fac*_num_particles_per_cell_each_dim[2];
-#else
-  int nz = 1;
-#endif
-  
-  int ix_part = i_part/(ny * nz);
-  int iy_part = (i_part % (ny * nz)) % ny;
-  int iz_part = (i_part % (ny * nz)) / ny;
+    // parse plasma boundaries
+    xmin = std::numeric_limits<amrex::Real>::lowest();
+    ymin = std::numeric_limits<amrex::Real>::lowest();
+    zmin = std::numeric_limits<amrex::Real>::lowest();
 
-  r[0] = (0.5+ix_part)/nx;
-  r[1] = (0.5+iy_part)/ny;
-  r[2] = (0.5+iz_part)/nz;
-}
+    xmax = std::numeric_limits<amrex::Real>::max();
+    ymax = std::numeric_limits<amrex::Real>::max();
+    zmax = std::numeric_limits<amrex::Real>::max();
 
-PlasmaInjector::PlasmaInjector(){
-    part_pos = NULL;
-}
+    pp.query("xmin", xmin);
+    pp.query("ymin", ymin);
+    pp.query("zmin", zmin);
+    pp.query("xmax", xmax);
+    pp.query("ymax", ymax);
+    pp.query("zmax", zmax);
 
-PlasmaInjector::PlasmaInjector(int ispecies, const std::string& name)
-    : species_id(ispecies), species_name(name)
-{
-    ParmParse pp(species_name);
+    pp.query("density_min", density_min);
+    pp.query("density_max", density_max);
 
     // parse charge and mass
     std::string charge_s;
@@ -290,9 +132,14 @@ PlasmaInjector::PlasmaInjector(int ispecies, const std::string& name)
         gaussian_beam = true;
         parseMomentum(pp);
     }
+    // Depending on injection type at runtime, initialize inj_pos
+    // so that inj_pos->getPositionUnitBox calls
+    // InjectorPosition[Random or Regular].getPositionUnitBox.
     else if (part_pos_s == "nrandompercell") {
         pp.query("num_particles_per_cell", num_particles_per_cell);
-        part_pos.reset(new RandomPosition(num_particles_per_cell));
+        // Construct InjectorPosition with InjectorPositionRandom.
+        inj_pos.reset(new InjectorPosition((InjectorPositionRandom*)nullptr,
+                                           xmin, xmax, ymin, ymax, zmin, zmax));
         parseDensity(pp);
         parseMomentum(pp);
     } else if (part_pos_s == "nuniformpercell") {
@@ -301,7 +148,12 @@ PlasmaInjector::PlasmaInjector(int ispecies, const std::string& name)
 #if ( AMREX_SPACEDIM == 2 )
         num_particles_per_cell_each_dim[2] = 1;
 #endif
-        part_pos.reset(new RegularPosition(num_particles_per_cell_each_dim));
+        // Construct InjectorPosition from InjectorPositionRegular.
+        inj_pos.reset(new InjectorPosition((InjectorPositionRegular*)nullptr,
+                                           xmin, xmax, ymin, ymax, zmin, zmax,
+                                           Dim3{num_particles_per_cell_each_dim[0],
+                                                num_particles_per_cell_each_dim[1],
+                                                num_particles_per_cell_each_dim[2]}));
         num_particles_per_cell = num_particles_per_cell_each_dim[0] *
                                  num_particles_per_cell_each_dim[1] *
                                  num_particles_per_cell_each_dim[2];
@@ -310,52 +162,75 @@ PlasmaInjector::PlasmaInjector(int ispecies, const std::string& name)
     } else {
         StringParseAbortMessage("Injection style", part_pos_s);
     }
+}
 
-    pp.query("radially_weighted", radially_weighted);
-    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(radially_weighted, "ERROR: Only radially_weighted=true is supported");
-
-    // parse plasma boundaries
-    xmin = std::numeric_limits<amrex::Real>::lowest();
-    ymin = std::numeric_limits<amrex::Real>::lowest();
-    zmin = std::numeric_limits<amrex::Real>::lowest();
-
-    xmax = std::numeric_limits<amrex::Real>::max();
-    ymax = std::numeric_limits<amrex::Real>::max();
-    zmax = std::numeric_limits<amrex::Real>::max();
+namespace {
+WarpXParser makeParser (std::string const& parse_function)
+{
+    WarpXParser parser(parse_function);
+    parser.registerVariables({"x","y","z"});
 
-    pp.query("xmin", xmin);
-    pp.query("ymin", ymin);
-    pp.query("zmin", zmin);
-    pp.query("xmax", xmax);
-    pp.query("ymax", ymax);
-    pp.query("zmax", zmax);
+    ParmParse pp("my_constants");
+    std::set<std::string> symbols = parser.symbols();
+    symbols.erase("x");
+    symbols.erase("y");
+    symbols.erase("z"); // after removing variables, we are left with constants
+    for (auto it = symbols.begin(); it != symbols.end(); ) {
+        Real v;
+        if (pp.query(it->c_str(), v)) {
+            parser.setConstant(*it, v);
+            it = symbols.erase(it);
+        } else {
+            ++it;
+        }
+    }
+    for (auto const& s : symbols) { // make sure there no unknown symbols
+        amrex::Abort("PlasmaInjector::makeParser: Unknown symbol "+s);
+    }
 
+    return parser;
+}
 }
 
-void PlasmaInjector::parseDensity(ParmParse pp){
+// Depending on injection type at runtime, initialize inj_rho
+// so that inj_rho->getDensity calls
+// InjectorPosition[Constant or Custom or etc.].getDensity.
+void PlasmaInjector::parseDensity (ParmParse& pp)
+{
     // parse density information
     std::string rho_prof_s;
     pp.get("profile", rho_prof_s);
-    std::transform(rho_prof_s.begin(),
-                   rho_prof_s.end(),
-                   rho_prof_s.begin(),
-                   ::tolower);
+    std::transform(rho_prof_s.begin(), rho_prof_s.end(),
+                   rho_prof_s.begin(), ::tolower);
     if (rho_prof_s == "constant") {
         pp.get("density", density);
-        rho_prof.reset(new ConstantDensityProfile(density));
+        // Construct InjectorDensity with InjectorDensityConstant.
+        inj_rho.reset(new InjectorDensity((InjectorDensityConstant*)nullptr, density));
     } else if (rho_prof_s == "custom") {
-        rho_prof.reset(new CustomDensityProfile(species_name));
+        // Construct InjectorDensity with InjectorDensityCustom.
+        inj_rho.reset(new InjectorDensity((InjectorDensityCustom*)nullptr, species_name));
     } else if (rho_prof_s == "predefined") {
-        rho_prof.reset(new PredefinedDensityProfile(species_name));
+        // Construct InjectorDensity with InjectorDensityPredefined.
+        inj_rho.reset(new InjectorDensity((InjectorDensityPredefined*)nullptr,species_name));
     } else if (rho_prof_s == "parse_density_function") {
-        pp.get("density_function(x,y,z)", str_density_function);
-        rho_prof.reset(new ParseDensityProfile(str_density_function));
+        std::vector<std::string> f;
+        pp.getarr("density_function(x,y,z)", f);
+        for (auto const& s : f) {
+            str_density_function += s;
+        }
+        // Construct InjectorDensity with InjectorDensityParser.
+        inj_rho.reset(new InjectorDensity((InjectorDensityParser*)nullptr,
+                                          makeParser(str_density_function)));
     } else {
         StringParseAbortMessage("Density profile type", rho_prof_s);
     }
 }
 
-void PlasmaInjector::parseMomentum(ParmParse pp){
+// Depending on injection type at runtime, initialize inj_mom
+// so that inj_mom->getMomentum calls
+// InjectorMomentum[Constant or Custom or etc.].getMomentum.
+void PlasmaInjector::parseMomentum (ParmParse& pp)
+{
     // parse momentum information
     std::string mom_dist_s;
     pp.get("momentum_distribution_type", mom_dist_s);
@@ -370,9 +245,11 @@ void PlasmaInjector::parseMomentum(ParmParse pp){
         pp.query("ux", ux);
         pp.query("uy", uy);
         pp.query("uz", uz);
-        mom_dist.reset(new ConstantMomentumDistribution(ux, uy, uz));
+        // Construct InjectorMomentum with InjectorMomentumConstant.
+        inj_mom.reset(new InjectorMomentum((InjectorMomentumConstant*)nullptr, ux,uy, uz));
     } else if (mom_dist_s == "custom") {
-        mom_dist.reset(new CustomMomentumDistribution(species_name));
+        // Construct InjectorMomentum with InjectorMomentumCustom.
+        inj_mom.reset(new InjectorMomentum((InjectorMomentumCustom*)nullptr, species_name));
     } else if (mom_dist_s == "gaussian") {
         Real ux_m = 0.;
         Real uy_m = 0.;
@@ -386,42 +263,68 @@ void PlasmaInjector::parseMomentum(ParmParse pp){
         pp.query("ux_th", ux_th);
         pp.query("uy_th", uy_th);
         pp.query("uz_th", uz_th);
-        mom_dist.reset(new GaussianRandomMomentumDistribution(ux_m, uy_m, uz_m, 
-                                                              ux_th, uy_th, uz_th));
+        // Construct InjectorMomentum with InjectorMomentumGaussian.
+        inj_mom.reset(new InjectorMomentum((InjectorMomentumGaussian*)nullptr,
+                                           ux_m, uy_m, uz_m, ux_th, uy_th, uz_th));
     } else if (mom_dist_s == "radial_expansion") {
         Real u_over_r = 0.;
         pp.query("u_over_r", u_over_r);
-        mom_dist.reset(new RadialExpansionMomentumDistribution(u_over_r));
+        // Construct InjectorMomentum with InjectorMomentumRadialExpansion.
+        inj_mom.reset(new InjectorMomentum
+                      ((InjectorMomentumRadialExpansion*)nullptr, u_over_r));
     } else if (mom_dist_s == "parse_momentum_function") {
-        pp.get("momentum_function_ux(x,y,z)", str_momentum_function_ux);
-        pp.get("momentum_function_uy(x,y,z)", str_momentum_function_uy);
-        pp.get("momentum_function_uz(x,y,z)", str_momentum_function_uz);
-        mom_dist.reset(new ParseMomentumFunction(str_momentum_function_ux, 
-                                                 str_momentum_function_uy, 
-                                                 str_momentum_function_uz));
+        std::vector<std::string> f;
+        pp.getarr("momentum_function_ux(x,y,z)", f);
+        for (auto const& s : f) {
+            str_momentum_function_ux += s;
+        }
+        f.clear();
+        pp.getarr("momentum_function_uy(x,y,z)", f);
+        for (auto const& s : f) {
+            str_momentum_function_uy += s;
+        }
+        f.clear();
+        pp.getarr("momentum_function_uz(x,y,z)", f);
+        for (auto const& s : f) {
+            str_momentum_function_uz += s;
+        }
+        // Construct InjectorMomentum with InjectorMomentumParser.
+        inj_mom.reset(new InjectorMomentum((InjectorMomentumParser*)nullptr,
+                                           makeParser(str_momentum_function_ux),
+                                           makeParser(str_momentum_function_uy),
+                                           makeParser(str_momentum_function_uz)));
     } else {
         StringParseAbortMessage("Momentum distribution type", mom_dist_s);
     }
 }
 
-void PlasmaInjector::getPositionUnitBox(vec3& r, int i_part, int ref_fac) {
-    return part_pos->getPositionUnitBox(r, i_part, ref_fac);
+XDim3 PlasmaInjector::getMomentum (Real x, Real y, Real z) const noexcept
+{
+    return inj_mom->getMomentum(x, y, z); // gamma*beta
+}
+
+bool PlasmaInjector::insideBounds (Real x, Real y, Real z) const noexcept
+{
+    return (x < xmax and x >= xmin and
+            y < ymax and y >= ymin and
+            z < zmax and z >= zmin);
 }
 
-void PlasmaInjector::getMomentum(vec3& u, Real x, Real y, Real z) {
-    mom_dist->getMomentum(u, x, y, z);
-    u[0] *= PhysConst::c;
-    u[1] *= PhysConst::c;
-    u[2] *= PhysConst::c;
+InjectorPosition*
+PlasmaInjector::getInjectorPosition ()
+{
+    return inj_pos.get();
 }
 
-bool PlasmaInjector::insideBounds(Real x, Real y, Real z) {
-  if (x >= xmax || x < xmin ||
-      y >= ymax || y < ymin ||
-      z >= zmax || z < zmin ) return false;
-  return true;
+InjectorDensity*
+PlasmaInjector::getInjectorDensity ()
+{
+    return inj_rho.get();
 }
 
-Real PlasmaInjector::getDensity(Real x, Real y, Real z) {
-    return rho_prof->getDensity(x, y, z);
+InjectorMomentum*
+PlasmaInjector::getInjectorMomentum ()
+{
+    return inj_mom.get();
 }
+
diff --git a/Source/Initialization/PlasmaProfiles.cpp b/Source/Initialization/PlasmaProfiles.cpp
deleted file mode 100644
index d9d207f7e..000000000
--- a/Source/Initialization/PlasmaProfiles.cpp
+++ /dev/null
@@ -1,41 +0,0 @@
-#include <PlasmaInjector.H>
-#include <cmath>
-#include <iostream>
-#include <WarpXConst.H>
-
-using namespace amrex;
-
-Real PredefinedDensityProfile::getDensity(Real x, Real y, Real z) const {
-    Real n;
-    if ( which_profile == predefined_profile_flag::parabolic_channel ) {
-        n = ParabolicChannel(x,y,z);
-    }
-    return n;
-}
-
-///
-/// plateau between linear upramp and downramp, and parab transverse profile
-///
-Real PredefinedDensityProfile::ParabolicChannel(Real x, Real y, Real z) const {
-    //  params = [z_start   ramp_up   plateau   ramp_down   rc       n0]
-    Real z_start   = params[0];
-    Real ramp_up   = params[1];
-    Real plateau   = params[2];
-    Real ramp_down = params[3];
-    Real rc        = params[4];
-    Real n0        = params[5];
-    Real n;
-    Real kp = PhysConst::q_e/PhysConst::c*sqrt( n0/(PhysConst::m_e*PhysConst::ep0) );
-
-    if        ((z-z_start)>=0               and (z-z_start)<ramp_up                  ) {
-        n = (z-z_start)/ramp_up;
-    } else if ((z-z_start)>=ramp_up         and (z-z_start)<ramp_up+plateau          ) {
-        n = 1;
-    } else if ((z-z_start)>=ramp_up+plateau and (z-z_start)<ramp_up+plateau+ramp_down) {
-        n = 1-((z-z_start)-ramp_up-plateau)/ramp_down;
-    } else {
-        n = 0;
-    }
-    n *= n0*(1+4*(x*x+y*y)/(kp*kp*std::pow(rc,4)));
-    return n;
-}
diff --git a/Source/Initialization/WarpXInitData.cpp b/Source/Initialization/WarpXInitData.cpp
index 2442e0205..590c11b84 100644
--- a/Source/Initialization/WarpXInitData.cpp
+++ b/Source/Initialization/WarpXInitData.cpp
@@ -1,6 +1,4 @@
 
-#include <numeric>
-
 #include <AMReX_ParallelDescriptor.H>
 #include <AMReX_ParmParse.H>
 
@@ -88,7 +86,7 @@ WarpX::InitDiagnostics () {
         const Real* current_lo = geom[0].ProbLo();
         const Real* current_hi = geom[0].ProbHi();
         Real dt_boost = dt[0];
-        
+
 	// Find the positions of the lab-frame box that corresponds to the boosted-frame box at t=0
 	Real zmin_lab = current_lo[moving_window_dir]/( (1.+beta_boost)*gamma_boost );
 	Real zmax_lab = current_hi[moving_window_dir]/( (1.+beta_boost)*gamma_boost );
@@ -97,7 +95,7 @@ WarpX::InitDiagnostics () {
 					       zmax_lab,
                                                moving_window_v, dt_snapshots_lab,
                                                num_snapshots_lab, gamma_boost,
-                                               t_new[0], dt_boost, 
+                                               t_new[0], dt_boost,
                                                moving_window_dir, geom[0]));
     }
 }
@@ -118,10 +116,10 @@ WarpX::InitFromScratch ()
 
     InitPML();
 
-#ifdef WARPX_DO_ELECTROSTATIC    
+#ifdef WARPX_DO_ELECTROSTATIC
     if (do_electrostatic) {
         getLevelMasks(masks);
-        
+
         // the plus one is to convert from num_cells to num_nodes
         getLevelMasks(gather_masks, n_buffer + 1);
     }
@@ -133,14 +131,35 @@ WarpX::InitPML ()
 {
     if (do_pml)
     {
+        amrex::IntVect do_pml_Lo_corrected = do_pml_Lo;
+
+#ifdef WARPX_DIM_RZ
+        do_pml_Lo_corrected[0] = 0; // no PML at r=0, in cylindrical geometry
+#endif
         pml[0].reset(new PML(boxArray(0), DistributionMap(0), &Geom(0), nullptr,
-                             pml_ncell, pml_delta, 0, do_dive_cleaning, do_moving_window));
+                             pml_ncell, pml_delta, 0,
+#ifdef WARPX_USE_PSATD
+                             dt[0], nox_fft, noy_fft, noz_fft, do_nodal,
+#endif
+                             do_dive_cleaning, do_moving_window,
+                             do_pml_Lo_corrected, do_pml_Hi));
         for (int lev = 1; lev <= finest_level; ++lev)
         {
+            amrex::IntVect do_pml_Lo_MR = amrex::IntVect::TheUnitVector();
+#ifdef WARPX_DIM_RZ
+            //In cylindrical geometry, if the edge of the patch is at r=0, do not add PML
+            if ((max_level > 0) && (fine_tag_lo[0]==0.)) {
+                do_pml_Lo_MR[0] = 0;
+            }
+#endif
             pml[lev].reset(new PML(boxArray(lev), DistributionMap(lev),
                                    &Geom(lev), &Geom(lev-1),
-                                   pml_ncell, pml_delta, refRatio(lev-1)[0], do_dive_cleaning,
-                                   do_moving_window));
+                                   pml_ncell, pml_delta, refRatio(lev-1)[0],
+#ifdef WARPX_USE_PSATD
+                                   dt[lev], nox_fft, noy_fft, noz_fft, do_nodal,
+#endif
+                                   do_dive_cleaning, do_moving_window,
+                                   do_pml_Lo_MR, amrex::IntVect::TheUnitVector()));
         }
     }
 }
@@ -226,7 +245,7 @@ WarpX::InitOpenbc ()
     Vector<int> alllohi(6*nprocs,100000);
 
     MPI_Allgather(lohi, 6, MPI_INT, alllohi.data(), 6, MPI_INT, ParallelDescriptor::Communicator());
-    
+
     BoxList bl{IndexType::TheNodeType()};
     for (int i = 0; i < nprocs; ++i)
     {
@@ -252,7 +271,7 @@ WarpX::InitOpenbc ()
     rho_openbc.copy(*rho, 0, 0, 1, rho->nGrow(), 0, gm.periodicity(), FabArrayBase::ADD);
 
     const Real* dx = gm.CellSize();
-    
+
     warpx_openbc_potential(rho_openbc[myproc].dataPtr(), phi_openbc[myproc].dataPtr(), dx);
 
     BoxArray nba = boxArray(lev);
@@ -322,7 +341,7 @@ WarpX::InitLevelData (int lev, Real time)
 void
 WarpX::InitLevelDataFFT (int lev, Real time)
 {
- 
+
     Efield_fp_fft[lev][0]->setVal(0.0);
     Efield_fp_fft[lev][1]->setVal(0.0);
     Efield_fp_fft[lev][2]->setVal(0.0);
diff --git a/Source/Laser/LaserParticleContainer.cpp b/Source/Laser/LaserParticleContainer.cpp
index 3d3447a3c..786ebc622 100644
--- a/Source/Laser/LaserParticleContainer.cpp
+++ b/Source/Laser/LaserParticleContainer.cpp
@@ -453,7 +453,12 @@ LaserParticleContainer::Evolve (int lev,
             pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
             BL_PROFILE_VAR_STOP(blp_copy);
 
-            if (rho) DepositCharge(pti, wp, rho, crho, 0, np_current, np, thread_num, lev);
+            if (rho) {
+                DepositCharge(pti, wp, rho, 0, 0, np_current, thread_num, lev, lev);
+                if (crho) {
+                    DepositCharge(pti, wp, crho, 0, np_current, np-np_current, thread_num, lev, lev-1);
+                }
+            }
 
             //
             // Particle Push
@@ -504,15 +509,15 @@ LaserParticleContainer::Evolve (int lev,
             // Current Deposition
             //
             // Deposit inside domains
-            DepositCurrentFortran(pti, wp, uxp, uyp, uzp, &jx, &jy, &jz,
-                                  0, np_current, thread_num,
-                                  lev, lev, dt);
+            DepositCurrent(pti, wp, uxp, uyp, uzp, &jx, &jy, &jz,
+                           0, np_current, thread_num,
+                           lev, lev, dt);
             bool has_buffer = cjx;
             if (has_buffer){
                 // Deposit in buffers
-                DepositCurrentFortran(pti, wp, uxp, uyp, uzp, cjx, cjy, cjz,
-                                      np_current, np-np_current, thread_num,
-                                      lev, lev-1, dt);
+                DepositCurrent(pti, wp, uxp, uyp, uzp, cjx, cjy, cjz,
+                               np_current, np-np_current, thread_num,
+                               lev, lev-1, dt);
             }
 
             //
@@ -522,7 +527,12 @@ LaserParticleContainer::Evolve (int lev,
             pti.SetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
             BL_PROFILE_VAR_STOP(blp_copy);
 
-            if (rho) DepositCharge(pti, wp, rho, crho, 1, np_current, np, thread_num, lev);
+            if (rho) {
+                DepositCharge(pti, wp, rho, 1, 0, np_current, thread_num, lev, lev);
+                if (crho) {
+                    DepositCharge(pti, wp, crho, 1, np_current, np-np_current, thread_num, lev, lev-1);
+                }
+            }
 
             if (cost) {
                 const Box& tbx = pti.tilebox();
diff --git a/Source/Make.WarpX b/Source/Make.WarpX
index 3060ae8f0..e3a33a00f 100644
--- a/Source/Make.WarpX
+++ b/Source/Make.WarpX
@@ -97,16 +97,24 @@ ifeq ($(USE_OPENBC_POISSON),TRUE)
 endif
 
 ifeq ($(USE_OPENPMD), TRUE)
-   OPENPMD_LIB_PATH ?= NOT_SET
-   ifneq ($(OPENPMD_LIB_PATH),NOT_SET)
-     LIBRARY_LOCATIONS += $(OPENPMD_LIB_PATH)
+   # try pkg-config query
+   ifeq (0, $(shell pkg-config "openPMD >= 0.9.0"; echo $$?))
+       CXXFLAGS += $(shell pkg-config --cflags openPMD)
+       LDFLAGS += $(shell pkg-config --libs openPMD)
+       LDFLAGS += -Xlinker -rpath -Xlinker $(shell pkg-config --variable=libdir openPMD)
+   # fallback to manual settings
+   else
+       OPENPMD_LIB_PATH ?= NOT_SET
+       ifneq ($(OPENPMD_LIB_PATH),NOT_SET)
+         LIBRARY_LOCATIONS += $(OPENPMD_LIB_PATH)
+       endif
+       OPENPMD_INCLUDE_PATH ?= NOT_SET
+       ifneq ($(OPENPMD_INCLUDE_PATH),NOT_SET)
+         INCLUDE_LOCATIONS += $(OPENPMD_INCLUDE_PATH)
+       endif
+       libraries += -lopenPMD
    endif
-   OPENPMD_INCLUDE_PATH ?= NOT_SET
-   ifneq ($(OPENPMD_INCLUDE_PATH),NOT_SET)
-     INCLUDE_LOCATIONS += $(OPENPMD_INCLUDE_PATH)
-   endif
-   DEFINES += -DWARPX_USE_OPENPMD -DopenPMD_HAVE_MPI=1
-   LIBRARIES += -lopenPMD -lhdf5
+   DEFINES += -DWARPX_USE_OPENPMD
 endif
    
 
@@ -115,7 +123,7 @@ ifeq ($(USE_PSATD),TRUE)
   DEFINES += -DWARPX_USE_PSATD
   ifeq ($(USE_CUDA),FALSE) # Running on CPU
      # Use FFTW
-     LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads
+     libraries += -lfftw3_mpi -lfftw3 -lfftw3_threads
      FFTW_HOME ?= NOT_SET
      ifneq ($(FFTW_HOME),NOT_SET)
        VPATH_LOCATIONS += $(FFTW_HOME)/include
@@ -127,13 +135,12 @@ ifeq ($(USE_PSATD),TRUE)
      DEFINES += -DFFTW # PICSAR uses it
   else
     # Use cuFFT
-    LIBRARIES += -lcufft
+    libraries += -lcufft
   endif
 endif
 
 ifeq ($(USE_RZ),TRUE)
   USERSuffix := $(USERSuffix).RZ
-  DEFINES += -DWARPX_RZ
 endif
 
 ifeq ($(DO_ELECTROSTATIC),TRUE)
@@ -151,7 +158,7 @@ ifeq ($(USE_HDF5),TRUE)
         LIBRARY_LOCATIONS += $(HDF5_HOME)/lib
     endif
     DEFINES += -DWARPX_USE_HDF5
-    LIBRARIES += -lhdf5 -lz
+    libraries += -lhdf5 -lz
 endif     
 
 # job_info support
diff --git a/Source/Parser/GpuParser.H b/Source/Parser/GpuParser.H
new file mode 100644
index 000000000..1533ee6b9
--- /dev/null
+++ b/Source/Parser/GpuParser.H
@@ -0,0 +1,72 @@
+#ifndef WARPX_GPU_PARSER_H_
+#define WARPX_GPU_PARSER_H_
+
+#include <WarpXParser.H>
+#include <AMReX_Gpu.H>
+
+// When compiled for CPU, wrap WarpXParser and enable threading.
+// When compiled for GPU, store one copy of the parser in 
+// CUDA managed memory for __device__ code, and one copy of the parser
+// in CUDA managed memory for __host__ code. This way, the parser can be
+// efficiently called from both host and device.
+class GpuParser
+{
+public:
+    GpuParser (WarpXParser const& wp);
+    void clear ();
+
+    AMREX_GPU_HOST_DEVICE
+    double
+    operator() (double x, double y, double z) const noexcept
+    {
+#ifdef AMREX_USE_GPU
+
+#ifdef AMREX_DEVICE_COMPILE
+// WarpX compiled for GPU, function compiled for __device__
+        // the 3D position of each particle is stored in shared memory.
+        amrex::Gpu::SharedMemory<double> gsm;
+        double* p = gsm.dataPtr();
+        int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y);
+        p[tid*3] = x;
+        p[tid*3+1] = y;
+        p[tid*3+2] = z;
+        return wp_ast_eval(m_gpu_parser.ast);
+#else
+// WarpX compiled for GPU, function compiled for __host__
+        m_var.x = x;
+        m_var.y = y;
+        m_var.z = z;
+        return wp_ast_eval(m_cpu_parser.ast);
+#endif
+
+#else
+// WarpX compiled for CPU
+#ifdef _OPENMP
+        int tid = omp_get_thread_num();
+#else
+        int tid = 0;
+#endif
+        m_var[tid].x = x;
+        m_var[tid].y = y;
+        m_var[tid].z = z;
+        return wp_ast_eval(m_parser[tid]->ast);
+#endif
+    }
+
+private:
+
+#ifdef AMREX_USE_GPU
+    // Copy of the parser running on __device__
+    struct wp_parser m_gpu_parser;
+    // Copy of the parser running on __host__
+    struct wp_parser m_cpu_parser;
+    mutable amrex::XDim3 m_var;
+#else
+    // Only one parser
+    struct wp_parser** m_parser;
+    mutable amrex::XDim3* m_var;
+    int nthreads;
+#endif
+};
+
+#endif
diff --git a/Source/Parser/GpuParser.cpp b/Source/Parser/GpuParser.cpp
new file mode 100644
index 000000000..db1c2287d
--- /dev/null
+++ b/Source/Parser/GpuParser.cpp
@@ -0,0 +1,73 @@
+#include <GpuParser.H>
+
+GpuParser::GpuParser (WarpXParser const& wp)
+{
+#ifdef AMREX_USE_GPU
+
+    struct wp_parser* a_wp = wp.m_parser;
+    // Initialize GPU parser: allocate memory in CUDA managed memory,
+    // copy all data needed on GPU to m_gpu_parser
+    m_gpu_parser.sz_mempool = wp_ast_size(a_wp->ast);
+    m_gpu_parser.p_root = (struct wp_node*)
+        amrex::The_Managed_Arena()->alloc(m_gpu_parser.sz_mempool);
+    m_gpu_parser.p_free = m_gpu_parser.p_root;
+    // 0: don't free the source
+    m_gpu_parser.ast = wp_parser_ast_dup(&m_gpu_parser, a_wp->ast, 0);
+    wp_parser_regvar_gpu(&m_gpu_parser, "x", 0);
+    wp_parser_regvar_gpu(&m_gpu_parser, "y", 1);
+    wp_parser_regvar_gpu(&m_gpu_parser, "z", 2);
+
+    // Initialize CPU parser: allocate memory in CUDA managed memory,
+    // copy all data needed on CPU to m_cpu_parser
+    m_cpu_parser.sz_mempool = wp_ast_size(a_wp->ast);
+    m_cpu_parser.p_root = (struct wp_node*)
+        amrex::The_Managed_Arena()->alloc(m_cpu_parser.sz_mempool);
+    m_cpu_parser.p_free = m_cpu_parser.p_root;
+    // 0: don't free the source
+    m_cpu_parser.ast = wp_parser_ast_dup(&m_cpu_parser, a_wp->ast, 0);
+    wp_parser_regvar(&m_cpu_parser, "x", &(m_var.x));
+    wp_parser_regvar(&m_cpu_parser, "y", &(m_var.y));
+    wp_parser_regvar(&m_cpu_parser, "z", &(m_var.z));
+    
+#else // not defined AMREX_USE_GPU
+
+#ifdef _OPENMP
+    nthreads = omp_get_max_threads();
+#else // _OPENMP
+    nthreads = 1;
+#endif // _OPENMP
+
+    m_parser = ::new struct wp_parser*[nthreads];
+    m_var = ::new amrex::XDim3[nthreads];
+
+    for (int tid = 0; tid < nthreads; ++tid)
+    {
+#ifdef _OPENMP
+        m_parser[tid] = wp_parser_dup(wp.m_parser[tid]);
+#else // _OPENMP
+        m_parser[tid] = wp_parser_dup(wp.m_parser);
+#endif // _OPENMP
+        wp_parser_regvar(m_parser[tid], "x", &(m_var[tid].x));
+        wp_parser_regvar(m_parser[tid], "y", &(m_var[tid].y));
+        wp_parser_regvar(m_parser[tid], "z", &(m_var[tid].z));
+    }
+
+#endif // AMREX_USE_GPU
+}
+
+void
+GpuParser::clear ()
+{
+#ifdef AMREX_USE_GPU
+    amrex::The_Managed_Arena()->free(m_gpu_parser.ast);
+    amrex::The_Managed_Arena()->free(m_cpu_parser.ast);
+#else
+    for (int tid = 0; tid < nthreads; ++tid)
+    {
+        wp_parser_delete(m_parser[tid]);
+    }
+    ::delete[] m_parser;
+    ::delete[] m_var;
+#endif
+}
+
diff --git a/Source/Parser/Make.package b/Source/Parser/Make.package
index 26ef4fb43..5ce02cbda 100644
--- a/Source/Parser/Make.package
+++ b/Source/Parser/Make.package
@@ -3,6 +3,8 @@ cEXE_sources += wp_parser_y.c wp_parser.tab.c wp_parser.lex.c wp_parser_c.c
 cEXE_headers += wp_parser_y.h wp_parser.tab.h wp_parser.lex.h wp_parser_c.h
 CEXE_sources += WarpXParser.cpp
 CEXE_headers += WarpXParser.H
+CEXE_headers += GpuParser.H
+CEXE_sources += GpuParser.cpp
 
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Parser
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Parser
diff --git a/Source/Parser/WarpXParser.H b/Source/Parser/WarpXParser.H
index 046491e29..ffa61e457 100644
--- a/Source/Parser/WarpXParser.H
+++ b/Source/Parser/WarpXParser.H
@@ -13,6 +13,8 @@
 #include <omp.h>
 #endif
 
+class GpuParser;
+
 class WarpXParser
 {
 public:
@@ -46,6 +48,8 @@ public:
 
     std::set<std::string> symbols () const;
 
+    friend class GpuParser;
+
 private:
     void clear ();
 
diff --git a/Source/Parser/wp_parser_c.h b/Source/Parser/wp_parser_c.h
index d810bd685..3aafdec65 100644
--- a/Source/Parser/wp_parser_c.h
+++ b/Source/Parser/wp_parser_c.h
@@ -2,6 +2,8 @@
 #define WP_PARSER_C_H_
 
 #include "wp_parser_y.h"
+#include <AMReX_GpuQualifiers.H>
+#include <AMReX_Extension.H>
 
 #ifdef __cplusplus
 extern "C" {
@@ -18,71 +20,167 @@ extern "C" {
 #include <set>
 #include <string>
 
-inline
-double
+AMREX_GPU_HOST_DEVICE
+inline double
 wp_ast_eval (struct wp_node* node)
 {
     double result;
 
+#ifdef AMREX_DEVICE_COMPILE
+    extern __shared__ double extern_xyz[];
+    int tid = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y);
+    double* x = extern_xyz + tid*3;
+#endif
+
     switch (node->type)
     {
     case WP_NUMBER:
+    {
         result = ((struct wp_number*)node)->value;
         break;
+    }
     case WP_SYMBOL:
-        result = *(((struct wp_symbol*)node)->pointer);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i =((struct wp_symbol*)node)->ip.i;
+        result = x[i];
+#else
+        result = *(((struct wp_symbol*)node)->ip.p);
+#endif
         break;
+    }
     case WP_ADD:
+    {
         result = wp_ast_eval(node->l) + wp_ast_eval(node->r);
         break;
+    }
     case WP_SUB:
+    {
         result = wp_ast_eval(node->l) - wp_ast_eval(node->r);
         break;
+    }
     case WP_MUL:
+    {
         result = wp_ast_eval(node->l) * wp_ast_eval(node->r);
         break;
+    }
     case WP_DIV:
+    {
         result = wp_ast_eval(node->l) / wp_ast_eval(node->r);
         break;
+    }
     case WP_NEG:
+    {
         result = -wp_ast_eval(node->l);
         break;
+    }
     case WP_F1:
+    {
         result = wp_call_f1(((struct wp_f1*)node)->ftype,
                 wp_ast_eval(((struct wp_f1*)node)->l));
         break;
+    }
     case WP_F2:
+    {
         result = wp_call_f2(((struct wp_f2*)node)->ftype,
                 wp_ast_eval(((struct wp_f2*)node)->l),
                 wp_ast_eval(((struct wp_f2*)node)->r));
         break;
+    }
     case WP_ADD_VP:
-        result = node->lvp.v + *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->rip.i;
+        result = node->lvp.v + x[i];
+#else
+        result = node->lvp.v + *(node->rip.p);
+#endif
         break;
+    }
     case WP_ADD_PP:
-        result = *(node->lvp.p) + *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->lvp.ip.i;
+        int j = node->rip.i;
+        result = x[i] + x[j];
+#else
+        result = *(node->lvp.ip.p) + *(node->rip.p);
+#endif
         break;
+    }
     case WP_SUB_VP:
-        result = node->lvp.v - *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->rip.i;
+        result = node->lvp.v - x[i];
+#else
+        result = node->lvp.v - *(node->rip.p);
+#endif
         break;
+    }
     case WP_SUB_PP:
-        result = *(node->lvp.p) - *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->lvp.ip.i;
+        int j = node->rip.i;
+        result = x[i] - x[j];
+#else
+        result = *(node->lvp.ip.p) - *(node->rip.p);
+#endif
         break;
+    }
     case WP_MUL_VP:
-        result = node->lvp.v * *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->rip.i;
+        result = node->lvp.v * x[i];
+#else
+        result = node->lvp.v * *(node->rip.p);
+#endif
         break;
+    }
     case WP_MUL_PP:
-        result = *(node->lvp.p) * *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->lvp.ip.i;
+        int j = node->rip.i;
+        result = x[i] * x[j];
+#else
+        result = *(node->lvp.ip.p) * *(node->rip.p);
+#endif
         break;
+    }
     case WP_DIV_VP:
-        result = node->lvp.v / *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->rip.i;
+        result = node->lvp.v / x[i];
+#else
+        result = node->lvp.v / *(node->rip.p);
+#endif
         break;
+    }
     case WP_DIV_PP:
-        result = *(node->lvp.p) / *(node->rp);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->lvp.ip.i;
+        int j = node->rip.i;
+        result = x[i] / x[j];
+#else
+        result = *(node->lvp.ip.p) / *(node->rip.p);
+#endif
         break;
+    }
     case WP_NEG_P:
-        result = -*(node->lvp.p);
+    {
+#ifdef AMREX_DEVICE_COMPILE
+        int i = node->rip.i;
+        result = -x[i];
+#else
+        result = -*(node->lvp.ip.p);
+#endif
         break;
+    }
     default:
         yyerror("wp_ast_eval: unknown node type %d\n", node->type);
     }
diff --git a/Source/Parser/wp_parser_y.c b/Source/Parser/wp_parser_y.c
index 46cb199db..259f9368b 100644
--- a/Source/Parser/wp_parser_y.c
+++ b/Source/Parser/wp_parser_y.c
@@ -6,6 +6,8 @@
 #include "wp_parser_y.h"
 #include "wp_parser.tab.h"
 
+#include <AMReX_GpuQualifiers.H>
+
 static struct wp_node* wp_root = NULL;
 
 /* This is called by a bison rule to store the original AST in a
@@ -33,7 +35,7 @@ wp_makesymbol (char* name)
     struct wp_symbol* symbol = (struct wp_symbol*) malloc(sizeof(struct wp_symbol));
     symbol->type = WP_SYMBOL;
     symbol->name = strdup(name);
-    symbol->pointer = NULL;
+    symbol->ip.p = NULL;
     return symbol;
 }
 
@@ -74,13 +76,19 @@ wp_newf2 (enum wp_f2_t ftype, struct wp_node* l, struct wp_node* r)
     return (struct wp_node*) tmp;
 }
 
+AMREX_GPU_HOST_DEVICE
 void
 yyerror (char const *s, ...)
 {
     va_list vl;
     va_start(vl, s);
+#ifdef AMREX_DEVICE_COMPILE
+    printf(s,"\n");
+    assert(0);
+#else
     vfprintf(stderr, s, vl);
     fprintf(stderr, "\n");
+#endif
     va_end(vl);
 }
 
@@ -97,7 +105,7 @@ wp_parser_new (void)
 
     my_parser->ast = wp_parser_ast_dup(my_parser, wp_root,1); /* 1: free the source wp_root */
 
-    if (my_parser->p_root + my_parser->sz_mempool != my_parser->p_free) {
+    if ((char*)my_parser->p_root + my_parser->sz_mempool != (char*)my_parser->p_free) {
         yyerror("wp_parser_new: error in memory size");
         exit(1);
     }
@@ -145,6 +153,7 @@ wp_parser_dup (struct wp_parser* source)
     return dest;
 }
 
+AMREX_GPU_HOST_DEVICE
 double
 wp_call_f1 (enum wp_f1_t type, double a)
 {
@@ -175,6 +184,7 @@ wp_call_f1 (enum wp_f1_t type, double a)
     }
 }
 
+AMREX_GPU_HOST_DEVICE
 double
 wp_call_f2 (enum wp_f2_t type, double a, double b)
 {
@@ -346,23 +356,23 @@ wp_parser_ast_dup (struct wp_parser* my_parser, struct wp_node* node, int move)
 
 #define WP_MOVEUP_R(node, v) \
     struct wp_node* n = node->r->r; \
-    double* p = node->r->rp; \
+    double* p = node->r->rip.p; \
     node->r = n; \
     node->lvp.v = v; \
-    node->rp = p;
+    node->rip.p = p;
 #define WP_MOVEUP_L(node, v) \
     struct wp_node* n = node->l->r; \
-    double* p = node->l->rp; \
+    double* p = node->l->rip.p; \
     node->r = n; \
     node->lvp.v = v; \
-    node->rp = p;
+    node->rip.p = p;
 #define WP_EVAL_R(node) node->r->lvp.v
 #define WP_EVAL_L(node) node->l->lvp.v
 
 #define WP_NEG_MOVEUP(node) \
     node->r = node->l->r; \
     node->lvp.v = -node->l->lvp.v; \
-    node->rp = node->l->rp;
+    node->rip.p = node->l->rip.p;
 
 void
 wp_ast_optimize (struct wp_node* node)
@@ -391,22 +401,22 @@ wp_ast_optimize (struct wp_node* node)
                  node->r->type == WP_SYMBOL)
         {
             node->lvp.v = ((struct wp_number*)(node->l))->value;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_ADD_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_NUMBER)
         {
             node->lvp.v = ((struct wp_number*)(node->r))->value;
-            node->rp = ((struct wp_symbol*)(node->l))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->l))->ip.p;
             node->r = node->l;
             node->type = WP_ADD_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_SYMBOL)
         {
-            node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_ADD_PP;
         }
         else if (node->l->type == WP_NUMBER &&
@@ -454,22 +464,22 @@ wp_ast_optimize (struct wp_node* node)
                  node->r->type == WP_SYMBOL)
         {
             node->lvp.v = ((struct wp_number*)(node->l))->value;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_SUB_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_NUMBER)
         {
             node->lvp.v = -((struct wp_number*)(node->r))->value;
-            node->rp = ((struct wp_symbol*)(node->l))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->l))->ip.p;
             node->r = node->l;
             node->type = WP_ADD_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_SYMBOL)
         {
-            node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_SUB_PP;
         }
         else if (node->l->type == WP_NUMBER &&
@@ -517,22 +527,22 @@ wp_ast_optimize (struct wp_node* node)
                  node->r->type == WP_SYMBOL)
         {
             node->lvp.v = ((struct wp_number*)(node->l))->value;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_MUL_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_NUMBER)
         {
             node->lvp.v = ((struct wp_number*)(node->r))->value;
-            node->rp = ((struct wp_symbol*)(node->l))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->l))->ip.p;
             node->r = node->l;
             node->type = WP_MUL_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_SYMBOL)
         {
-            node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_MUL_PP;
         }
         else if (node->l->type == WP_NUMBER &&
@@ -580,22 +590,22 @@ wp_ast_optimize (struct wp_node* node)
                  node->r->type == WP_SYMBOL)
         {
             node->lvp.v = ((struct wp_number*)(node->l))->value;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_DIV_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_NUMBER)
         {
             node->lvp.v = 1./((struct wp_number*)(node->r))->value;
-            node->rp = ((struct wp_symbol*)(node->l))->pointer;
+            node->rip.p = ((struct wp_symbol*)(node->l))->ip.p;
             node->r = node->l;
             node->type = WP_MUL_VP;
         }
         else if (node->l->type == WP_SYMBOL &&
                  node->r->type == WP_SYMBOL)
         {
-            node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
-            node->rp = ((struct wp_symbol*)(node->r))->pointer;
+            node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
+            node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
             node->type = WP_DIV_PP;
         }
         else if (node->l->type == WP_NUMBER &&
@@ -637,7 +647,7 @@ wp_ast_optimize (struct wp_node* node)
         }
         else if (node->l->type == WP_SYMBOL)
         {
-            node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
+            node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
             node->type = WP_NEG_P;
         }
         else if (node->l->type == WP_ADD_VP)
@@ -936,7 +946,7 @@ wp_ast_regvar (struct wp_node* node, char const* name, double* p)
         break;
     case WP_SYMBOL:
         if (strcmp(name, ((struct wp_symbol*)node)->name) == 0) {
-            ((struct wp_symbol*)node)->pointer = p;
+            ((struct wp_symbol*)node)->ip.p = p;
         }
         break;
     case WP_ADD:
@@ -961,11 +971,11 @@ wp_ast_regvar (struct wp_node* node, char const* name, double* p)
     case WP_MUL_VP:
     case WP_DIV_VP:
         wp_ast_regvar(node->r, name, p);
-        node->rp = ((struct wp_symbol*)(node->r))->pointer;
+        node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
         break;
     case WP_NEG_P:
         wp_ast_regvar(node->l, name, p);
-        node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
+        node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
         break;
     case WP_ADD_PP:
     case WP_SUB_PP:
@@ -973,8 +983,8 @@ wp_ast_regvar (struct wp_node* node, char const* name, double* p)
     case WP_DIV_PP:
         wp_ast_regvar(node->l, name, p);
         wp_ast_regvar(node->r, name, p);
-        node->lvp.p = ((struct wp_symbol*)(node->l))->pointer;
-        node->rp = ((struct wp_symbol*)(node->r))->pointer;
+        node->lvp.ip.p = ((struct wp_symbol*)(node->l))->ip.p;
+        node->rip.p = ((struct wp_symbol*)(node->r))->ip.p;
         break;
     default:
         yyerror("wp_ast_regvar: unknown node type %d\n", node->type);
@@ -982,6 +992,61 @@ wp_ast_regvar (struct wp_node* node, char const* name, double* p)
     }
 }
 
+void
+wp_ast_regvar_gpu (struct wp_node* node, char const* name, int i)
+{
+    switch (node->type)
+    {
+    case WP_NUMBER:
+        break;
+    case WP_SYMBOL:
+        if (strcmp(name, ((struct wp_symbol*)node)->name) == 0) {
+            ((struct wp_symbol*)node)->ip.i = i;
+        }
+        break;
+    case WP_ADD:
+    case WP_SUB:
+    case WP_MUL:
+    case WP_DIV:
+        wp_ast_regvar_gpu(node->l, name, i);
+        wp_ast_regvar_gpu(node->r, name, i);
+        break;
+    case WP_NEG:
+        wp_ast_regvar_gpu(node->l, name, i);
+        break;
+    case WP_F1:
+        wp_ast_regvar_gpu(node->l, name, i);
+        break;
+    case WP_F2:
+        wp_ast_regvar_gpu(node->l, name, i);
+        wp_ast_regvar_gpu(node->r, name, i);
+        break;
+    case WP_ADD_VP:
+    case WP_SUB_VP:
+    case WP_MUL_VP:
+    case WP_DIV_VP:
+        wp_ast_regvar_gpu(node->r, name, i);
+        node->rip.i = ((struct wp_symbol*)(node->r))->ip.i;
+        break;
+    case WP_NEG_P:
+        wp_ast_regvar_gpu(node->l, name, i);
+        node->lvp.ip.i = ((struct wp_symbol*)(node->l))->ip.i;
+        break;
+    case WP_ADD_PP:
+    case WP_SUB_PP:
+    case WP_MUL_PP:
+    case WP_DIV_PP:
+        wp_ast_regvar_gpu(node->l, name, i);
+        wp_ast_regvar_gpu(node->r, name, i);
+        node->lvp.ip.i = ((struct wp_symbol*)(node->l))->ip.i;
+        node->rip.i = ((struct wp_symbol*)(node->r))->ip.i;
+        break;
+    default:
+        yyerror("wp_ast_regvar_gpu: unknown node type %d\n", node->type);
+        exit(1);
+    }
+}
+
 void wp_ast_setconst (struct wp_node* node, char const* name, double c)
 {
     switch (node->type)
@@ -1040,6 +1105,12 @@ wp_parser_regvar (struct wp_parser* parser, char const* name, double* p)
 }
 
 void
+wp_parser_regvar_gpu (struct wp_parser* parser, char const* name, int i)
+{
+    wp_ast_regvar_gpu(parser->ast, name, i);
+}
+
+void
 wp_parser_setconst (struct wp_parser* parser, char const* name, double c)
 {
     wp_ast_setconst(parser->ast, name, c);
diff --git a/Source/Parser/wp_parser_y.h b/Source/Parser/wp_parser_y.h
index 4a3aeda40..8c9f8e4e4 100644
--- a/Source/Parser/wp_parser_y.h
+++ b/Source/Parser/wp_parser_y.h
@@ -1,6 +1,8 @@
 #ifndef WP_PARSER_Y_H_
 #define WP_PARSER_Y_H_
 
+#include <AMReX_GpuQualifiers.H>
+
 #ifdef __cplusplus
 #include <cstdlib>
 extern "C" {
@@ -73,17 +75,22 @@ enum wp_node_t {
  * wp_node_t type can be safely checked to determine their real type.
  */
 
-union wp_vp {
-    double  v;
+union wp_ip {
+    int i;
     double* p;
 };
 
+union wp_vp {
+    double v;
+    union wp_ip ip;
+};
+
 struct wp_node {
     enum wp_node_t type;
     struct wp_node* l;
     struct wp_node* r;
     union wp_vp lvp;  // After optimization, this may store left value/pointer.
-    double* rp;       //                     this may store right      pointer.
+    union wp_ip rip;  //                     this may store right      pointer.
 };
 
 struct wp_number {
@@ -94,7 +101,7 @@ struct wp_number {
 struct wp_symbol {
     enum wp_node_t type;
     char* name;
-    double* pointer;
+    union wp_ip ip;
 };
 
 struct wp_f1 {  /* Builtin functions with one argument */
@@ -124,6 +131,7 @@ struct wp_node* wp_newf1 (enum wp_f1_t ftype, struct wp_node* l);
 struct wp_node* wp_newf2 (enum wp_f2_t ftype, struct wp_node* l,
                           struct wp_node* r);
 
+AMREX_GPU_HOST_DEVICE
 void yyerror (char const *s, ...);
 
 /*******************************************************************/
@@ -146,6 +154,7 @@ struct wp_parser* wp_parser_dup (struct wp_parser* source);
 struct wp_node* wp_parser_ast_dup (struct wp_parser* parser, struct wp_node* src, int move);
 
 void wp_parser_regvar (struct wp_parser* parser, char const* name, double* p);
+void wp_parser_regvar_gpu (struct wp_parser* parser, char const* name, int i);
 void wp_parser_setconst (struct wp_parser* parser, char const* name, double c);
 
 /* We need to walk the tree in these functions */
@@ -153,10 +162,11 @@ void wp_ast_optimize (struct wp_node* node);
 size_t wp_ast_size (struct wp_node* node);
 void wp_ast_print (struct wp_node* node);
 void wp_ast_regvar (struct wp_node* node, char const* name, double* p);
+void wp_ast_regvar_gpu (struct wp_node* node, char const* name, int i);
 void wp_ast_setconst (struct wp_node* node, char const* name, double c);
 
-double wp_call_f1 (enum wp_f1_t type, double a);
-double wp_call_f2 (enum wp_f2_t type, double a, double b);
+AMREX_GPU_HOST_DEVICE double wp_call_f1 (enum wp_f1_t type, double a);
+AMREX_GPU_HOST_DEVICE double wp_call_f2 (enum wp_f2_t type, double a, double b);
 
 #ifdef __cplusplus
 }
diff --git a/Source/Particles/Deposition/ChargeDeposition.H b/Source/Particles/Deposition/ChargeDeposition.H
new file mode 100755
index 000000000..a6573b7ab
--- /dev/null
+++ b/Source/Particles/Deposition/ChargeDeposition.H
@@ -0,0 +1,97 @@
+#ifndef CHARGEDEPOSITION_H_
+#define CHARGEDEPOSITION_H_
+
+#include "ShapeFactors.H"
+
+/* \brief Charge Deposition for thread thread_num
+ * /param xp, yp, zp   : Pointer to arrays of particle positions.
+ * \param wp           : Pointer to array of particle weights.
+ * \param rho_arr      : Array4 of charge density, either full array or tile.
+ * \param np_to_depose : Number of particles for which current is deposited.
+ * \param dx           : 3D cell size
+ * \param xyzmin       : Physical lower bounds of domain.
+ * \param lo           : Index lower bounds of domain.
+ * /param q            : species charge.
+ */
+template <int depos_order>
+void doChargeDepositionShapeN(const amrex::Real * const xp, 
+                              const amrex::Real * const yp, 
+                              const amrex::Real * const zp,
+                              const amrex::Real * const wp,
+                              const amrex::Array4<amrex::Real>& rho_arr,
+                              const long np_to_depose,
+                              const std::array<amrex::Real,3>& dx,
+                              const std::array<amrex::Real, 3> xyzmin,
+                              const amrex::Dim3 lo,
+                              const amrex::Real q)
+{
+    const amrex::Real dxi = 1.0/dx[0];
+    const amrex::Real dzi = 1.0/dx[2];
+#if (AMREX_SPACEDIM == 2)
+    const amrex::Real invvol = dxi*dzi;
+#elif (defined WARPX_DIM_3D)
+    const amrex::Real dyi = 1.0/dx[1];
+    const amrex::Real invvol = dxi*dyi*dzi;
+#endif
+
+    const amrex::Real xmin = xyzmin[0];
+    const amrex::Real ymin = xyzmin[1];
+    const amrex::Real zmin = xyzmin[2];
+
+    // Loop over particles and deposit into rho_arr
+    amrex::ParallelFor(
+        np_to_depose,
+        [=] AMREX_GPU_DEVICE (long ip) {
+            // --- Get particle quantities
+            const amrex::Real wq = q*wp[ip]*invvol;
+
+            // --- Compute shape factors
+            // x direction
+            // Get particle position in grid coordinates
+#if (defined WARPX_DIM_RZ)
+            const amrex::Real r = std::sqrt(xp[ip]*xp[ip] + yp[ip]*yp[ip]);
+            const amrex::Real x = (r - xmin)*dxi;
+#else
+            const amrex::Real x = (xp[ip] - xmin)*dxi;
+#endif
+            // Compute shape factors for node-centered quantities
+            amrex::Real AMREX_RESTRICT sx[depos_order + 1];
+            // i: leftmost grid point (node-centered) that the particle touches
+            const int i = compute_shape_factor<depos_order>(sx,  x);
+                     
+#if (defined WARPX_DIM_3D)
+            // y direction
+            const amrex::Real y = (yp[ip] - ymin)*dyi;
+            amrex::Real AMREX_RESTRICT sy[depos_order + 1];
+            const int j = compute_shape_factor<depos_order>(sy,  y);
+#endif
+            // z direction
+            const amrex::Real z = (zp[ip] - zmin)*dzi;
+            amrex::Real AMREX_RESTRICT sz[depos_order + 1];
+            const int k = compute_shape_factor<depos_order>(sz,  z);
+
+            // Deposit charge into rho_arr
+#if (defined WARPX_DIM_2D) || (defined WARPX_DIM_RZ)
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int ix=0; ix<=depos_order; ix++){
+                    amrex::Gpu::Atomic::Add(
+                        &rho_arr(lo.x+i+ix, lo.y+k+iz, 0), 
+                        sx[ix]*sz[iz]*wq);
+                }
+            }
+#elif (defined WARPX_DIM_3D)
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int iy=0; iy<=depos_order; iy++){
+                    for (int ix=0; ix<=depos_order; ix++){
+                        amrex::Gpu::Atomic::Add(
+                            &rho_arr(lo.x+i+ix, lo.y+j+iy, lo.z+k+iz),
+                            sx[ix]*sy[iy]*sz[iz]*wq);
+                    }
+                }
+            }
+#endif
+        }
+        );
+}
+
+#endif // CHARGEDEPOSITION_H_
diff --git a/Source/Particles/Deposition/CurrentDeposition.H b/Source/Particles/Deposition/CurrentDeposition.H
index 97bc53c20..4a392b57e 100644
--- a/Source/Particles/Deposition/CurrentDeposition.H
+++ b/Source/Particles/Deposition/CurrentDeposition.H
@@ -1,52 +1,7 @@
 #ifndef CURRENTDEPOSITION_H_
 #define CURRENTDEPOSITION_H_
 
-using namespace amrex;
-
-// Compute shape factor and return index of leftmost cell where
-// particle writes.
-// Specialized templates are defined below for orders 1, 2 and 3.
-template <int depos_order>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shape_factor(Real* const sx, Real xint);
-
-// Compute shape factor for order 1.
-template <>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shape_factor <1> (Real* const sx, Real xmid){
-    int j = (int) xmid;
-    Real xint = xmid-j;
-    sx[0] = 1.0 - xint;
-    sx[1] = xint;
-    return j;
-}
-
-// Compute shape factor for order 2.
-template <>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shape_factor <2> (Real* const sx, Real xmid){
-    int j = (int) (xmid+0.5);
-    Real xint = xmid-j;
-    sx[0] = 0.5*(0.5-xint)*(0.5-xint);
-    sx[1] = 0.75-xint*xint;
-    sx[2] = 0.5*(0.5+xint)*(0.5+xint);
-    // index of the leftmost cell where particle deposits
-    return j-1;
-}
-
-// Compute shape factor for order 3.
-template <>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shape_factor <3> (Real* const sx, Real xmid){
-    int j = (int) xmid;
-    Real xint = xmid-j;
-    sx[0] = 1.0/6.0*(1.0-xint)*(1.0-xint)*(1.0-xint);
-    sx[1] = 2.0/3.0-xint*xint*(1-xint/2.0);
-    sx[2] = 2.0/3.0-(1-xint)*(1-xint)*(1.0-0.5*(1-xint));
-    sx[3] = 1.0/6.0*xint*xint*xint;
-    // index of the leftmost cell where particle deposits
-    return j-1;
-}
+#include "ShapeFactors.H"
 
 /* \brief Current Deposition for thread thread_num
  * /param xp, yp, zp   : Pointer to arrays of particle positions.
@@ -55,9 +10,7 @@ int compute_shape_factor <3> (Real* const sx, Real xmid){
  * \param jx_arr       : Array4 of current density, either full array or tile.
  * \param jy_arr       : Array4 of current density, either full array or tile.
  * \param jz_arr       : Array4 of current density, either full array or tile.
- * \param offset       : Index of first particle for which current is deposited
  * \param np_to_depose : Number of particles for which current is deposited.
-                        Particles [offset,offset+np_tp_depose] deposit current.
  * \param dt           : Time step for particle level
  * \param dx           : 3D cell size
  * \param xyzmin       : Physical lower bounds of domain.
@@ -66,164 +19,144 @@ int compute_shape_factor <3> (Real* const sx, Real xmid){
  * /param q            : species charge.
  */
 template <int depos_order>
-void doDepositionShapeN(const Real * const xp, const Real * const yp, const Real * const zp,
-                        const Real * const wp, const Real * const uxp,
-                        const Real * const uyp, const Real * const uzp,
-                        const amrex::Array4<amrex::Real>& jx_arr, 
-                        const amrex::Array4<amrex::Real>& jy_arr, 
+void doDepositionShapeN(const amrex::Real * const xp, 
+                        const amrex::Real * const yp, 
+                        const amrex::Real * const zp,
+                        const amrex::Real * const wp,
+                        const amrex::Real * const uxp,
+                        const amrex::Real * const uyp,
+                        const amrex::Real * const uzp,
+                        const amrex::Array4<amrex::Real>& jx_arr,
+                        const amrex::Array4<amrex::Real>& jy_arr,
                         const amrex::Array4<amrex::Real>& jz_arr,
-                        const long offset, const long np_to_depose, 
-                        const amrex::Real dt, const std::array<amrex::Real,3>& dx,
-                        const std::array<Real, 3> xyzmin,
-                        const Dim3 lo,
+                        const long np_to_depose, const amrex::Real dt,
+                        const std::array<amrex::Real,3>& dx,
+                        const std::array<amrex::Real, 3> xyzmin,
+                        const amrex::Dim3 lo,
                         const amrex::Real stagger_shift, 
                         const amrex::Real q)
 {
-    const Real dxi = 1.0/dx[0];
-    const Real dzi = 1.0/dx[2];
-    const Real dts2dx = 0.5*dt*dxi;
-    const Real dts2dz = 0.5*dt*dzi;
+    const amrex::Real dxi = 1.0/dx[0];
+    const amrex::Real dzi = 1.0/dx[2];
+    const amrex::Real dts2dx = 0.5*dt*dxi;
+    const amrex::Real dts2dz = 0.5*dt*dzi;
 #if (AMREX_SPACEDIM == 2)
-    const Real invvol = dxi*dzi;
-#else // (AMREX_SPACEDIM == 3)
-    const Real dyi = 1.0/dx[1];
-    const Real dts2dy = 0.5*dt*dyi;
-    const Real invvol = dxi*dyi*dzi;
+    const amrex::Real invvol = dxi*dzi;
+#elif (defined WARPX_DIM_3D)
+    const amrex::Real dyi = 1.0/dx[1];
+    const amrex::Real dts2dy = 0.5*dt*dyi;
+    const amrex::Real invvol = dxi*dyi*dzi;
 #endif
 
-    const Real xmin = xyzmin[0];
-    const Real ymin = xyzmin[1];
-    const Real zmin = xyzmin[2];
-    const Real clightsq = 1.0/PhysConst::c/PhysConst::c;
+    const amrex::Real xmin = xyzmin[0];
+    const amrex::Real ymin = xyzmin[1];
+    const amrex::Real zmin = xyzmin[2];
+    const amrex::Real clightsq = 1.0/PhysConst::c/PhysConst::c;
 
     // Loop over particles and deposit into jx_arr, jy_arr and jz_arr
-    ParallelFor( np_to_depose,
-                 [=] AMREX_GPU_DEVICE (long ip) {
-                     // --- Get particle quantities
-                     const Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
-						       + uyp[ip]*uyp[ip]*clightsq
-						       + uzp[ip]*uzp[ip]*clightsq);
-                     const Real wq  = q*wp[ip];
-                     const Real vx  = uxp[ip]*gaminv;
-                     const Real vy  = uyp[ip]*gaminv;
-                     const Real vz  = uzp[ip]*gaminv;
-                     // wqx, wqy wqz are particle current in each direction 
-                     const Real wqx = wq*invvol*vx;
-                     const Real wqy = wq*invvol*vy;
-                     const Real wqz = wq*invvol*vz;
-
-                     // --- Compute shape factors
-                     // x direction
-                     // Get particle position after 1/2 push back in position
-                     const Real xmid = (xp[ip]-xmin)*dxi-dts2dx*vx;
-                     // Compute shape factors for node-centered quantities
-                     Real AMREX_RESTRICT sx [depos_order + 1];
-                     // j: leftmost grid point (node-centered) that the particle touches
-                     const int j  = compute_shape_factor<depos_order>(sx,  xmid);
-                     // Compute shape factors for cell-centered quantities
-                     Real AMREX_RESTRICT sx0[depos_order + 1];
-                     // j0: leftmost grid point (cell-centered) that the particle touches
-                     const int j0 = compute_shape_factor<depos_order>(sx0, xmid-stagger_shift);
+    amrex::ParallelFor(
+        np_to_depose,
+        [=] AMREX_GPU_DEVICE (long ip) {
+            // --- Get particle quantities
+            const amrex::Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
+                                                     + uyp[ip]*uyp[ip]*clightsq
+                                                     + uzp[ip]*uzp[ip]*clightsq);
+            const amrex::Real wq  = q*wp[ip];
+            const amrex::Real vx  = uxp[ip]*gaminv;
+            const amrex::Real vy  = uyp[ip]*gaminv;
+            const amrex::Real vz  = uzp[ip]*gaminv;
+            // wqx, wqy wqz are particle current in each direction 
+#if (defined WARPX_DIM_RZ)
+            // In RZ, wqx is actually wqr, and wqy is wqtheta
+            // Convert to cylinderical at the mid point
+            const amrex::Real xpmid = xp[ip] - 0.5*dt*vx;
+            const amrex::Real ypmid = yp[ip] - 0.5*dt*vy;
+            const amrex::Real rpmid = std::sqrt(xpmid*xpmid + ypmid*ypmid);
+            amrex::Real costheta;
+            amrex::Real sintheta;
+            if (rpmid > 0.) {
+                costheta = xpmid/rpmid;
+                sintheta = ypmid/rpmid;
+            } else {
+                costheta = 1.;
+                sintheta = 0.;
+            }
+            const amrex::Real wqx = wq*invvol*(+vx*costheta + vy*sintheta);
+            const amrex::Real wqy = wq*invvol*(-vx*sintheta + vy*costheta);
+#else
+            const amrex::Real wqx = wq*invvol*vx;
+            const amrex::Real wqy = wq*invvol*vy;
+#endif
+            const amrex::Real wqz = wq*invvol*vz;
+
+            // --- Compute shape factors
+            // x direction
+            // Get particle position after 1/2 push back in position
+#if (defined WARPX_DIM_RZ)
+            const amrex::Real xmid = (rpmid-xmin)*dxi;
+#else
+            const amrex::Real xmid = (xp[ip]-xmin)*dxi-dts2dx*vx;
+#endif
+            // Compute shape factors for node-centered quantities
+            amrex::Real AMREX_RESTRICT sx [depos_order + 1];
+            // j: leftmost grid point (node-centered) that the particle touches
+            const int j  = compute_shape_factor<depos_order>(sx,  xmid);
+            // Compute shape factors for cell-centered quantities
+            amrex::Real AMREX_RESTRICT sx0[depos_order + 1];
+            // j0: leftmost grid point (cell-centered) that the particle touches
+            const int j0 = compute_shape_factor<depos_order>(sx0, xmid-stagger_shift);
                      
-#if (AMREX_SPACEDIM == 3)
-                     // y direction
-                     const Real ymid= (yp[ip]-ymin)*dyi-dts2dy*vy;
-                     Real AMREX_RESTRICT sy [depos_order + 1];
-		     const int k  = compute_shape_factor<depos_order>(sy,  ymid);
-                     Real AMREX_RESTRICT sy0[depos_order + 1];
-		     const int k0 = compute_shape_factor<depos_order>(sy0, ymid-stagger_shift);
+#if (defined WARPX_DIM_3D)
+            // y direction
+            const amrex::Real ymid= (yp[ip]-ymin)*dyi-dts2dy*vy;
+            amrex::Real AMREX_RESTRICT sy [depos_order + 1];
+            const int k  = compute_shape_factor<depos_order>(sy,  ymid);
+            amrex::Real AMREX_RESTRICT sy0[depos_order + 1];
+            const int k0 = compute_shape_factor<depos_order>(sy0, ymid-stagger_shift);
 #endif
-                     // z direction
-                     const Real zmid= (zp[ip]-zmin)*dzi-dts2dz*vz;
-                     Real AMREX_RESTRICT sz [depos_order + 1];
-		     const int l  = compute_shape_factor<depos_order>(sz,  zmid);
-                     Real AMREX_RESTRICT sz0[depos_order + 1];
-		     const int l0 = compute_shape_factor<depos_order>(sz0, zmid-stagger_shift);
-
-                     // Deposit current into jx_arr, jy_arr and jz_arr
-#if (AMREX_SPACEDIM == 2)
-                     for (int iz=0; iz<=depos_order; iz++){
-                         for (int ix=0; ix<=depos_order; ix++){
-                             amrex::Gpu::Atomic::Add(
-                                 &jx_arr(lo.x+j0+ix, lo.y+l +iz, 0), 
-                                 sx0[ix]*sz [iz]*wqx);
-                             amrex::Gpu::Atomic::Add(
-                                 &jy_arr(lo.x+j +ix, lo.y+l +iz, 0), 
-                                 sx [ix]*sz [iz]*wqy);
-                             amrex::Gpu::Atomic::Add(
-                                 &jz_arr(lo.x+j +ix, lo.y+l0+iz, 0), 
-                                 sx [ix]*sz0[iz]*wqz);
-                         }
-                     }
-#else // (AMREX_SPACEDIM == 3)
-                     for (int iz=0; iz<=depos_order; iz++){
-                         for (int iy=0; iy<=depos_order; iy++){
-                             for (int ix=0; ix<=depos_order; ix++){
-                                 amrex::Gpu::Atomic::Add(
-                                     &jx_arr(lo.x+j0+ix, lo.y+k +iy, lo.z+l +iz),
-                                     sx0[ix]*sy [iy]*sz [iz]*wqx);
-                                 amrex::Gpu::Atomic::Add(
-                                     &jy_arr(lo.x+j +ix, lo.y+k0+iy, lo.z+l +iz), 
-                                     sx [ix]*sy0[iy]*sz [iz]*wqy);
-                                 amrex::Gpu::Atomic::Add(
-                                     &jz_arr(lo.x+j +ix, lo.y+k +iy, lo.z+l0+iz),
-                                     sx [ix]*sy [iy]*sz0[iz]*wqz);
-                             }
-                         }
-                     }
+            // z direction
+            const amrex::Real zmid= (zp[ip]-zmin)*dzi-dts2dz*vz;
+            amrex::Real AMREX_RESTRICT sz [depos_order + 1];
+            const int l  = compute_shape_factor<depos_order>(sz,  zmid);
+            amrex::Real AMREX_RESTRICT sz0[depos_order + 1];
+            const int l0 = compute_shape_factor<depos_order>(sz0, zmid-stagger_shift);
+
+            // Deposit current into jx_arr, jy_arr and jz_arr
+#if (defined WARPX_DIM_2D) || (defined WARPX_DIM_RZ)
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int ix=0; ix<=depos_order; ix++){
+                    amrex::Gpu::Atomic::Add(
+                        &jx_arr(lo.x+j0+ix, lo.y+l +iz, 0), 
+                        sx0[ix]*sz [iz]*wqx);
+                    amrex::Gpu::Atomic::Add(
+                        &jy_arr(lo.x+j +ix, lo.y+l +iz, 0), 
+                        sx [ix]*sz [iz]*wqy);
+                    amrex::Gpu::Atomic::Add(
+                        &jz_arr(lo.x+j +ix, lo.y+l0+iz, 0), 
+                        sx [ix]*sz0[iz]*wqz);
+                }
+            }
+#elif (defined WARPX_DIM_3D)
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int iy=0; iy<=depos_order; iy++){
+                    for (int ix=0; ix<=depos_order; ix++){
+                        amrex::Gpu::Atomic::Add(
+                            &jx_arr(lo.x+j0+ix, lo.y+k +iy, lo.z+l +iz),
+                            sx0[ix]*sy [iy]*sz [iz]*wqx);
+                        amrex::Gpu::Atomic::Add(
+                            &jy_arr(lo.x+j +ix, lo.y+k0+iy, lo.z+l +iz), 
+                            sx [ix]*sy0[iy]*sz [iz]*wqy);
+                        amrex::Gpu::Atomic::Add(
+                            &jz_arr(lo.x+j +ix, lo.y+k +iy, lo.z+l0+iz),
+                            sx [ix]*sy [iy]*sz0[iz]*wqz);
+                    }
+                }
+            }
 #endif
-                 }
+        }
         );
 }
 
-// Compute shape factor and return index of leftmost cell where
-// particle writes.
-// Specialized templates are defined below for orders 1, 2 and 3.
-template <int depos_order>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shifted_shape_factor (Real* const sx, const Real x_old, const int i_new);
-
-// Compute shape factor for order 1.
-template <>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shifted_shape_factor <1> (Real* const sx, const Real x_old, const int i_new){
-    const int i = (int) x_old;
-    const int i_shift = i - i_new;
-    const Real xint = x_old - i;
-    sx[1+i_shift] = 1.0 - xint;
-    sx[2+i_shift] = xint;
-    return i;
-}
-
-// Compute shape factor for order 2.
-template <>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shifted_shape_factor <2> (Real* const sx, const Real x_old, const int i_new){
-    const int i = (int) (x_old+0.5);
-    const int i_shift = i - (i_new + 1);
-    const Real xint = x_old - i;
-    sx[1+i_shift] = 0.5*(0.5-xint)*(0.5-xint);
-    sx[2+i_shift] = 0.75-xint*xint;
-    sx[3+i_shift] = 0.5*(0.5+xint)*(0.5+xint);
-    // index of the leftmost cell where particle deposits
-    return i-1;
-}
-
-// Compute shape factor for order 3.
-template <>
-AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
-int compute_shifted_shape_factor <3> (Real* const sx, const Real x_old, const int i_new){
-    const int i = (int) x_old;
-    const int i_shift = i - (i_new + 1);
-    const Real xint = x_old - i;
-    sx[1+i_shift] = 1.0/6.0*(1.0-xint)*(1.0-xint)*(1.0-xint);
-    sx[2+i_shift] = 2.0/3.0-xint*xint*(1-xint/2.0);
-    sx[3+i_shift] = 2.0/3.0-(1-xint)*(1-xint)*(1.0-0.5*(1-xint));
-    sx[4+i_shift] = 1.0/6.0*xint*xint*xint;
-    // index of the leftmost cell where particle deposits
-    return i-1;
-}
-
 /* \brief Esirkepov Current Deposition for thread thread_num
  * /param xp, yp, zp   : Pointer to arrays of particle positions.
  * \param wp           : Pointer to array of particle weights.
@@ -239,170 +172,197 @@ int compute_shifted_shape_factor <3> (Real* const sx, const Real x_old, const in
  * /param q            : species charge.
  */
 template <int depos_order>
-void doEsirkepovDepositionShapeN (const Real * const xp, const Real * const yp, const Real * const zp,
-                                  const Real * const wp, const Real * const uxp,
-                                  const Real * const uyp, const Real * const uzp,
+void doEsirkepovDepositionShapeN (const amrex::Real * const xp,
+                                  const amrex::Real * const yp,
+                                  const amrex::Real * const zp,
+                                  const amrex::Real * const wp,
+                                  const amrex::Real * const uxp,
+                                  const amrex::Real * const uyp,
+                                  const amrex::Real * const uzp,
                                   const amrex::Array4<amrex::Real>& Jx_arr,
                                   const amrex::Array4<amrex::Real>& Jy_arr,
                                   const amrex::Array4<amrex::Real>& Jz_arr,
                                   const long np_to_depose,
-                                  const amrex::Real dt, const std::array<amrex::Real,3>& dx,
-                                  const std::array<Real, 3> xyzmin,
-                                  const Dim3 lo,
+                                  const amrex::Real dt,
+                                  const std::array<amrex::Real,3>& dx,
+                                  const std::array<amrex::Real, 3> xyzmin,
+                                  const amrex::Dim3 lo,
                                   const amrex::Real q)
 {
-    const Real dxi = 1.0/dx[0];
-    const Real dtsdx0 = dt*dxi;
-    const Real xmin = xyzmin[0];
-#if (AMREX_SPACEDIM == 3)
-    const Real dyi = 1.0/dx[1];
-    const Real dtsdy0 = dt*dyi;
-    const Real ymin = xyzmin[1];
+    const amrex::Real dxi = 1.0/dx[0];
+    const amrex::Real dtsdx0 = dt*dxi;
+    const amrex::Real xmin = xyzmin[0];
+#if (defined WARPX_DIM_3D)
+    const amrex::Real dyi = 1.0/dx[1];
+    const amrex::Real dtsdy0 = dt*dyi;
+    const amrex::Real ymin = xyzmin[1];
 #endif
-    const Real dzi = 1.0/dx[2];
-    const Real dtsdz0 = dt*dzi;
-    const Real zmin = xyzmin[2];
-
-#if (AMREX_SPACEDIM == 3)
-    const Real invdtdx = 1.0/(dt*dx[1]*dx[2]);
-    const Real invdtdy = 1.0/(dt*dx[0]*dx[2]);
-    const Real invdtdz = 1.0/(dt*dx[0]*dx[1]);
-#elif (AMREX_SPACEDIM == 2)
-    const Real invdtdx = 1.0/(dt*dx[2]);
-    const Real invdtdz = 1.0/(dt*dx[0]);
-    const Real invvol = 1.0/(dx[0]*dx[2]);
+    const amrex::Real dzi = 1.0/dx[2];
+    const amrex::Real dtsdz0 = dt*dzi;
+    const amrex::Real zmin = xyzmin[2];
+
+#if (defined WARPX_DIM_3D)
+    const amrex::Real invdtdx = 1.0/(dt*dx[1]*dx[2]);
+    const amrex::Real invdtdy = 1.0/(dt*dx[0]*dx[2]);
+    const amrex::Real invdtdz = 1.0/(dt*dx[0]*dx[1]);
+#elif (defined WARPX_DIM_2D) || (defined WARPX_DIM_RZ)
+    const amrex::Real invdtdx = 1.0/(dt*dx[2]);
+    const amrex::Real invdtdz = 1.0/(dt*dx[0]);
+    const amrex::Real invvol = 1.0/(dx[0]*dx[2]);
 #endif
 
-    const Real clightsq = 1.0/PhysConst::c/PhysConst::c;
+    const amrex::Real clightsq = 1.0/PhysConst::c/PhysConst::c;
 
     // Loop over particles and deposit into Jx_arr, Jy_arr and Jz_arr
-    ParallelFor( np_to_depose,
-                 [=] AMREX_GPU_DEVICE (long ip) {
-
-                     // --- Get particle quantities
-                     const Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
-                                                           + uyp[ip]*uyp[ip]*clightsq
-                                                           + uzp[ip]*uzp[ip]*clightsq);
-
-                     // wqx, wqy wqz are particle current in each direction
-                     const Real wq = q*wp[ip];
-                     const Real wqx = wq*invdtdx;
-#if (AMREX_SPACEDIM == 3)
-                     const Real wqy = wq*invdtdy;
+    amrex::ParallelFor( 
+        np_to_depose,
+        [=] AMREX_GPU_DEVICE (long ip) {
+
+            // --- Get particle quantities
+            const amrex::Real gaminv = 1.0/std::sqrt(1.0 + uxp[ip]*uxp[ip]*clightsq
+                                                         + uyp[ip]*uyp[ip]*clightsq
+                                                         + uzp[ip]*uzp[ip]*clightsq);
+
+            // wqx, wqy wqz are particle current in each direction
+            const amrex::Real wq = q*wp[ip];
+            const amrex::Real wqx = wq*invdtdx;
+#if (defined WARPX_DIM_3D)
+            const amrex::Real wqy = wq*invdtdy;
 #endif
-                     const Real wqz = wq*invdtdz;
-
-                     // computes current and old position in grid units
-                     const Real x_new = (xp[ip] - xmin)*dxi;
-                     const Real x_old = x_new - dtsdx0*uxp[ip]*gaminv;
-#if (AMREX_SPACEDIM == 3)        
-                     const Real y_new = (yp[ip] - ymin)*dyi;
-                     const Real y_old = y_new - dtsdy0*uyp[ip]*gaminv;
+            const amrex::Real wqz = wq*invdtdz;
+
+            // computes current and old position in grid units
+#if (defined WARPX_DIM_RZ)
+            const amrex::Real r_new = std::sqrt(xp[ip]*xp[ip] + yp[ip]*yp[ip]);
+            const amrex::Real r_old = std::sqrt((xp[ip] - dt*uxp[ip]*gaminv)*(xp[ip] - dt*uxp[ip]*gaminv) +
+                                                (yp[ip] - dt*uyp[ip]*gaminv)*(yp[ip] - dt*uyp[ip]*gaminv));
+            const amrex::Real x_new = (r_new - xmin)*dxi;
+            const amrex::Real x_old = (r_old - xmin)*dxi;
+#else
+            const amrex::Real x_new = (xp[ip] - xmin)*dxi;
+            const amrex::Real x_old = x_new - dtsdx0*uxp[ip]*gaminv;
 #endif
-                     const Real z_new = (zp[ip] - zmin)*dzi;
-                     const Real z_old = z_new - dtsdz0*uzp[ip]*gaminv;
-
-                     // Shape factor arrays
-                     // Note that there are extra values above and below
-                     // to possibly hold the factor for the old particle
-                     // which can be at a different grid location.
-                     Real AMREX_RESTRICT sx_new[depos_order + 3] = {0.};
-                     Real AMREX_RESTRICT sx_old[depos_order + 3] = {0.};
-#if (AMREX_SPACEDIM == 3)
-                     Real AMREX_RESTRICT sy_new[depos_order + 3] = {0.};
-                     Real AMREX_RESTRICT sy_old[depos_order + 3] = {0.};
+#if (defined WARPX_DIM_3D)        
+            const amrex::Real y_new = (yp[ip] - ymin)*dyi;
+            const amrex::Real y_old = y_new - dtsdy0*uyp[ip]*gaminv;
+#endif
+            const amrex::Real z_new = (zp[ip] - zmin)*dzi;
+            const amrex::Real z_old = z_new - dtsdz0*uzp[ip]*gaminv;
+
+#if (defined WARPX_DIM_RZ)
+            amrex::Real costheta;
+            amrex::Real sintheta;
+            if (r_new > 0.) {
+                costheta = xp[ip]/r_new;
+                sintheta = yp[ip]/r_new;
+            } else {
+                costheta = 1.;
+                sintheta = 0.;
+            }
+            const amrex::Real vy = (-uxp[ip]*sintheta + uyp[ip]*costheta)*gaminv;
+#elif (defined WARPX_DIM_2D)
+            const amrex::Real vy = uyp[ip]*gaminv;
 #endif
-                     Real AMREX_RESTRICT sz_new[depos_order + 3] = {0.};
-                     Real AMREX_RESTRICT sz_old[depos_order + 3] = {0.};
 
-                     // --- Compute shape factors
-                     // Compute shape factors for position as they are now and at old positions
-                     // [ijk]_new: leftmost grid point that the particle touches
-                     const int i_new = compute_shape_factor<depos_order>(sx_new+1, x_new);
-                     const int i_old = compute_shifted_shape_factor<depos_order>(sx_old, x_old, i_new);
-#if (AMREX_SPACEDIM == 3)
-                     const int j_new = compute_shape_factor<depos_order>(sy_new+1, y_new);
-                     const int j_old = compute_shifted_shape_factor<depos_order>(sy_old, y_old, j_new);
+            // Shape factor arrays
+            // Note that there are extra values above and below
+            // to possibly hold the factor for the old particle
+            // which can be at a different grid location.
+            amrex::Real AMREX_RESTRICT sx_new[depos_order + 3] = {0.};
+            amrex::Real AMREX_RESTRICT sx_old[depos_order + 3] = {0.};
+#if (defined WARPX_DIM_3D)
+            amrex::Real AMREX_RESTRICT sy_new[depos_order + 3] = {0.};
+            amrex::Real AMREX_RESTRICT sy_old[depos_order + 3] = {0.};
+#endif
+            amrex::Real AMREX_RESTRICT sz_new[depos_order + 3] = {0.};
+            amrex::Real AMREX_RESTRICT sz_old[depos_order + 3] = {0.};
+
+            // --- Compute shape factors
+            // Compute shape factors for position as they are now and at old positions
+            // [ijk]_new: leftmost grid point that the particle touches
+            const int i_new = compute_shape_factor<depos_order>(sx_new+1, x_new);
+            const int i_old = compute_shifted_shape_factor<depos_order>(sx_old, x_old, i_new);
+#if (defined WARPX_DIM_3D)
+            const int j_new = compute_shape_factor<depos_order>(sy_new+1, y_new);
+            const int j_old = compute_shifted_shape_factor<depos_order>(sy_old, y_old, j_new);
 #endif 
-                     const int k_new = compute_shape_factor<depos_order>(sz_new+1, z_new);
-                     const int k_old = compute_shifted_shape_factor<depos_order>(sz_old, z_old, k_new);
-
-                     // computes min/max positions of current contributions
-                     int dil = 1, diu = 1;
-                     if (i_old < i_new) dil = 0;
-                     if (i_old > i_new) diu = 0;
-#if (AMREX_SPACEDIM == 3)
-                     int djl = 1, dju = 1;
-                     if (j_old < j_new) djl = 0;
-                     if (j_old > j_new) dju = 0;
+            const int k_new = compute_shape_factor<depos_order>(sz_new+1, z_new);
+            const int k_old = compute_shifted_shape_factor<depos_order>(sz_old, z_old, k_new);
+
+            // computes min/max positions of current contributions
+            int dil = 1, diu = 1;
+            if (i_old < i_new) dil = 0;
+            if (i_old > i_new) diu = 0;
+#if (defined WARPX_DIM_3D)
+            int djl = 1, dju = 1;
+            if (j_old < j_new) djl = 0;
+            if (j_old > j_new) dju = 0;
 #endif
-                     int dkl = 1, dku = 1;
-                     if (k_old < k_new) dkl = 0;
-                     if (k_old > k_new) dku = 0;
-
-#if (AMREX_SPACEDIM == 3)
-
-                     for (int k=dkl; k<=depos_order+2-dku; k++) {
-                         for (int j=djl; j<=depos_order+2-dju; j++) {
-                             Real sdxi = 0.;
-                             for (int i=dil; i<=depos_order+1-diu; i++) {
-                                 sdxi += wqx*(sx_old[i] - sx_new[i])*((sy_new[j] + 0.5*(sy_old[j] - sy_new[j]))*sz_new[k] +
-                                         (0.5*sy_new[j] + 1./3.*(sy_old[j] - sy_new[j]))*(sz_old[k] - sz_new[k]));
-                                 amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
-                             }
-                         }
-                     }
-                     for (int k=dkl; k<=depos_order+2-dku; k++) {
-                         for (int i=dil; i<=depos_order+2-diu; i++) {
-                             Real sdyj = 0.;
-                             for (int j=djl; j<=depos_order+1-dju; j++) {
-                                 sdyj += wqy*(sy_old[j] - sy_new[j])*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
-                                         (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
-                                 amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
-                             }
-                         }
-                     }
-                     for (int j=djl; j<=depos_order+2-dju; j++) {
-                         for (int i=dil; i<=depos_order+2-diu; i++) {
-                             Real sdzk = 0.;
-                             for (int k=dkl; k<=depos_order+1-dku; k++) {
-                                 sdzk += wqz*(sz_old[k] - sz_new[k])*((sx_new[i] + 0.5*(sx_old[i] - sx_new[i]))*sy_new[j] +
-                                         (0.5*sx_new[i] + 1./3.*(sx_old[i] - sx_new[i]))*(sy_old[j] - sy_new[j]));
-                                 amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
-                             }
-                         }
-                     }
-
-#elif (AMREX_SPACEDIM == 2)
-
-                    for (int k=dkl; k<=depos_order+2-dku; k++) {
-                        Real sdxi = 0.;
-                        for (int i=dil; i<=depos_order+1-diu; i++) {
-                            sdxi += wqx*(sx_old[i] - sx_new[i])*(sz_new[k] + 0.5*(sz_old[k] - sz_new[k]));
-                            amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdxi);
-                        }
+            int dkl = 1, dku = 1;
+            if (k_old < k_new) dkl = 0;
+            if (k_old > k_new) dku = 0;
+
+#if (defined WARPX_DIM_3D)
+
+            for (int k=dkl; k<=depos_order+2-dku; k++) {
+                for (int j=djl; j<=depos_order+2-dju; j++) {
+                    amrex::Real sdxi = 0.;
+                    for (int i=dil; i<=depos_order+1-diu; i++) {
+                        sdxi += wqx*(sx_old[i] - sx_new[i])*((sy_new[j] + 0.5*(sy_old[j] - sy_new[j]))*sz_new[k] +
+                                                             (0.5*sy_new[j] + 1./3.*(sy_old[j] - sy_new[j]))*(sz_old[k] - sz_new[k]));
+                        amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdxi);
                     }
-                    for (int k=dkl; k<=depos_order+2-dku; k++) {
-                        for (int i=dil; i<=depos_order+2-diu; i++) {
-                            const Real sdyj = wq*uyp[ip]*gaminv*invvol*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
-                                        (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
-                            amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdyj);
-                        }
+                }
+            }
+            for (int k=dkl; k<=depos_order+2-dku; k++) {
+                for (int i=dil; i<=depos_order+2-diu; i++) {
+                    amrex::Real sdyj = 0.;
+                    for (int j=djl; j<=depos_order+1-dju; j++) {
+                        sdyj += wqy*(sy_old[j] - sy_new[j])*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
+                                                             (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
+                        amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdyj);
                     }
-                    for (int i=dil; i<=depos_order+2-diu; i++) {
-                        Real sdzk = 0.;
-                        for (int k=dkl; k<=depos_order+1-dku; k++) {
-                            sdzk += wqz*(sz_old[k] - sz_new[k])*(sx_new[i] + 0.5*(sx_old[i] - sx_new[i]));
-                            amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdzk);
-                        }
+                }
+            }
+            for (int j=djl; j<=depos_order+2-dju; j++) {
+                for (int i=dil; i<=depos_order+2-diu; i++) {
+                    amrex::Real sdzk = 0.;
+                    for (int k=dkl; k<=depos_order+1-dku; k++) {
+                        sdzk += wqz*(sz_old[k] - sz_new[k])*((sx_new[i] + 0.5*(sx_old[i] - sx_new[i]))*sy_new[j] +
+                                                             (0.5*sx_new[i] + 1./3.*(sx_old[i] - sx_new[i]))*(sy_old[j] - sy_new[j]));
+                        amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+j_new-1+j, lo.z+k_new-1+k), sdzk);
                     }
+                }
+            }
+
+#elif (defined WARPX_DIM_2D) || (defined WARPX_DIM_RZ)
+
+            for (int k=dkl; k<=depos_order+2-dku; k++) {
+                amrex::Real sdxi = 0.;
+                for (int i=dil; i<=depos_order+1-diu; i++) {
+                    sdxi += wqx*(sx_old[i] - sx_new[i])*(sz_new[k] + 0.5*(sz_old[k] - sz_new[k]));
+                    amrex::Gpu::Atomic::Add( &Jx_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdxi);
+                }
+            }
+            for (int k=dkl; k<=depos_order+2-dku; k++) {
+                for (int i=dil; i<=depos_order+2-diu; i++) {
+                    const amrex::Real sdyj = wq*vy*invvol*((sz_new[k] + 0.5*(sz_old[k] - sz_new[k]))*sx_new[i] +
+                                                           (0.5*sz_new[k] + 1./3.*(sz_old[k] - sz_new[k]))*(sx_old[i] - sx_new[i]));
+                    amrex::Gpu::Atomic::Add( &Jy_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdyj);
+                }
+            }
+            for (int i=dil; i<=depos_order+2-diu; i++) {
+                amrex::Real sdzk = 0.;
+                for (int k=dkl; k<=depos_order+1-dku; k++) {
+                    sdzk += wqz*(sz_old[k] - sz_new[k])*(sx_new[i] + 0.5*(sx_old[i] - sx_new[i]));
+                    amrex::Gpu::Atomic::Add( &Jz_arr(lo.x+i_new-1+i, lo.y+k_new-1+k, 0), sdzk);
+                }
+            }
+
 
 #endif
-                 }
+        }
         );
-
-
-
 }
 
 #endif // CURRENTDEPOSITION_H_
diff --git a/Source/Particles/Deposition/Make.package b/Source/Particles/Deposition/Make.package
index 0d5ebe2a7..e1aace998 100644
--- a/Source/Particles/Deposition/Make.package
+++ b/Source/Particles/Deposition/Make.package
@@ -1,3 +1,4 @@
 CEXE_headers += CurrentDeposition.H
+CEXE_headers += ChargeDeposition.H
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Deposition
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles/Deposition
diff --git a/Source/Particles/Gather/FieldGather.H b/Source/Particles/Gather/FieldGather.H
new file mode 100644
index 000000000..8f5e8d4cf
--- /dev/null
+++ b/Source/Particles/Gather/FieldGather.H
@@ -0,0 +1,216 @@
+#ifndef FIELDGATHER_H_
+#define FIELDGATHER_H_
+
+#include "ShapeFactors.H"
+
+/* \brief Field gather for particles handled by thread thread_num
+ * /param xp, yp, zp   : Pointer to arrays of particle positions.
+ * \param Exp, Eyp, Ezp: Pointer to array of electric field on particles.
+ * \param Bxp, Byp, Bzp: Pointer to array of magnetic field on particles.
+ * \param ex_arr ey_arr: Array4 of current density, either full array or tile.
+ * \param ez_arr bx_arr: Array4 of current density, either full array or tile.
+ * \param by_arr bz_arr: Array4 of current density, either full array or tile.
+ * \param np_to_gather : Number of particles for which field is gathered.
+ * \param dx           : 3D cell size
+ * \param xyzmin       : Physical lower bounds of domain.
+ * \param lo           : Index lower bounds of domain.
+ * \param stagger_shift: 0 if nodal, 0.5 if staggered.
+ */
+template <int depos_order, int lower_in_v>
+void doGatherShapeN(const amrex::Real * const xp,
+                    const amrex::Real * const yp,
+                    const amrex::Real * const zp,
+                    amrex::Real * const Exp, amrex::Real * const Eyp,
+                    amrex::Real * const Ezp, amrex::Real * const Bxp,
+                    amrex::Real * const Byp, amrex::Real * const Bzp,
+                    const amrex::Array4<const amrex::Real>& ex_arr,
+                    const amrex::Array4<const amrex::Real>& ey_arr,
+                    const amrex::Array4<const amrex::Real>& ez_arr,
+                    const amrex::Array4<const amrex::Real>& bx_arr,
+                    const amrex::Array4<const amrex::Real>& by_arr,
+                    const amrex::Array4<const amrex::Real>& bz_arr,
+                    const long np_to_gather,
+                    const std::array<amrex::Real, 3>& dx,
+                    const std::array<amrex::Real, 3> xyzmin,
+                    const amrex::Dim3 lo,
+                    const amrex::Real stagger_shift)
+{
+    const amrex::Real dxi = 1.0/dx[0];
+    const amrex::Real dzi = 1.0/dx[2];
+#if (AMREX_SPACEDIM == 3)
+    const amrex::Real dyi = 1.0/dx[1];
+#endif
+
+    const amrex::Real xmin = xyzmin[0];
+#if (AMREX_SPACEDIM == 3)
+    const amrex::Real ymin = xyzmin[1];
+#endif
+    const amrex::Real zmin = xyzmin[2];
+
+    // Loop over particles and gather fields from
+    // {e,b}{x,y,z}_arr to {E,B}{xyz}p.
+    amrex::ParallelFor(
+        np_to_gather,
+        [=] AMREX_GPU_DEVICE (long ip) {
+            // --- Compute shape factors
+            // x direction
+            // Get particle position
+#ifdef WARPX_DIM_RZ
+            const amrex::Real r = std::sqrt(xp[ip]*xp[ip] + yp[ip]*yp[ip]);
+            const amrex::Real x = (r - xmin)*dxi;
+#else
+            const amrex::Real x = (xp[ip]-xmin)*dxi;
+#endif
+            // Compute shape factors for node-centered quantities
+            amrex::Real AMREX_RESTRICT sx [depos_order + 1];
+            // j: leftmost grid point (node-centered) that particle touches
+            const int j  = compute_shape_factor<depos_order>(sx, x);
+            // Compute shape factors for cell-centered quantities
+            amrex::Real AMREX_RESTRICT sx0[depos_order + 1 - lower_in_v];
+            // j0: leftmost grid point (cell-centered) that particle touches
+            const int j0 = compute_shape_factor<depos_order - lower_in_v>(
+                sx0, x-stagger_shift);
+#if (AMREX_SPACEDIM == 3)
+            // y direction
+            const amrex::Real y = (yp[ip]-ymin)*dyi;
+            amrex::Real AMREX_RESTRICT sy [depos_order + 1];
+            const int k  = compute_shape_factor<depos_order>(sy, y);
+            amrex::Real AMREX_RESTRICT sy0[depos_order + 1 - lower_in_v];
+            const int k0 = compute_shape_factor<depos_order-lower_in_v>(
+                sy0, y-stagger_shift);
+#endif
+            // z direction
+            const amrex::Real z = (zp[ip]-zmin)*dzi;
+            amrex::Real AMREX_RESTRICT sz [depos_order + 1];
+            const int l  = compute_shape_factor<depos_order>(sz, z);
+            amrex::Real AMREX_RESTRICT sz0[depos_order + 1 - lower_in_v];
+            const int l0 = compute_shape_factor<depos_order - lower_in_v>(
+                sz0, z-stagger_shift);
+
+            // Set fields on particle to zero
+            Exp[ip] = 0;
+            Eyp[ip] = 0;
+            Ezp[ip] = 0;
+            Bxp[ip] = 0;
+            Byp[ip] = 0;
+            Bzp[ip] = 0;
+            // Each field is gathered in a separate block of 
+            // AMREX_SPACEDIM nested loops because the deposition
+            // order can differ for each component of each field
+            // when lower_in_v is set to 1
+#if (AMREX_SPACEDIM == 2)
+            // Gather field on particle Eyp[i] from field on grid ey_arr
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int ix=0; ix<=depos_order; ix++){
+                    Eyp[ip] += sx[ix]*sz[iz]*
+                        ey_arr(lo.x+j+ix, lo.y+l+iz, 0);
+                }
+            }
+            // Gather field on particle Exp[i] from field on grid ex_arr
+            // Gather field on particle Bzp[i] from field on grid bz_arr
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int ix=0; ix<=depos_order-lower_in_v; ix++){
+                    Exp[ip] += sx0[ix]*sz[iz]*
+                        ex_arr(lo.x+j0+ix, lo.y+l +iz, 0);
+                    Bzp[ip] += sx0[ix]*sz[iz]*
+                        bz_arr(lo.x+j0+ix, lo.y+l +iz, 0);
+                }
+            }
+            // Gather field on particle Ezp[i] from field on grid ez_arr
+            // Gather field on particle Bxp[i] from field on grid bx_arr
+            for (int iz=0; iz<=depos_order-lower_in_v; iz++){
+                for (int ix=0; ix<=depos_order; ix++){
+                    Ezp[ip] += sx[ix]*sz0[iz]*
+                        ez_arr(lo.x+j+ix, lo.y+l0 +iz, 0);
+                    Bxp[ip] += sx[ix]*sz0[iz]*
+                        bx_arr(lo.x+j+ix, lo.y+l0 +iz, 0);
+                }
+            }
+            // Gather field on particle Byp[i] from field on grid by_arr
+            for (int iz=0; iz<=depos_order-lower_in_v; iz++){
+                for (int ix=0; ix<=depos_order-lower_in_v; ix++){
+                    Byp[ip] += sx0[ix]*sz0[iz]*
+                        by_arr(lo.x+j0+ix, lo.y+l0+iz, 0);
+                }
+            }
+
+#ifdef WARPX_DIM_RZ
+            // Convert Exp and Eyp (which are actually Er and Etheta) to Ex and Ey
+            amrex::Real costheta;
+            amrex::Real sintheta;
+            if (r > 0.) {
+                costheta = xp[ip]/r;
+                sintheta = yp[ip]/r;
+            } else {
+                costheta = 1.;
+                sintheta = 0.;
+            }
+            const amrex::Real Exp_save = Exp[ip];
+            Exp[ip] = costheta*Exp[ip] - sintheta*Eyp[ip];
+            Eyp[ip] = costheta*Eyp[ip] + sintheta*Exp_save;
+            const amrex::Real Bxp_save = Bxp[ip];
+            Bxp[ip] = costheta*Bxp[ip] - sintheta*Byp[ip];
+            Byp[ip] = costheta*Byp[ip] + sintheta*Bxp_save;
+#endif
+
+#else // (AMREX_SPACEDIM == 3)
+            // Gather field on particle Exp[i] from field on grid ex_arr
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int iy=0; iy<=depos_order; iy++){
+                    for (int ix=0; ix<=depos_order-lower_in_v; ix++){
+                        Exp[ip] += sx0[ix]*sy[iy]*sz[iz]*
+                            ex_arr(lo.x+j0+ix, lo.y+k+iy, lo.z+l+iz);
+                    }
+                }
+            }
+            // Gather field on particle Eyp[i] from field on grid ey_arr
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int iy=0; iy<=depos_order-lower_in_v; iy++){
+                    for (int ix=0; ix<=depos_order; ix++){
+                        Eyp[ip] += sx[ix]*sy0[iy]*sz[iz]*
+                            ey_arr(lo.x+j+ix, lo.y+k0+iy, lo.z+l+iz);
+                    }
+                }
+            }
+            // Gather field on particle Ezp[i] from field on grid ez_arr
+            for (int iz=0; iz<=depos_order-lower_in_v; iz++){
+                for (int iy=0; iy<=depos_order; iy++){
+                    for (int ix=0; ix<=depos_order; ix++){
+                        Ezp[ip] += sx[ix]*sy[iy]*sz0[iz]*
+                            ez_arr(lo.x+j+ix, lo.y+k+iy, lo.z+l0+iz);
+                    }
+                }
+            }
+            // Gather field on particle Bzp[i] from field on grid bz_arr
+            for (int iz=0; iz<=depos_order; iz++){
+                for (int iy=0; iy<=depos_order-lower_in_v; iy++){
+                    for (int ix=0; ix<=depos_order-lower_in_v; ix++){
+                        Bzp[ip] += sx0[ix]*sy0[iy]*sz[iz]*
+                            bz_arr(lo.x+j0+ix, lo.y+k0+iy, lo.z+l+iz);
+                    }
+                }
+            }
+            // Gather field on particle Byp[i] from field on grid by_arr
+            for (int iz=0; iz<=depos_order-lower_in_v; iz++){
+                for (int iy=0; iy<=depos_order; iy++){
+                    for (int ix=0; ix<=depos_order-lower_in_v; ix++){
+                        Byp[ip] += sx0[ix]*sy[iy]*sz0[iz]*
+                            by_arr(lo.x+j0+ix, lo.y+k+iy, lo.z+l0+iz);
+                    }
+                }
+            }
+            // Gather field on particle Bxp[i] from field on grid bx_arr
+            for (int iz=0; iz<=depos_order-lower_in_v; iz++){
+                for (int iy=0; iy<=depos_order-lower_in_v; iy++){
+                    for (int ix=0; ix<=depos_order; ix++){
+                        Bxp[ip] += sx[ix]*sy0[iy]*sz0[iz]*
+                            bx_arr(lo.x+j+ix, lo.y+k0+iy, lo.z+l0+iz);
+                    }
+                }
+            }
+#endif
+        }
+        );
+}
+
+#endif // FIELDGATHER_H_
diff --git a/Source/Particles/Gather/Make.package b/Source/Particles/Gather/Make.package
new file mode 100644
index 000000000..10abfcaaf
--- /dev/null
+++ b/Source/Particles/Gather/Make.package
@@ -0,0 +1,3 @@
+CEXE_headers += FieldGather.H
+INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Gather
+VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles/Gather
diff --git a/Source/Particles/Make.package b/Source/Particles/Make.package
index 2038472a1..db90de1dc 100644
--- a/Source/Particles/Make.package
+++ b/Source/Particles/Make.package
@@ -9,9 +9,11 @@ CEXE_headers += MultiParticleContainer.H
 CEXE_headers += WarpXParticleContainer.H
 CEXE_headers += RigidInjectedParticleContainer.H
 CEXE_headers += PhysicalParticleContainer.H
+CEXE_headers += ShapeFactors.H
 
 include $(WARPX_HOME)/Source/Particles/Pusher/Make.package
 include $(WARPX_HOME)/Source/Particles/Deposition/Make.package
+include $(WARPX_HOME)/Source/Particles/Gather/Make.package
 
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles
diff --git a/Source/Particles/MultiParticleContainer.H b/Source/Particles/MultiParticleContainer.H
index 869126fef..7c9ede411 100644
--- a/Source/Particles/MultiParticleContainer.H
+++ b/Source/Particles/MultiParticleContainer.H
@@ -85,8 +85,9 @@ public:
     /// in the MultiParticleContainer. This is the electromagnetic version of the field gather.
     ///
     void FieldGather (int lev,
-                      const amrex::MultiFab& Ex, const amrex::MultiFab& Ey, const amrex::MultiFab& Ez,
-                      const amrex::MultiFab& Bx, const amrex::MultiFab& By, const amrex::MultiFab& Bz); 
+                      const amrex::MultiFab& Ex, const amrex::MultiFab& Ey,
+                      const amrex::MultiFab& Ez, const amrex::MultiFab& Bx,
+                      const amrex::MultiFab& By, const amrex::MultiFab& Bz); 
 
     ///
     /// This evolves all the particles by one PIC time step, including current deposition, the
diff --git a/Source/Particles/MultiParticleContainer.cpp b/Source/Particles/MultiParticleContainer.cpp
index 9d39ec2f9..982e04e39 100644
--- a/Source/Particles/MultiParticleContainer.cpp
+++ b/Source/Particles/MultiParticleContainer.cpp
@@ -172,30 +172,6 @@ MultiParticleContainer::EvolveES (const Vector<std::array<std::unique_ptr<MultiF
 }
 
 void
-MultiParticleContainer::Evolve (int lev,
-                                const MultiFab& Ex, const MultiFab& Ey, const MultiFab& Ez,
-                                const MultiFab& Bx, const MultiFab& By, const MultiFab& Bz,
-                                MultiFab& jx, MultiFab& jy, MultiFab& jz,
-                                MultiFab* cjx,  MultiFab* cjy, MultiFab* cjz, 
-                                MultiFab* rho,
-                                const MultiFab* cEx, const MultiFab* cEy, const MultiFab* cEz,
-                                const MultiFab* cBx, const MultiFab* cBy, const MultiFab* cBz,
-                                Real t, Real dt)
-{
-    jx.setVal(0.0);
-    jy.setVal(0.0);
-    jz.setVal(0.0);
-    if (cjx) cjx->setVal(0.0);
-    if (cjy) cjy->setVal(0.0);
-    if (cjz) cjz->setVal(0.0);
-    if (rho) rho->setVal(0.0);
-    for (auto& pc : allcontainers) {
-	pc->Evolve(lev, Ex, Ey, Ez, Bx, By, Bz, jx, jy, jz, cjx, cjy, cjz,
-               rho, cEx, cEy, cEz, cBx, cBy, cBz, t, dt);
-    }    
-}
-
-void
 MultiParticleContainer::PushXES (Real dt)
 {
     for (auto& pc : allcontainers) {
@@ -240,8 +216,9 @@ MultiParticleContainer::sumParticleCharge (bool local)
 
 void
 MultiParticleContainer::FieldGather (int lev,
-                                     const MultiFab& Ex, const MultiFab& Ey, const MultiFab& Ez,
-                                     const MultiFab& Bx, const MultiFab& By, const MultiFab& Bz)
+                                     const MultiFab& Ex, const MultiFab& Ey,
+                                     const MultiFab& Ez, const MultiFab& Bx,
+                                     const MultiFab& By, const MultiFab& Bz)
 {
     for (auto& pc : allcontainers) {
         pc->FieldGather(lev, Ex, Ey, Ez, Bx, By, Bz);
@@ -331,7 +308,7 @@ MultiParticleContainer::RedistributeLocal (const int num_ghost)
 }
 
 Vector<long>
-MultiParticleContainer::NumberOfParticlesInGrid(int lev) const
+MultiParticleContainer::NumberOfParticlesInGrid (int lev) const
 {
     const bool only_valid=true, only_local=true;
     Vector<long> r = allcontainers[0]->NumberOfParticlesInGrid(lev,only_valid,only_local);
diff --git a/Source/Particles/PhysicalParticleContainer.H b/Source/Particles/PhysicalParticleContainer.H
index d55764682..b80619733 100644
--- a/Source/Particles/PhysicalParticleContainer.H
+++ b/Source/Particles/PhysicalParticleContainer.H
@@ -27,17 +27,37 @@ public:
                                const amrex::Vector<std::unique_ptr<amrex::FabArray<amrex::BaseFab<int> > > >& masks) override;
 
     virtual void EvolveES (const amrex::Vector<std::array<std::unique_ptr<amrex::MultiFab>, 3> >& E,
-                                 amrex::Vector<std::unique_ptr<amrex::MultiFab> >& rho,
+                           amrex::Vector<std::unique_ptr<amrex::MultiFab> >& rho,
                            amrex::Real t, amrex::Real dt) override;
 #endif // WARPX_DO_ELECTROSTATIC
     
-    virtual void FieldGather(int lev,
-                             const amrex::MultiFab& Ex,
-                             const amrex::MultiFab& Ey,
-                             const amrex::MultiFab& Ez,
-                             const amrex::MultiFab& Bx,
-                             const amrex::MultiFab& By,
-                             const amrex::MultiFab& Bz) final;
+    virtual void FieldGather (int lev,
+                              const amrex::MultiFab& Ex,
+                              const amrex::MultiFab& Ey,
+                              const amrex::MultiFab& Ez,
+                              const amrex::MultiFab& Bx,
+                              const amrex::MultiFab& By,
+                              const amrex::MultiFab& Bz) final;
+
+    void FieldGather (WarpXParIter& pti,
+                      RealVector& Exp,
+                      RealVector& Eyp,
+                      RealVector& Ezp,
+                      RealVector& Bxp,
+                      RealVector& Byp,
+                      RealVector& Bzp,
+                      amrex::FArrayBox const * exfab,
+                      amrex::FArrayBox const * eyfab,
+                      amrex::FArrayBox const * ezfab,
+                      amrex::FArrayBox const * bxfab,
+                      amrex::FArrayBox const * byfab,
+                      amrex::FArrayBox const * bzfab,
+                      const int ngE, const int e_is_nodal,
+                      const long offset,
+                      const long np_to_gather,
+                      int thread_num,
+                      int lev,
+                      int depos_lev);
 
     virtual void Evolve (int lev,
 			 const amrex::MultiFab& Ex,
@@ -87,11 +107,8 @@ public:
 
     // Inject particles in Box 'part_box'
     virtual void AddParticles (int lev);
+
     void AddPlasma(int lev, amrex::RealBox part_realbox = amrex::RealBox());
-    void AddPlasmaCPU (int lev, amrex::RealBox part_realbox);
-#ifdef AMREX_USE_GPU
-    void AddPlasmaGPU (int lev, amrex::RealBox part_realbox);
-#endif
 
     void MapParticletoBoostedFrame(amrex::Real& x, amrex::Real& y, amrex::Real& z, std::array<amrex::Real, 3>& u);
 
@@ -120,16 +137,8 @@ protected:
     bool boost_adjust_transverse_positions = false;
     bool do_backward_propagation = false;
 
-    long NumParticlesToAdd (const amrex::Box& overlap_box,
-			    const amrex::RealBox& overlap_realbox,
-			    const amrex::RealBox& tile_real_box,
-			    const amrex::RealBox& particle_real_box);
-  
-    int GetRefineFac(const amrex::Real x, const amrex::Real y, const amrex::Real z);
-    std::unique_ptr<amrex::IArrayBox> m_refined_injection_mask = nullptr;
-
     // Inject particles during the whole simulation
-    void ContinuousInjection(const amrex::RealBox& injection_box) override;
+    void ContinuousInjection (const amrex::RealBox& injection_box) override;
 
 };
 
diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp
index d47a7b220..d10390204 100644
--- a/Source/Particles/PhysicalParticleContainer.cpp
+++ b/Source/Particles/PhysicalParticleContainer.cpp
@@ -6,65 +6,16 @@
 #include <WarpX.H>
 #include <WarpXConst.H>
 #include <WarpXWrappers.h>
+#include <FieldGather.H>
 
+#include <WarpXAlgorithmSelection.H>
 
-using namespace amrex;
-
-long PhysicalParticleContainer::
-NumParticlesToAdd(const Box& overlap_box, const RealBox& overlap_realbox,
-                  const RealBox& tile_realbox, const RealBox& particle_real_box)
-{
-    const int lev = 0;
-    const Geometry& geom = Geom(lev);
-    int num_ppc = plasma_injector->num_particles_per_cell;
-    const Real* dx = geom.CellSize();
+// Import low-level single-particle kernels
+#include <UpdatePosition.H>
+#include <UpdateMomentumBoris.H>
+#include <UpdateMomentumVay.H>
 
-    long np = 0;
-    const auto& overlap_corner = overlap_realbox.lo();
-    for (IntVect iv = overlap_box.smallEnd(); iv <= overlap_box.bigEnd(); overlap_box.next(iv))
-    {
-        int fac;
-        if (do_continuous_injection) {
-#if ( AMREX_SPACEDIM == 3 )
-            Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0];
-            Real y = overlap_corner[1] + (iv[1] + 0.5)*dx[1];
-            Real z = overlap_corner[2] + (iv[2] + 0.5)*dx[2];
-#elif ( AMREX_SPACEDIM == 2 )
-            Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0];
-            Real y = 0;
-            Real z = overlap_corner[1] + (iv[1] + 0.5)*dx[1];
-#endif
-            fac = GetRefineFac(x, y, z);
-        } else {
-            fac = 1.0;
-        }
-	
-        int ref_num_ppc = num_ppc * AMREX_D_TERM(fac, *fac, *fac);
-        for (int i_part=0; i_part<ref_num_ppc;i_part++) {
-            std::array<Real, 3> r;
-            plasma_injector->getPositionUnitBox(r, i_part, fac);
-#if ( AMREX_SPACEDIM == 3 )
-            Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0];
-            Real y = overlap_corner[1] + (iv[1] + r[1])*dx[1];
-            Real z = overlap_corner[2] + (iv[2] + r[2])*dx[2];
-#elif ( AMREX_SPACEDIM == 2 )
-            Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0];
-            Real y = 0;
-            Real z = overlap_corner[1] + (iv[1] + r[1])*dx[1];
-#endif
-            // If the new particle is not inside the tile box,
-            // go to the next generated particle.
-#if ( AMREX_SPACEDIM == 3 )
-            if(!tile_realbox.contains( RealVect{x, y, z} )) continue;
-#elif ( AMREX_SPACEDIM == 2 )
-            if(!tile_realbox.contains( RealVect{x, z} )) continue;
-#endif
-            ++np;
-        }
-    }
-    
-    return np;
-}
+using namespace amrex;
 
 PhysicalParticleContainer::PhysicalParticleContainer (AmrCore* amr_core, int ispecies,
                                                       const std::string& name)
@@ -127,9 +78,7 @@ PhysicalParticleContainer::PhysicalParticleContainer (AmrCore* amr_core)
 void PhysicalParticleContainer::InitData()
 {
     AddParticles(0); // Note - add on level 0
-    if (maxLevel() > 0) {
-        Redistribute();  // We then redistribute
-    }
+    Redistribute();  // We then redistribute
 }
 
 void PhysicalParticleContainer::MapParticletoBoostedFrame(Real& x, Real& y, Real& z, std::array<Real, 3>& u)
@@ -193,45 +142,36 @@ PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
     std::normal_distribution<double> distz(z_m, z_rms);
 
     if (ParallelDescriptor::IOProcessor()) {
-        std::array<Real, 3> u;
-        Real weight;
         // If do_symmetrize, create 4x fewer particles, and 
         // Replicate each particle 4 times (x,y) (-x,y) (x,-y) (-x,-y)
         if (do_symmetrize){
             npart /= 4;
         }
         for (long i = 0; i < npart; ++i) {
-#if ( AMREX_SPACEDIM == 3 | WARPX_RZ)
-            weight = q_tot/npart/charge;
+#if ( AMREX_SPACEDIM == 3 | WARPX_DIM_RZ)
+            Real weight = q_tot/npart/charge;
             Real x = distx(mt);
             Real y = disty(mt);
             Real z = distz(mt);
 #elif ( AMREX_SPACEDIM == 2 )
-            weight = q_tot/npart/charge/y_rms;
+            Real weight = q_tot/npart/charge/y_rms;
             Real x = distx(mt);
             Real y = 0.;
             Real z = distz(mt);
 #endif
             if (plasma_injector->insideBounds(x, y, z)) {
-                plasma_injector->getMomentum(u, x, y, z);
+                XDim3 u = plasma_injector->getMomentum(x, y, z);
+                u.x *= PhysConst::c;
+                u.y *= PhysConst::c;
+                u.z *= PhysConst::c;
                 if (do_symmetrize){
-                    std::array<Real, 3> u_tmp;
-                    Real x_tmp, y_tmp;
                     // Add four particles to the beam:
-                    // (x,ux,y,uy) (-x,-ux,y,uy) (x,ux,-y,-uy) (-x,-ux,-y,-uy)
-                    for (int ix=0; ix<2; ix++){
-                        for (int iy=0; iy<2; iy++){
-                            u_tmp = u;
-                            x_tmp     = x*std::pow(-1,ix);
-                            u_tmp[0] *= std::pow(-1,ix);
-                            y_tmp     = y*std::pow(-1,iy);
-                            u_tmp[1] *= std::pow(-1,iy);
-                            CheckAndAddParticle(x_tmp, y_tmp, z, 
-                                                u_tmp, weight/4);
-                        }
-                    }
+                    CheckAndAddParticle( x, y, z, { u.x, u.y, u.z}, weight/4. );
+                    CheckAndAddParticle( x,-y, z, { u.x,-u.y, u.z}, weight/4. );
+                    CheckAndAddParticle(-x, y, z, {-u.x, u.y, u.z}, weight/4. );
+                    CheckAndAddParticle(-x,-y, z, {-u.x,-u.y, u.z}, weight/4. );
                 } else {
-                    CheckAndAddParticle(x, y, z, u, weight);
+                    CheckAndAddParticle(x, y, z, {u.x,u.y,u.z}, weight);
                 }
             }
         }
@@ -322,28 +262,19 @@ PhysicalParticleContainer::AddParticles (int lev)
 void
 PhysicalParticleContainer::AddPlasma (int lev, RealBox part_realbox)
 {
-#ifdef AMREX_USE_GPU
-    AddPlasmaGPU(lev, part_realbox);
-#else
-    AddPlasmaCPU(lev, part_realbox);
-#endif
-}
-
-void
-PhysicalParticleContainer::AddPlasmaCPU (int lev, RealBox part_realbox)
-{
-    BL_PROFILE("PhysicalParticleContainer::AddPlasmaCPU");
+    BL_PROFILE("PhysicalParticleContainer::AddPlasma");
 
     // If no part_realbox is provided, initialize particles in the whole domain
     const Geometry& geom = Geom(lev);
     if (!part_realbox.ok()) part_realbox = geom.ProbDomain();
 
     int num_ppc = plasma_injector->num_particles_per_cell;
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     Real rmax = std::min(plasma_injector->xmax, part_realbox.hi(0));
 #endif
 
-    const Real* dx = geom.CellSize();
+    const auto dx = geom.CellSizeArray();
+    const auto problo = geom.ProbLoArray();
 
     Real scale_fac;
 #if AMREX_SPACEDIM==3
@@ -358,490 +289,341 @@ PhysicalParticleContainer::AddPlasmaCPU (int lev, RealBox part_realbox)
         const int grid_id = mfi.index();
         const int tile_id = mfi.LocalTileIndex();
         GetParticles(lev)[std::make_pair(grid_id, tile_id)];
+        if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags) {
+            DefineAndReturnParticleTile(lev, grid_id, tile_id);
+        }
     }
 #endif
 
     MultiFab* cost = WarpX::getCosts(lev);
 
-    if ( (not m_refined_injection_mask) and WarpX::do_moving_window)
+    const int nlevs = numLevels();
+    static bool refine_injection = false;
+    static Box fine_injection_box;
+    static int rrfac = 1;
+    // This does not work if the mesh is dynamic.  But in that case, we should
+    // not use refined injected either.  We also assume there is only one fine level.
+    if (WarpX::do_moving_window and WarpX::refine_plasma
+        and do_continuous_injection and nlevs == 2)
     {
-        Box mask_box = geom.Domain();
-        mask_box.setSmall(WarpX::moving_window_dir, 0);
-        mask_box.setBig(WarpX::moving_window_dir, 0);
-        m_refined_injection_mask.reset( new IArrayBox(mask_box));
-        m_refined_injection_mask->setVal(-1);
+        refine_injection = true;
+        fine_injection_box = ParticleBoxArray(1).minimalBox();
+        fine_injection_box.setSmall(WarpX::moving_window_dir, std::numeric_limits<int>::lowest());
+        fine_injection_box.setBig(WarpX::moving_window_dir, std::numeric_limits<int>::max());
+        rrfac = m_gdb->refRatio(0)[0];
+        fine_injection_box.coarsen(rrfac);
     }
 
+    InjectorPosition* inj_pos = plasma_injector->getInjectorPosition();
+    InjectorDensity*  inj_rho = plasma_injector->getInjectorDensity();
+    InjectorMomentum* inj_mom = plasma_injector->getInjectorMomentum();
+    Real gamma_boost = WarpX::gamma_boost;
+    Real beta_boost = WarpX::beta_boost;
+    Real t = WarpX::GetInstance().gett_new(lev);
+    Real density_min = plasma_injector->density_min;
+    Real density_max = plasma_injector->density_max;
+
+#ifdef WARPX_DIM_RZ
+    bool radially_weighted = plasma_injector->radially_weighted;
+#endif
+
     MFItInfo info;
-    if (do_tiling) {
+    if (do_tiling && Gpu::notInLaunchRegion()) {
         info.EnableTiling(tile_size);
     }
-    info.SetDynamic(true);
-
 #ifdef _OPENMP
+    info.SetDynamic(true);
 #pragma omp parallel if (not WarpX::serialize_ics)
 #endif
+    for (MFIter mfi = MakeMFIter(lev, info); mfi.isValid(); ++mfi)
     {
-        std::array<Real,PIdx::nattribs> attribs;
-        attribs.fill(0.0);
-
-        // Loop through the tiles
-        for (MFIter mfi = MakeMFIter(lev, info); mfi.isValid(); ++mfi) {
-
-            Real wt = amrex::second();
-
-            const Box& tile_box = mfi.tilebox();
-            const RealBox tile_realbox = WarpX::getRealBox(tile_box, lev);
-
-            // Find the cells of part_box that overlap with tile_realbox
-            // If there is no overlap, just go to the next tile in the loop
-            RealBox overlap_realbox;
-            Box overlap_box;
-            Real ncells_adjust;
-            bool no_overlap = 0;
-
-            for (int dir=0; dir<AMREX_SPACEDIM; dir++) {
-                if ( tile_realbox.lo(dir) <= part_realbox.hi(dir) ) {
-                    ncells_adjust = std::floor( (tile_realbox.lo(dir) - part_realbox.lo(dir))/dx[dir] );
-                    overlap_realbox.setLo( dir, part_realbox.lo(dir) + std::max(ncells_adjust, 0.) * dx[dir]);
-                } else {
-                    no_overlap = 1; break;
-                }
-                if ( tile_realbox.hi(dir) >= part_realbox.lo(dir) ) {
-                    ncells_adjust = std::floor( (part_realbox.hi(dir) - tile_realbox.hi(dir))/dx[dir] );
-                    overlap_realbox.setHi( dir, part_realbox.hi(dir) - std::max(ncells_adjust, 0.) * dx[dir]);
-                } else {
-                    no_overlap = 1; break;
-                }
-                // Count the number of cells in this direction in overlap_realbox
-                overlap_box.setSmall( dir, 0 );
-                overlap_box.setBig( dir,
-                                    int( round((overlap_realbox.hi(dir)-overlap_realbox.lo(dir))/dx[dir] )) - 1);
+        Real wt = amrex::second();
+
+        const Box& tile_box = mfi.tilebox();
+        const RealBox tile_realbox = WarpX::getRealBox(tile_box, lev);
+
+        // Find the cells of part_box that overlap with tile_realbox
+        // If there is no overlap, just go to the next tile in the loop
+        RealBox overlap_realbox;
+        Box overlap_box;
+        IntVect shifted;
+        bool no_overlap = false;
+
+        for (int dir=0; dir<AMREX_SPACEDIM; dir++) {
+            if ( tile_realbox.lo(dir) <= part_realbox.hi(dir) ) {
+                Real ncells_adjust = std::floor( (tile_realbox.lo(dir) - part_realbox.lo(dir))/dx[dir] );
+                overlap_realbox.setLo( dir, part_realbox.lo(dir) + std::max(ncells_adjust, 0.) * dx[dir]);
+            } else {
+                no_overlap = true; break;
             }
-            if (no_overlap == 1) {
-                continue; // Go to the next tile
+            if ( tile_realbox.hi(dir) >= part_realbox.lo(dir) ) {
+                Real ncells_adjust = std::floor( (part_realbox.hi(dir) - tile_realbox.hi(dir))/dx[dir] );
+                overlap_realbox.setHi( dir, part_realbox.hi(dir) - std::max(ncells_adjust, 0.) * dx[dir]);
+            } else {
+                no_overlap = true; break;
             }
+            // Count the number of cells in this direction in overlap_realbox
+            overlap_box.setSmall( dir, 0 );
+            overlap_box.setBig( dir,
+                int( std::round((overlap_realbox.hi(dir)-overlap_realbox.lo(dir))
+                                /dx[dir] )) - 1);
+            shifted[dir] = std::round((overlap_realbox.lo(dir)-problo[dir])/dx[dir]);
+            // shifted is exact in non-moving-window direction.  That's all we care.
+        }
+        if (no_overlap == 1) {
+            continue; // Go to the next tile
+        }
 
-            const int grid_id = mfi.index();
-            const int tile_id = mfi.LocalTileIndex();
-
-            // Loop through the cells of overlap_box and inject
-            // the corresponding particles
-            const auto& overlap_corner = overlap_realbox.lo();
-            for (IntVect iv = overlap_box.smallEnd(); iv <= overlap_box.bigEnd(); overlap_box.next(iv))
-            {
-                int fac;
-                if (do_continuous_injection) {
-#if ( AMREX_SPACEDIM == 3 )
-                    Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0];
-                    Real y = overlap_corner[1] + (iv[1] + 0.5)*dx[1];
-                    Real z = overlap_corner[2] + (iv[2] + 0.5)*dx[2];
-#elif ( AMREX_SPACEDIM == 2 )
-                    Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0];
-                    Real y = 0;
-                    Real z = overlap_corner[1] + (iv[1] + 0.5)*dx[1];
-#endif
-                    fac = GetRefineFac(x, y, z);
-                } else {
-                    fac = 1.0;
-                }
-
-                int ref_num_ppc = num_ppc * AMREX_D_TERM(fac, *fac, *fac);
-                for (int i_part=0; i_part<ref_num_ppc;i_part++) {
-                    std::array<Real, 3> r;
-                    plasma_injector->getPositionUnitBox(r, i_part, fac);
-#if ( AMREX_SPACEDIM == 3 )
-                    Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0];
-                    Real y = overlap_corner[1] + (iv[1] + r[1])*dx[1];
-                    Real z = overlap_corner[2] + (iv[2] + r[2])*dx[2];
-#elif ( AMREX_SPACEDIM == 2 )
-                    Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0];
-                    Real y = 0;
-                    Real z = overlap_corner[1] + (iv[1] + r[1])*dx[1];
-#endif
-                    // If the new particle is not inside the tile box,
-                    // go to the next generated particle.
-#if ( AMREX_SPACEDIM == 3 )
-                    if(!tile_realbox.contains( RealVect{x, y, z} )) continue;
-#elif ( AMREX_SPACEDIM == 2 )
-                    if(!tile_realbox.contains( RealVect{x, z} )) continue;
-#endif
+        const int grid_id = mfi.index();
+        const int tile_id = mfi.LocalTileIndex();
 
-                    // Save the x and y values to use in the insideBounds checks.
-                    // This is needed with WARPX_RZ since x and y are modified.
-                    Real xb = x;
-                    Real yb = y;
-
-#ifdef WARPX_RZ
-                    // Replace the x and y, choosing the angle randomly.
-                    // These x and y are used to get the momentum and density
-                    Real theta = 2.*MathConst::pi*amrex::Random();
-                    y = x*std::sin(theta);
-                    x = x*std::cos(theta);
-#endif
+        // Max number of new particles, if particles are created in the whole
+        // overlap_box. All of them are created, and invalid ones are then 
+        // discaded
+        int max_new_particles = overlap_box.numPts() * num_ppc;
 
-                    Real dens;
-                    std::array<Real, 3> u;
-                    if (WarpX::gamma_boost == 1.){
-                        // Lab-frame simulation
-                        // If the particle is not within the species's
-                        // xmin, xmax, ymin, ymax, zmin, zmax, go to
-                        // the next generated particle.
-                        if (!plasma_injector->insideBounds(xb, yb, z)) continue;
-                        plasma_injector->getMomentum(u, x, y, z);
-                        dens = plasma_injector->getDensity(x, y, z);
-                    } else {
-                        // Boosted-frame simulation
-                        Real c = PhysConst::c;
-                        Real gamma_boost = WarpX::gamma_boost;
-                        Real beta_boost = WarpX::beta_boost;
-                        // Since the user provides the density distribution
-                        // at t_lab=0 and in the lab-frame coordinates,
-                        // we need to find the lab-frame position of this
-                        // particle at t_lab=0, from its boosted-frame coordinates
-                        // Assuming ballistic motion, this is given by:
-                        // z0_lab = gamma*( z_boost*(1-beta*betaz_lab) - ct_boost*(betaz_lab-beta) )
-                        // where betaz_lab is the speed of the particle in the lab frame
-                        //
-                        // In order for this equation to be solvable, betaz_lab
-                        // is explicitly assumed to have no dependency on z0_lab
-                        plasma_injector->getMomentum(u, x, y, 0.); // No z0_lab dependency
-                        // At this point u is the lab-frame momentum
-                        // => Apply the above formula for z0_lab
-                        Real gamma_lab = std::sqrt( 1 + (u[0]*u[0] + u[1]*u[1] + u[2]*u[2])/(c*c) );
-                        Real betaz_lab = u[2]/gamma_lab/c;
-                        Real t = WarpX::GetInstance().gett_new(lev);
-                        Real z0_lab = gamma_boost * ( z*(1-beta_boost*betaz_lab) - c*t*(betaz_lab-beta_boost) );
-                        // If the particle is not within the lab-frame zmin, zmax, etc.
-                        // go to the next generated particle.
-                        if (!plasma_injector->insideBounds(xb, yb, z0_lab)) continue;
-                        // call `getDensity` with lab-frame parameters
-                        dens = plasma_injector->getDensity(x, y, z0_lab);
-                        // At this point u and dens are the lab-frame quantities
-                        // => Perform Lorentz transform
-                        dens = gamma_boost * dens * ( 1 - beta_boost*betaz_lab );
-                        u[2] = gamma_boost * ( u[2] -beta_boost*c*gamma_lab );
-                    }
-                    Real weight = dens * scale_fac / (AMREX_D_TERM(fac, *fac, *fac));
-#ifdef WARPX_RZ
-                    if (plasma_injector->radially_weighted) {
-                        weight *= 2*MathConst::pi*xb;
-                    } else {
-                        // This is not correct since it might shift the particle
-                        // out of the local grid
-                        x = std::sqrt(xb*rmax);
-                        weight *= dx[0];
-                    }
-#endif
-                    attribs[PIdx::w ] = weight;
-                    attribs[PIdx::ux] = u[0];
-                    attribs[PIdx::uy] = u[1];
-                    attribs[PIdx::uz] = u[2];
-                    
-                    if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)
-                    {
-                        auto& particle_tile = DefineAndReturnParticleTile(lev, grid_id, tile_id);
-                        particle_tile.push_back_real(particle_comps["xold"], x);
-                        particle_tile.push_back_real(particle_comps["yold"], y);
-                        particle_tile.push_back_real(particle_comps["zold"], z);
-
-                        particle_tile.push_back_real(particle_comps["uxold"], u[0]);
-                        particle_tile.push_back_real(particle_comps["uyold"], u[1]);
-                        particle_tile.push_back_real(particle_comps["uzold"], u[2]);
-                    }
-
-                    AddOneParticle(lev, grid_id, tile_id, x, y, z, attribs);
+        // If refine injection, build pointer dp_cellid that holds pointer to 
+        // array of refined cell IDs.
+        Vector<int> cellid_v;
+        if (refine_injection and lev == 0)
+        {
+            // then how many new particles will be injected is not that simple
+            // We have to shift fine_injection_box because overlap_box has been shifted.
+            Box fine_overlap_box = overlap_box & amrex::shift(fine_injection_box,shifted);
+            max_new_particles += fine_overlap_box.numPts() * num_ppc
+                * (AMREX_D_TERM(rrfac,*rrfac,*rrfac)-1);
+            for (int icell = 0, ncells = overlap_box.numPts(); icell < ncells; ++icell) {
+                IntVect iv = overlap_box.atOffset(icell);
+                int r = (fine_overlap_box.contains(iv)) ? AMREX_D_TERM(rrfac,*rrfac,*rrfac) : 1;
+                for (int ipart = 0; ipart < r; ++ipart) {
+                    cellid_v.push_back(icell);
+                    cellid_v.push_back(ipart);
                 }
             }
+        }
+        int const* hp_cellid = (cellid_v.empty()) ? nullptr : cellid_v.data();
+        amrex::AsyncArray<int> cellid_aa(hp_cellid, cellid_v.size());
+        int const* dp_cellid = cellid_aa.data();
 
-            if (cost) {
-                wt = (amrex::second() - wt) / tile_box.d_numPts();
-                Array4<Real> const& costarr = cost->array(mfi);
-                amrex::ParallelFor(tile_box,
-                                   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
-                                   {
-                                       costarr(i,j,k) += wt;
-                                   });
-            }
+        // Update NextID to include particles created in this function
+        int pid;
+#pragma omp critical (add_plasma_nextid)
+        {
+            pid = ParticleType::NextID();
+            ParticleType::NextID(pid+max_new_particles);
         }
-    }
-}
+        const int cpuid = ParallelDescriptor::MyProc();
 
-#ifdef AMREX_USE_GPU
-void
-PhysicalParticleContainer::AddPlasmaGPU (int lev, RealBox part_realbox)
-{
-    BL_PROFILE("PhysicalParticleContainer::AddPlasmaGPU");
+        auto& particle_tile = GetParticles(lev)[std::make_pair(grid_id,tile_id)];
+        bool do_boosted = false;
+        if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags) {
+            do_boosted = true;
+            DefineAndReturnParticleTile(lev, grid_id, tile_id);
+        }
+        auto old_size = particle_tile.GetArrayOfStructs().size();
+        auto new_size = old_size + max_new_particles;
+        particle_tile.resize(new_size);
+
+        ParticleType* pp = particle_tile.GetArrayOfStructs()().data() + old_size;
+        auto& soa = particle_tile.GetStructOfArrays();
+        GpuArray<Real*,PIdx::nattribs> pa;
+        for (int ia = 0; ia < PIdx::nattribs; ++ia) {
+            pa[ia] = soa.GetRealData(ia).data() + old_size;
+        }
+        GpuArray<Real*,6> pb;
+        if (do_boosted) {
+            pb[0] = soa.GetRealData(particle_comps[ "xold"]).data() + old_size;
+            pb[1] = soa.GetRealData(particle_comps[ "yold"]).data() + old_size;
+            pb[2] = soa.GetRealData(particle_comps[ "zold"]).data() + old_size;
+            pb[3] = soa.GetRealData(particle_comps["uxold"]).data() + old_size;
+            pb[4] = soa.GetRealData(particle_comps["uyold"]).data() + old_size;
+            pb[5] = soa.GetRealData(particle_comps["uzold"]).data() + old_size;
+        }
 
-    // If no part_realbox is provided, initialize particles in the whole domain
-    const Geometry& geom = Geom(lev);
-    if (!part_realbox.ok()) part_realbox = geom.ProbDomain();
+        const GpuArray<Real,AMREX_SPACEDIM> overlap_corner
+            {AMREX_D_DECL(overlap_realbox.lo(0),
+                          overlap_realbox.lo(1),
+                          overlap_realbox.lo(2))};
 
-    int num_ppc = plasma_injector->num_particles_per_cell;
-#ifdef WARPX_RZ
-    Real rmax = std::min(plasma_injector->xmax, part_realbox.hi(0));
-#endif
+        std::size_t shared_mem_bytes = plasma_injector->sharedMemoryNeeded();
+        int lrrfac = rrfac;
 
-    const Real* dx = geom.CellSize();
+        // Loop over all new particles and inject them (creates too many 
+        // particles, in particular does not consider xmin, xmax etc.).
+        // The invalid ones are given negative ID and are deleted during the 
+        // next redistribute.
+        amrex::For(max_new_particles, [=] AMREX_GPU_DEVICE (int ip) noexcept
+        {
+            ParticleType& p = pp[ip];
+            p.id() = pid+ip;
+            p.cpu() = cpuid;
+
+            int cellid, i_part;
+            Real fac;
+            if (dp_cellid == nullptr) {
+                cellid = ip/num_ppc;
+                i_part = ip - cellid*num_ppc;
+                fac = 1.0;
+            } else {
+                cellid = dp_cellid[2*ip];
+                i_part = dp_cellid[2*ip+1];
+                fac = lrrfac;
+            }
 
-    Real scale_fac;
-#if AMREX_SPACEDIM==3
-    scale_fac = dx[0]*dx[1]*dx[2]/num_ppc;
-#elif AMREX_SPACEDIM==2
-    scale_fac = dx[0]*dx[1]/num_ppc;
-#endif
+            IntVect iv = overlap_box.atOffset(cellid);
 
-#ifdef _OPENMP
-    // First touch all tiles in the map in serial
-    for (MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi) {
-        const int grid_id = mfi.index();
-        const int tile_id = mfi.LocalTileIndex();
-        GetParticles(lev)[std::make_pair(grid_id, tile_id)];
-    }
+            const XDim3 r = inj_pos->getPositionUnitBox(i_part, fac);
+#if (AMREX_SPACEDIM == 3)
+            Real x = overlap_corner[0] + (iv[0]+r.x)*dx[0];
+            Real y = overlap_corner[1] + (iv[1]+r.y)*dx[1];
+            Real z = overlap_corner[2] + (iv[2]+r.z)*dx[2];
+#else
+            Real x = overlap_corner[0] + (iv[0]+r.x)*dx[0];
+            Real y = 0.0;
+            Real z = overlap_corner[1] + (iv[1]+r.y)*dx[1];
 #endif
 
-    MultiFab* cost = WarpX::getCosts(lev);
-
-    if ( (not m_refined_injection_mask) and WarpX::do_moving_window)
-    {
-        Box mask_box = geom.Domain();
-        mask_box.setSmall(WarpX::moving_window_dir, 0);
-        mask_box.setBig(WarpX::moving_window_dir, 0);
-        m_refined_injection_mask.reset( new IArrayBox(mask_box));
-        m_refined_injection_mask->setVal(-1);
-    }
-
-    MFItInfo info;
-    if (do_tiling) {
-        info.EnableTiling(tile_size);
-    }
-    info.SetDynamic(true);
-
-#ifdef _OPENMP
-#pragma omp parallel if (not WarpX::serialize_ics)
+#if (AMREX_SPACEDIM == 3)
+            if (!tile_realbox.contains(XDim3{x,y,z})) {
+                p.id() = -1;
+                return;
+            }
+#else
+            if (!tile_realbox.contains(XDim3{x,z,0.0})) {
+                p.id() = -1;
+                return;
+            }
 #endif
-    {
-        std::array<Real,PIdx::nattribs> attribs;
-        attribs.fill(0.0);
-
-        // Loop through the tiles
-        for (MFIter mfi = MakeMFIter(lev, info); mfi.isValid(); ++mfi) {
 
-            Real wt = amrex::second();
-
-            const Box& tile_box = mfi.tilebox();
-            const RealBox tile_realbox = WarpX::getRealBox(tile_box, lev);
-
-            // Find the cells of part_box that overlap with tile_realbox
-            // If there is no overlap, just go to the next tile in the loop
-            RealBox overlap_realbox;
-            Box overlap_box;
-            Real ncells_adjust;
-            bool no_overlap = 0;
+            // Save the x and y values to use in the insideBounds checks.
+            // This is needed with WARPX_DIM_RZ since x and y are modified.
+            Real xb = x;
+            Real yb = y;
+
+#ifdef WARPX_DIM_RZ
+            // Replace the x and y, choosing the angle randomly.
+            // These x and y are used to get the momentum and density
+            Real theta = 2.*MathConst::pi*amrex::Random();
+            x = xb*std::cos(theta);
+            y = xb*std::sin(theta);
+#endif
 
-            for (int dir=0; dir<AMREX_SPACEDIM; dir++) {
-                if ( tile_realbox.lo(dir) <= part_realbox.hi(dir) ) {
-                    ncells_adjust = std::floor( (tile_realbox.lo(dir) - part_realbox.lo(dir))/dx[dir] );
-                    overlap_realbox.setLo( dir, part_realbox.lo(dir) + std::max(ncells_adjust, 0.) * dx[dir]);
-                } else {
-                    no_overlap = 1; break;
+            Real dens;
+            XDim3 u;
+            if (gamma_boost == 1.) {
+                // Lab-frame simulation
+                // If the particle is not within the species's
+                // xmin, xmax, ymin, ymax, zmin, zmax, go to
+                // the next generated particle.
+                if (!inj_pos->insideBounds(xb, yb, z)) {
+                    p.id() = -1;
+                    return;
                 }
-                if ( tile_realbox.hi(dir) >= part_realbox.lo(dir) ) {
-                    ncells_adjust = std::floor( (part_realbox.hi(dir) - tile_realbox.hi(dir))/dx[dir] );
-                    overlap_realbox.setHi( dir, part_realbox.hi(dir) - std::max(ncells_adjust, 0.) * dx[dir]);
-                } else {
-                    no_overlap = 1; break;
+                u = inj_mom->getMomentum(x, y, z);
+                dens = inj_rho->getDensity(x, y, z);
+                // Remove particle if density below threshold
+                if ( dens < density_min ){
+                    p.id() = -1;
+                    return;
                 }
-                // Count the number of cells in this direction in overlap_realbox
-                overlap_box.setSmall( dir, 0 );
-                overlap_box.setBig( dir,
-                                    int( round((overlap_realbox.hi(dir)-overlap_realbox.lo(dir))/dx[dir] )) - 1);
-            }
-            if (no_overlap == 1) {
-                continue; // Go to the next tile
-            }
-
-            const int grid_id = mfi.index();
-            const int tile_id = mfi.LocalTileIndex();
-
-            Cuda::HostVector<ParticleType> host_particles;
-            std::array<Cuda::HostVector<Real>, PIdx::nattribs> host_attribs;
-	    
-            // Loop through the cells of overlap_box and inject
-            // the corresponding particles
-            const auto& overlap_corner = overlap_realbox.lo();
-            for (IntVect iv = overlap_box.smallEnd(); iv <= overlap_box.bigEnd(); overlap_box.next(iv))
-            {
-                int fac;
-                if (do_continuous_injection) {
-#if ( AMREX_SPACEDIM == 3 )
-                    Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0];
-                    Real y = overlap_corner[1] + (iv[1] + 0.5)*dx[1];
-                    Real z = overlap_corner[2] + (iv[2] + 0.5)*dx[2];
-#elif ( AMREX_SPACEDIM == 2 )
-                    Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0];
-                    Real y = 0;
-                    Real z = overlap_corner[1] + (iv[1] + 0.5)*dx[1];
-#endif
-                    fac = GetRefineFac(x, y, z);
-                } else {
-                    fac = 1.0;
+                // Cut density if above threshold
+                dens = amrex::min(dens, density_max);
+            } else {
+                // Boosted-frame simulation
+                // Since the user provides the density distribution
+                // at t_lab=0 and in the lab-frame coordinates,
+                // we need to find the lab-frame position of this
+                // particle at t_lab=0, from its boosted-frame coordinates
+                // Assuming ballistic motion, this is given by:
+                // z0_lab = gamma*( z_boost*(1-beta*betaz_lab) - ct_boost*(betaz_lab-beta) )
+                // where betaz_lab is the speed of the particle in the lab frame
+                //
+                // In order for this equation to be solvable, betaz_lab
+                // is explicitly assumed to have no dependency on z0_lab
+                u = inj_mom->getMomentum(x, y, 0.); // No z0_lab dependency
+                // At this point u is the lab-frame momentum
+                // => Apply the above formula for z0_lab
+                Real gamma_lab = std::sqrt( 1.+(u.x*u.x+u.y*u.y+u.z*u.z) );
+                Real betaz_lab = u.z/(gamma_lab);
+                Real z0_lab = gamma_boost * ( z*(1-beta_boost*betaz_lab)
+                                              - PhysConst::c*t*(betaz_lab-beta_boost) );
+                // If the particle is not within the lab-frame zmin, zmax, etc.
+                // go to the next generated particle.
+                if (!inj_pos->insideBounds(xb, yb, z0_lab)) {
+                    p.id() = -1;
+                    return;
                 }
+                // call `getDensity` with lab-frame parameters
+                dens = inj_rho->getDensity(x, y, z0_lab);
+                // Remove particle if density below threshold
+                if ( dens < density_min ){
+                    p.id() = -1;
+                    return;
+                }
+                // Cut density if above threshold
+                dens = amrex::min(dens, density_max);
+                // At this point u and dens are the lab-frame quantities
+                // => Perform Lorentz transform
+                dens = gamma_boost * dens * ( 1.0 - beta_boost*betaz_lab );
+                u.z = gamma_boost * ( u.z -beta_boost*gamma_lab );
+            }
 
-                int ref_num_ppc = num_ppc * AMREX_D_TERM(fac, *fac, *fac);
-                for (int i_part=0; i_part<ref_num_ppc;i_part++) {
-                    std::array<Real, 3> r;
-                    plasma_injector->getPositionUnitBox(r, i_part, fac);
-#if ( AMREX_SPACEDIM == 3 )
-                    Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0];
-                    Real y = overlap_corner[1] + (iv[1] + r[1])*dx[1];
-                    Real z = overlap_corner[2] + (iv[2] + r[2])*dx[2];
-#elif ( AMREX_SPACEDIM == 2 )
-                    Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0];
-                    Real y = 0;
-                    Real z = overlap_corner[1] + (iv[1] + r[1])*dx[1];
-#endif
-                    // If the new particle is not inside the tile box,
-                    // go to the next generated particle.
-#if ( AMREX_SPACEDIM == 3 )
-                    if(!tile_realbox.contains( RealVect{x, y, z} )) continue;
-#elif ( AMREX_SPACEDIM == 2 )
-                    if(!tile_realbox.contains( RealVect{x, z} )) continue;
-#endif
-
-                    // Save the x and y values to use in the insideBounds checks.
-                    // This is needed with WARPX_RZ since x and y are modified.
-                    Real xb = x;
-                    Real yb = y;
-
-#ifdef WARPX_RZ
-                    // Replace the x and y, choosing the angle randomly.
-                    // These x and y are used to get the momentum and density
-                    Real theta = 2.*MathConst::pi*amrex::Random();
-                    x = xb*std::cos(theta);
-                    y = xb*std::sin(theta);
-#endif
+            u.x *= PhysConst::c;
+            u.y *= PhysConst::c;
+            u.z *= PhysConst::c;
 
-                    Real dens;
-                    std::array<Real, 3> u;
-                    if (WarpX::gamma_boost == 1.){
-                        // Lab-frame simulation
-                        // If the particle is not within the species's
-                        // xmin, xmax, ymin, ymax, zmin, zmax, go to
-                        // the next generated particle.
-                        if (!plasma_injector->insideBounds(xb, yb, z)) continue;
-                        plasma_injector->getMomentum(u, x, y, z);
-                        dens = plasma_injector->getDensity(x, y, z);
-                    } else {
-                        // Boosted-frame simulation
-                        Real c = PhysConst::c;
-                        Real gamma_boost = WarpX::gamma_boost;
-                        Real beta_boost = WarpX::beta_boost;
-                        // Since the user provides the density distribution
-                        // at t_lab=0 and in the lab-frame coordinates,
-                        // we need to find the lab-frame position of this
-                        // particle at t_lab=0, from its boosted-frame coordinates
-                        // Assuming ballistic motion, this is given by:
-                        // z0_lab = gamma*( z_boost*(1-beta*betaz_lab) - ct_boost*(betaz_lab-beta) )
-                        // where betaz_lab is the speed of the particle in the lab frame
-                        //
-                        // In order for this equation to be solvable, betaz_lab
-                        // is explicitly assumed to have no dependency on z0_lab
-                        plasma_injector->getMomentum(u, x, y, 0.); // No z0_lab dependency
-                        // At this point u is the lab-frame momentum
-                        // => Apply the above formula for z0_lab
-                        Real gamma_lab = std::sqrt( 1 + (u[0]*u[0] + u[1]*u[1] + u[2]*u[2])/(c*c) );
-                        Real betaz_lab = u[2]/gamma_lab/c;
-                        Real t = WarpX::GetInstance().gett_new(lev);
-                        Real z0_lab = gamma_boost * ( z*(1-beta_boost*betaz_lab) - c*t*(betaz_lab-beta_boost) );
-                        // If the particle is not within the lab-frame zmin, zmax, etc.
-                        // go to the next generated particle.
-                        if (!plasma_injector->insideBounds(xb, yb, z0_lab)) continue;
-                        // call `getDensity` with lab-frame parameters
-                        dens = plasma_injector->getDensity(x, y, z0_lab);
-                        // At this point u and dens are the lab-frame quantities
-                        // => Perform Lorentz transform
-                        dens = gamma_boost * dens * ( 1 - beta_boost*betaz_lab );
-                        u[2] = gamma_boost * ( u[2] -beta_boost*c*gamma_lab );
-                    }
-                    Real weight = dens * scale_fac / (AMREX_D_TERM(fac, *fac, *fac));
-#ifdef WARPX_RZ
-                    if (plasma_injector->radially_weighted) {
-                        weight *= 2*MathConst::pi*xb;
-                    } else {
-                        // This is not correct since it might shift the particle
-                        // out of the local grid
-                        x = std::sqrt(xb*rmax);
-                        weight *= dx[0];
-                    }
+            // Real weight = dens * scale_fac / (AMREX_D_TERM(fac, *fac, *fac));
+            Real weight = dens * scale_fac;
+#ifdef WARPX_DIM_RZ
+            if (radially_weighted) {
+                weight *= 2.*MathConst::pi*xb;
+            } else {
+                // This is not correct since it might shift the particle
+                // out of the local grid
+                x = std::sqrt(xb*rmax);
+                weight *= dx[0];
+            }
 #endif
-                    attribs[PIdx::w ] = weight;
-                    attribs[PIdx::ux] = u[0];
-                    attribs[PIdx::uy] = u[1];
-                    attribs[PIdx::uz] = u[2];
-
-                    // note - this will be slow on the GPU, need to revisit
-                    if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)
-                    {
-                        auto& particle_tile = DefineAndReturnParticleTile(lev, grid_id, tile_id);
-                        particle_tile.push_back_real(particle_comps["xold"], x);
-                        particle_tile.push_back_real(particle_comps["yold"], y);
-                        particle_tile.push_back_real(particle_comps["zold"], z);
-
-                        particle_tile.push_back_real(particle_comps["uxold"], u[0]);
-                        particle_tile.push_back_real(particle_comps["uyold"], u[1]);
-                        particle_tile.push_back_real(particle_comps["uzold"], u[2]);
-                    }
+            pa[PIdx::w ][ip] = weight;
+            pa[PIdx::ux][ip] = u.x;
+            pa[PIdx::uy][ip] = u.y;
+            pa[PIdx::uz][ip] = u.z;
+
+            if (do_boosted) {
+                pb[0][ip] = x;
+                pb[1][ip] = y;
+                pb[2][ip] = z;
+                pb[3][ip] = u.x;
+                pb[4][ip] = u.y;
+                pb[5][ip] = u.z;
+            }
 
-                    ParticleType p;
-                    p.id()  = ParticleType::NextID();
-                    p.cpu() = ParallelDescriptor::MyProc();
 #if (AMREX_SPACEDIM == 3)
-                    p.pos(0) = x;
-                    p.pos(1) = y;
-                    p.pos(2) = z;
+            p.pos(0) = x;
+            p.pos(1) = y;
+            p.pos(2) = z;
 #elif (AMREX_SPACEDIM == 2)
-#ifdef WARPX_RZ
-                    attribs[PIdx::theta] = theta;
+#ifdef WARPX_DIM_RZ
+            pa[PIdx::theta][ip] = theta;
 #endif
-                    p.pos(0) = xb;
-                    p.pos(1) = z;
+            p.pos(0) = xb;
+            p.pos(1) = z;
 #endif
-
-                    host_particles.push_back(p);
-                    for (int kk = 0; kk < PIdx::nattribs; ++kk)
-                        host_attribs[kk].push_back(attribs[kk]);
-                }
-            }
-
-            auto& particle_tile = GetParticles(lev)[std::make_pair(grid_id,tile_id)];
-            auto old_size = particle_tile.GetArrayOfStructs().size();
-            auto new_size = old_size + host_particles.size();
-            particle_tile.resize(new_size);
-
-            Cuda::thrust_copy(host_particles.begin(),
-                              host_particles.end(),
-                              particle_tile.GetArrayOfStructs().begin() + old_size);
-
-            for (int kk = 0; kk < PIdx::nattribs; ++kk) {
-                Cuda::thrust_copy(host_attribs[kk].begin(),
-                                  host_attribs[kk].end(),
-                                  particle_tile.GetStructOfArrays().GetRealData(kk).begin() + old_size);
-            }
-	    			 
-            if (cost) {
-                wt = (amrex::second() - wt) / tile_box.d_numPts();
-                Array4<Real> const& costarr = cost->array(mfi);
-                amrex::ParallelFor(tile_box,
-                                   [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
-                                   {
-                                       costarr(i,j,k) += wt;
-                                   });
-            }
-        }		
+        }, shared_mem_bytes);
+    			 
+        if (cost) {
+            wt = (amrex::second() - wt) / tile_box.d_numPts();
+            Array4<Real> const& costarr = cost->array(mfi);
+            amrex::ParallelFor(tile_box,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
+            {
+                costarr(i,j,k) += wt;
+            });
+        }
     }
+
+    // The function that calls this is responsible for redistributing particles.
 }
-#endif
 
 #ifdef WARPX_DO_ELECTROSTATIC
 void
@@ -1066,11 +848,14 @@ PhysicalParticleContainer::FieldGather (int lev,
     MultiFab* cost = WarpX::getCosts(lev);
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel 
 #endif
     {
-        Cuda::ManagedDeviceVector<Real> xp, yp, zp;
-
+#ifdef _OPENMP
+        int thread_num = omp_get_thread_num();
+#else
+        int thread_num = 0;
+#endif
         for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti)
         {
             Real wt = amrex::second();
@@ -1106,35 +891,15 @@ PhysicalParticleContainer::FieldGather (int lev,
             //
             // copy data from particle container to temp arrays
             //
-            pti.GetPosition(xp, yp, zp);
-
-            const std::array<Real,3>& xyzmin = WarpX::LowerCorner(box, lev);
-            const int* ixyzmin = box.loVect();
+            pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
 
             //
             // Field Gather
             //
-            const int ll4symtry          = false;
-            long lvect_fieldgathe = 64;
-            warpx_geteb_energy_conserving(
-                &np,
-                xp.dataPtr(),
-                yp.dataPtr(),
-                zp.dataPtr(),
-                Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(),
-                Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(),
-                ixyzmin,
-                &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                &dx[0], &dx[1], &dx[2],
-                &WarpX::nox, &WarpX::noy, &WarpX::noz,
-                BL_TO_FORTRAN_ANYD(exfab),
-                BL_TO_FORTRAN_ANYD(eyfab),
-                BL_TO_FORTRAN_ANYD(ezfab),
-                BL_TO_FORTRAN_ANYD(bxfab),
-                BL_TO_FORTRAN_ANYD(byfab),
-                BL_TO_FORTRAN_ANYD(bzfab),
-                &ll4symtry, &WarpX::l_lower_order_in_v, &WarpX::do_nodal,
-                &lvect_fieldgathe, &WarpX::field_gathering_algo);
+            int e_is_nodal = Ex.is_nodal() and Ey.is_nodal() and Ez.is_nodal();
+            FieldGather(pti, Exp, Eyp, Ezp, Bxp, Byp, Bzp,
+                        &exfab, &eyfab, &ezfab, &bxfab, &byfab, &bzfab, 
+                        Ex.nGrow(), e_is_nodal, 0, np, thread_num, lev, lev);
 
             if (cost) {
                 const Box& tbx = pti.tilebox();
@@ -1164,7 +929,7 @@ PhysicalParticleContainer::Evolve (int lev,
     BL_PROFILE("PPC::Evolve()");
     BL_PROFILE_VAR_NS("PPC::Evolve::Copy", blp_copy);
     BL_PROFILE_VAR_NS("PICSAR::FieldGather", blp_pxr_fg);
-    BL_PROFILE_VAR_NS("PICSAR::ParticlePush", blp_pxr_pp);
+    BL_PROFILE_VAR_NS("PPC::ParticlePush", blp_ppc_pp);
     BL_PROFILE_VAR_NS("PPC::Evolve::partition", blp_partition);
     
     const std::array<Real,3>& dx = WarpX::CellSize(lev);
@@ -1391,57 +1156,40 @@ PhysicalParticleContainer::Evolve (int lev,
             pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
             BL_PROFILE_VAR_STOP(blp_copy);
 
-            if (rho) DepositCharge(pti, wp, rho, crho, 0, np_current, np, thread_num, lev);
+            if (rho) {
+                DepositCharge(pti, wp, rho, 0, 0, np_current, thread_num, lev, lev);
+                if (has_buffer){
+                    DepositCharge(pti, wp, crho, 0, np_current, np-np_current, thread_num, lev, lev-1);
+                }
+            }
             
             if (! do_not_push)
             {
+                const long np_gather = (cEx) ? nfine_gather : np;
+
+                int e_is_nodal = Ex.is_nodal() and Ey.is_nodal() and Ez.is_nodal();
+
                 //
                 // Field Gather of Aux Data (i.e., the full solution)
                 //
-                const int ll4symtry          = false;
-                long lvect_fieldgathe = 64;
-
-                const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev);
-                const int* ixyzmin_grid = box.loVect();
-                
-                const long np_gather = (cEx) ? nfine_gather : np;
-
                 BL_PROFILE_VAR_START(blp_pxr_fg);
-
-                warpx_geteb_energy_conserving(
-                    &np_gather,
-                    m_xp[thread_num].dataPtr(),
-                    m_yp[thread_num].dataPtr(),
-                    m_zp[thread_num].dataPtr(),
-                    Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(),
-                    Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(),
-                    ixyzmin_grid,
-                    &xyzmin_grid[0], &xyzmin_grid[1], &xyzmin_grid[2],
-                    &dx[0], &dx[1], &dx[2],
-                    &WarpX::nox, &WarpX::noy, &WarpX::noz,
-                    BL_TO_FORTRAN_ANYD(*exfab),
-                    BL_TO_FORTRAN_ANYD(*eyfab),
-                    BL_TO_FORTRAN_ANYD(*ezfab),
-                    BL_TO_FORTRAN_ANYD(*bxfab),
-                    BL_TO_FORTRAN_ANYD(*byfab),
-                    BL_TO_FORTRAN_ANYD(*bzfab),
-                    &ll4symtry, &WarpX::l_lower_order_in_v, &WarpX::do_nodal,
-                    &lvect_fieldgathe, &WarpX::field_gathering_algo);
+                FieldGather(pti, Exp, Eyp, Ezp, Bxp, Byp, Bzp,
+                            exfab, eyfab, ezfab, bxfab, byfab, bzfab, 
+                            Ex.nGrow(), e_is_nodal, 0, np_gather, thread_num, lev, lev);
 
                 if (np_gather < np)
                 {
                     const IntVect& ref_ratio = WarpX::RefRatio(lev-1);
                     const Box& cbox = amrex::coarsen(box,ref_ratio);
-                    const std::array<Real,3>& cxyzmin_grid = WarpX::LowerCorner(cbox, lev-1);
-                    const int* cixyzmin_grid = cbox.loVect();
-
-                    const FArrayBox* cexfab = &(*cEx)[pti];
-                    const FArrayBox* ceyfab = &(*cEy)[pti];
-                    const FArrayBox* cezfab = &(*cEz)[pti];
-                    const FArrayBox* cbxfab = &(*cBx)[pti];
-                    const FArrayBox* cbyfab = &(*cBy)[pti];
-                    const FArrayBox* cbzfab = &(*cBz)[pti];
 
+                    // Data on the grid
+                    FArrayBox const* cexfab = &(*cEx)[pti];
+                    FArrayBox const* ceyfab = &(*cEy)[pti];
+                    FArrayBox const* cezfab = &(*cEz)[pti];
+                    FArrayBox const* cbxfab = &(*cBx)[pti];
+                    FArrayBox const* cbyfab = &(*cBy)[pti];
+                    FArrayBox const* cbzfab = &(*cBz)[pti];
+                    
                     if (WarpX::use_fdtd_nci_corr)
                     {
 #if (AMREX_SPACEDIM == 2)
@@ -1494,26 +1242,14 @@ PhysicalParticleContainer::Evolve (int lev,
 #endif
                     }
                     
-                    long ncrse = np - nfine_gather;
-                    warpx_geteb_energy_conserving(
-                        &ncrse,
-                        m_xp[thread_num].dataPtr()+nfine_gather,
-                        m_yp[thread_num].dataPtr()+nfine_gather,
-                        m_zp[thread_num].dataPtr()+nfine_gather,
-                        Exp.dataPtr()+nfine_gather, Eyp.dataPtr()+nfine_gather, Ezp.dataPtr()+nfine_gather,
-                        Bxp.dataPtr()+nfine_gather, Byp.dataPtr()+nfine_gather, Bzp.dataPtr()+nfine_gather,
-                        cixyzmin_grid,
-                        &cxyzmin_grid[0], &cxyzmin_grid[1], &cxyzmin_grid[2],
-                        &cdx[0], &cdx[1], &cdx[2],
-                        &WarpX::nox, &WarpX::noy, &WarpX::noz,
-                        BL_TO_FORTRAN_ANYD(*cexfab),
-                        BL_TO_FORTRAN_ANYD(*ceyfab),
-                        BL_TO_FORTRAN_ANYD(*cezfab),
-                        BL_TO_FORTRAN_ANYD(*cbxfab),
-                        BL_TO_FORTRAN_ANYD(*cbyfab),
-                        BL_TO_FORTRAN_ANYD(*cbzfab),
-                        &ll4symtry, &WarpX::l_lower_order_in_v, &WarpX::do_nodal,
-                        &lvect_fieldgathe, &WarpX::field_gathering_algo);
+                    // Field gather for particles in gather buffers
+                    e_is_nodal = cEx->is_nodal() and cEy->is_nodal() and cEz->is_nodal();
+                    FieldGather(pti, Exp, Eyp, Ezp, Bxp, Byp, Bzp, 
+                                cexfab, ceyfab, cezfab,
+                                cbxfab, cbyfab, cbzfab,
+                                cEx->nGrow(), e_is_nodal, 
+                                nfine_gather, np-nfine_gather, 
+                                thread_num, lev, lev-1);
                 }
 
                 BL_PROFILE_VAR_STOP(blp_pxr_fg);
@@ -1521,10 +1257,10 @@ PhysicalParticleContainer::Evolve (int lev,
                 //
                 // Particle Push
                 //
-                BL_PROFILE_VAR_START(blp_pxr_pp);
+                BL_PROFILE_VAR_START(blp_ppc_pp);
                 PushPX(pti, m_xp[thread_num], m_yp[thread_num], m_zp[thread_num], 
                        m_giv[thread_num], dt);
-                BL_PROFILE_VAR_STOP(blp_pxr_pp);
+                BL_PROFILE_VAR_STOP(blp_ppc_pp);
 
                 //
                 // Current Deposition
@@ -1561,7 +1297,12 @@ PhysicalParticleContainer::Evolve (int lev,
                 BL_PROFILE_VAR_STOP(blp_copy);
             }
             
-            if (rho) DepositCharge(pti, wp, rho, crho, 1, np_current, np, thread_num, lev);
+            if (rho) {
+                DepositCharge(pti, wp, rho, 1, 0, np_current, thread_num, lev, lev);
+                if (has_buffer){
+                    DepositCharge(pti, wp, crho, 1, np_current, np-np_current, thread_num, lev, lev-1);
+                }
+            }
 
             if (cost) {
                 const Box& tbx = pti.tilebox();
@@ -1742,36 +1483,52 @@ PhysicalParticleContainer::PushPX(WarpXParIter& pti,
                                   Real dt)
 {
 
+    // This wraps the momentum and position advance so that inheritors can modify the call.
+    auto& attribs = pti.GetAttribs();
+    // Extract pointers to the different particle quantities
+    Real* const AMREX_RESTRICT x = xp.dataPtr();
+    Real* const AMREX_RESTRICT y = yp.dataPtr();
+    Real* const AMREX_RESTRICT z = zp.dataPtr();
+    Real* const AMREX_RESTRICT gi = giv.dataPtr();
+    Real* const AMREX_RESTRICT ux = attribs[PIdx::ux].dataPtr();
+    Real* const AMREX_RESTRICT uy = attribs[PIdx::uy].dataPtr();
+    Real* const AMREX_RESTRICT uz = attribs[PIdx::uz].dataPtr();
+    const Real* const AMREX_RESTRICT Ex = attribs[PIdx::Ex].dataPtr();
+    const Real* const AMREX_RESTRICT Ey = attribs[PIdx::Ey].dataPtr();
+    const Real* const AMREX_RESTRICT Ez = attribs[PIdx::Ez].dataPtr();
+    const Real* const AMREX_RESTRICT Bx = attribs[PIdx::Bx].dataPtr();
+    const Real* const AMREX_RESTRICT By = attribs[PIdx::By].dataPtr();
+    const Real* const AMREX_RESTRICT Bz = attribs[PIdx::Bz].dataPtr();
+
     if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)
     {
-        copy_attribs(pti, xp.dataPtr(), yp.dataPtr(), zp.dataPtr());
+        copy_attribs(pti, x, y, z);
     }
 
-    // The following attributes should be included in CPP version of warpx_particle_pusher
-	// This wraps the call to warpx_particle_pusher so that inheritors can modify the call.
-    auto& attribs = pti.GetAttribs();
-    auto& uxp = attribs[PIdx::ux];
-    auto& uyp = attribs[PIdx::uy];
-    auto& uzp = attribs[PIdx::uz];
-    auto& Exp = attribs[PIdx::Ex];
-    auto& Eyp = attribs[PIdx::Ey];
-    auto& Ezp = attribs[PIdx::Ez];
-    auto& Bxp = attribs[PIdx::Bx];
-    auto& Byp = attribs[PIdx::By];
-    auto& Bzp = attribs[PIdx::Bz];
-    const long np  = pti.numParticles();
-    
-    warpx_particle_pusher(&np,
-                          xp.dataPtr(),
-                          yp.dataPtr(),
-                          zp.dataPtr(),
-                          uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
-                          giv.dataPtr(),
-                          Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(),
-                          Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(),
-                          &this->charge, &this->mass, &dt,
-                          &WarpX::particle_pusher_algo);
-
+    // Loop over the particles and update their momentum
+    const Real q = this->charge;
+    const Real m = this-> mass;
+    if (WarpX::particle_pusher_algo == ParticlePusherAlgo::Boris){
+        amrex::ParallelFor( pti.numParticles(),
+            [=] AMREX_GPU_DEVICE (long i) {
+                UpdateMomentumBoris( ux[i], uy[i], uz[i], gi[i],
+                      Ex[i], Ey[i], Ez[i], Bx[i], By[i], Bz[i], q, m, dt);
+                UpdatePosition( x[i], y[i], z[i],
+                      ux[i], uy[i], uz[i], dt );
+            }
+        );
+    } else if (WarpX::particle_pusher_algo == ParticlePusherAlgo::Vay) {
+        amrex::ParallelFor( pti.numParticles(),
+            [=] AMREX_GPU_DEVICE (long i) {
+                UpdateMomentumVay( ux[i], uy[i], uz[i], gi[i],
+                      Ex[i], Ey[i], Ez[i], Bx[i], By[i], Bz[i], q, m, dt);
+                UpdatePosition( x[i], y[i], z[i],
+                      ux[i], uy[i], uz[i], dt );
+            }
+        );
+    } else {
+      amrex::Abort("Unknown particle pusher");
+    };
 }
 
 void
@@ -1800,9 +1557,6 @@ PhysicalParticleContainer::PushP (int lev, Real dt,
 
             auto& attribs = pti.GetAttribs();
 
-            auto& uxp = attribs[PIdx::ux];
-            auto& uyp = attribs[PIdx::uy];
-            auto& uzp = attribs[PIdx::uz];
             auto& Exp = attribs[PIdx::Ex];
             auto& Eyp = attribs[PIdx::Ey];
             auto& Ezp = attribs[PIdx::Ez];
@@ -1834,42 +1588,44 @@ PhysicalParticleContainer::PushP (int lev, Real dt,
             //
             pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
 
-            const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev);
-            const int* ixyzmin_grid = box.loVect();
-
-            const int ll4symtry          = false;
-            long lvect_fieldgathe = 64;
-
-            warpx_geteb_energy_conserving(
-                &np,
-                m_xp[thread_num].dataPtr(),
-                m_yp[thread_num].dataPtr(),
-                m_zp[thread_num].dataPtr(),
-                Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(),
-                Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(),
-                ixyzmin_grid,
-                &xyzmin_grid[0], &xyzmin_grid[1], &xyzmin_grid[2],
-                &dx[0], &dx[1], &dx[2],
-                &WarpX::nox, &WarpX::noy, &WarpX::noz,
-                BL_TO_FORTRAN_ANYD(exfab),
-                BL_TO_FORTRAN_ANYD(eyfab),
-                BL_TO_FORTRAN_ANYD(ezfab),
-                BL_TO_FORTRAN_ANYD(bxfab),
-                BL_TO_FORTRAN_ANYD(byfab),
-                BL_TO_FORTRAN_ANYD(bzfab),
-                &ll4symtry, &WarpX::l_lower_order_in_v, &WarpX::do_nodal,
-                &lvect_fieldgathe, &WarpX::field_gathering_algo);
-
-            warpx_particle_pusher_momenta(&np,
-                                          m_xp[thread_num].dataPtr(),
-                                          m_yp[thread_num].dataPtr(),
-                                          m_zp[thread_num].dataPtr(),
-                                          uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
-                                          m_giv[thread_num].dataPtr(),
-                                          Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(),
-                                          Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(),
-                                          &this->charge, &this->mass, &dt,
-                                          &WarpX::particle_pusher_algo);
+            int e_is_nodal = Ex.is_nodal() and Ey.is_nodal() and Ez.is_nodal();
+            FieldGather(pti, Exp, Eyp, Ezp, Bxp, Byp, Bzp,
+                        &exfab, &eyfab, &ezfab, &bxfab, &byfab, &bzfab, 
+                        Ex.nGrow(), e_is_nodal, 0, np, thread_num, lev, lev);
+
+            // This wraps the momentum advance so that inheritors can modify the call.
+            // Extract pointers to the different particle quantities
+            Real* const AMREX_RESTRICT gi = m_giv[thread_num].dataPtr();
+            Real* const AMREX_RESTRICT ux = attribs[PIdx::ux].dataPtr();
+            Real* const AMREX_RESTRICT uy = attribs[PIdx::uy].dataPtr();
+            Real* const AMREX_RESTRICT uz = attribs[PIdx::uz].dataPtr();
+            const Real* const AMREX_RESTRICT Expp = Exp.dataPtr();
+            const Real* const AMREX_RESTRICT Eypp = Eyp.dataPtr();
+            const Real* const AMREX_RESTRICT Ezpp = Ezp.dataPtr();
+            const Real* const AMREX_RESTRICT Bxpp = Bxp.dataPtr();
+            const Real* const AMREX_RESTRICT Bypp = Byp.dataPtr();
+            const Real* const AMREX_RESTRICT Bzpp = Bzp.dataPtr();
+
+            // Loop over the particles and update their momentum
+            const Real q = this->charge;
+            const Real m = this-> mass;
+            if (WarpX::particle_pusher_algo == ParticlePusherAlgo::Boris){
+                amrex::ParallelFor( pti.numParticles(),
+                    [=] AMREX_GPU_DEVICE (long i) {
+                        UpdateMomentumBoris( ux[i], uy[i], uz[i], gi[i],
+                              Expp[i], Eypp[i], Ezpp[i], Bxpp[i], Bypp[i], Bzpp[i], q, m, dt);
+                    }
+                );
+            } else if (WarpX::particle_pusher_algo == ParticlePusherAlgo::Vay) {
+                amrex::ParallelFor( pti.numParticles(),
+                    [=] AMREX_GPU_DEVICE (long i) {
+                        UpdateMomentumVay( ux[i], uy[i], uz[i], gi[i],
+                              Expp[i], Eypp[i], Ezpp[i], Bxpp[i], Bypp[i], Bzpp[i], q, m, dt);
+                    }
+                );
+            } else {
+              amrex::Abort("Unknown particle pusher");
+            };
         }
     }
 }
@@ -2034,74 +1790,6 @@ void PhysicalParticleContainer::GetParticleSlice(const int direction, const Real
     }
 }
 
-int PhysicalParticleContainer::GetRefineFac(const Real x, const Real y, const Real z)
-{
-    if (finestLevel() == 0) return 1;
-    if (not WarpX::refine_plasma) return 1;
-
-    IntVect iv;
-    const Geometry& geom = Geom(0);
-
-    std::array<Real, 3> offset;
-
-#if ( AMREX_SPACEDIM == 3)
-    offset[0] = geom.ProbLo(0);
-    offset[1] = geom.ProbLo(1);
-    offset[2] = geom.ProbLo(2);
-#elif ( AMREX_SPACEDIM == 2 )
-    offset[0] = geom.ProbLo(0);
-    offset[1] = 0.0;
-    offset[2] = geom.ProbLo(1);
-#endif
-
-    AMREX_D_TERM(iv[0]=static_cast<int>(floor((x-offset[0])*geom.InvCellSize(0)));,
-                 iv[1]=static_cast<int>(floor((y-offset[1])*geom.InvCellSize(1)));,
-                 iv[2]=static_cast<int>(floor((z-offset[2])*geom.InvCellSize(2))););
-
-    iv += geom.Domain().smallEnd();
-
-    const int dir = WarpX::moving_window_dir;
-
-    IntVect iv2 = iv;
-    iv2[dir] = 0;
-
-    if ( (*m_refined_injection_mask)(iv2) != -1) return (*m_refined_injection_mask)(iv2);
-
-    int ref_fac = 1;
-    for (int lev = 0; lev < finestLevel(); ++lev)
-    {
-        const IntVect rr = m_gdb->refRatio(lev);
-        const BoxArray& fine_ba = this->ParticleBoxArray(lev+1);
-        const int num_boxes = fine_ba.size();
-        Vector<Box> stretched_boxes;
-        const int safety_factor = 4;
-        for (int i = 0; i < num_boxes; ++i)
-        {
-            Box bx = fine_ba[i];
-            bx.coarsen(ref_fac*rr[dir]);
-            bx.setSmall(dir, std::numeric_limits<int>::min()/safety_factor);
-            bx.setBig(dir, std::numeric_limits<int>::max()/safety_factor);
-            stretched_boxes.push_back(bx);
-        }
-
-        BoxArray stretched_ba(stretched_boxes.dataPtr(), stretched_boxes.size());
-
-        const int num_ghost = 0;
-        if ( stretched_ba.intersects(Box(iv, iv), num_ghost) )
-        {
-            ref_fac *= rr[dir];
-        }
-        else
-        {
-            break;
-        }
-    }
-
-    (*m_refined_injection_mask)(iv2) = ref_fac;
-
-    return ref_fac;
-}
-
 /* \brief Inject particles during the simulation
  * \param injection_box: domain where particles should be injected.
  */
@@ -2112,3 +1800,134 @@ PhysicalParticleContainer::ContinuousInjection(const RealBox& injection_box)
     const int lev=0;
     AddPlasma(lev, injection_box);
 }
+
+/* \brief Gather fields from FArrayBox exfab, eyfab, ezfab, bxfab, byfab, 
+ * bzfab into arrays of fields on particles Exp, Eyp, Ezp, Bxp, Byp, Bzp.
+ * \param Exp-Bzp: fields on particles.
+ * \param exfab-bzfab: FAB of electric and magnetic fields for particles in pti
+ * \param ngE: number of guard cells for E
+ * \param e_is_nodal: 0 if E is staggered, 1 if E is nodal
+ * \param offset: index of first particle for which fields are gathered
+ * \param np_to_gather: number of particles onto which fields are gathered
+ * \param thread_num: if using OpenMP, thread number
+ * \param lev: level on which particles are located
+ * \param gather_lev: level from which particles gather fields (lev-1) for 
+          particles in buffers.
+ */
+void
+PhysicalParticleContainer::FieldGather (WarpXParIter& pti,
+                                        RealVector& Exp,
+                                        RealVector& Eyp,
+                                        RealVector& Ezp,
+                                        RealVector& Bxp,
+                                        RealVector& Byp,
+                                        RealVector& Bzp,
+                                        FArrayBox const * exfab,
+                                        FArrayBox const * eyfab,
+                                        FArrayBox const * ezfab,
+                                        FArrayBox const * bxfab,
+                                        FArrayBox const * byfab,
+                                        FArrayBox const * bzfab,
+                                        const int ngE, const int e_is_nodal,
+                                        const long offset,
+                                        const long np_to_gather,
+                                        int thread_num,
+                                        int lev,
+                                        int gather_lev)
+{
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE((gather_lev==(lev-1)) ||
+                                     (gather_lev==(lev  )),
+                                     "Gather buffers only work for lev-1");
+    
+    // If no particles, do not do anything
+    if (np_to_gather == 0) return;
+    // Get cell size on gather_lev
+    const std::array<Real,3>& dx = WarpX::CellSize(std::max(gather_lev,0));
+    // Set staggering shift depending on e_is_nodal
+    const Real stagger_shift = e_is_nodal ? 0.0 : 0.5;
+    
+    // Get box from which field is gathered.
+    // If not gathering from the finest level, the box is coarsened.
+    Box box;
+    if (lev == gather_lev) {
+        box = pti.tilebox();
+    } else {
+        const IntVect& ref_ratio = WarpX::RefRatio(gather_lev);
+        box = amrex::coarsen(pti.tilebox(),ref_ratio);
+    }
+    
+    // Add guard cells to the box.
+    box.grow(ngE);
+    
+    const Array4<const Real>& ex_arr = exfab->array();
+    const Array4<const Real>& ey_arr = eyfab->array();
+    const Array4<const Real>& ez_arr = ezfab->array();
+    const Array4<const Real>& bx_arr = bxfab->array();
+    const Array4<const Real>& by_arr = byfab->array();
+    const Array4<const Real>& bz_arr = bzfab->array();
+    
+    const Real * const AMREX_RESTRICT xp = m_xp[thread_num].dataPtr() + offset;
+    const Real * const AMREX_RESTRICT zp = m_zp[thread_num].dataPtr() + offset;
+    const Real * const AMREX_RESTRICT yp = m_yp[thread_num].dataPtr() + offset;
+    
+    // Lower corner of tile box physical domain
+    const std::array<Real, 3>& xyzmin = WarpX::LowerCorner(box, gather_lev);
+    
+    const Dim3 lo = lbound(box);
+    
+    // Depending on l_lower_in_v and WarpX::nox, call
+    // different versions of template function doGatherShapeN
+    if (WarpX::l_lower_order_in_v){
+        if        (WarpX::nox == 1){
+            doGatherShapeN<1,1>(xp, yp, zp,
+                                Exp.dataPtr() + offset, Eyp.dataPtr() + offset,
+                                Ezp.dataPtr() + offset, Bxp.dataPtr() + offset,
+                                Byp.dataPtr() + offset, Bzp.dataPtr() + offset,
+                                ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr,
+                                np_to_gather, dx,
+                                xyzmin, lo, stagger_shift);
+        } else if (WarpX::nox == 2){
+            doGatherShapeN<2,1>(xp, yp, zp,
+                                Exp.dataPtr() + offset, Eyp.dataPtr() + offset,
+                                Ezp.dataPtr() + offset, Bxp.dataPtr() + offset,
+                                Byp.dataPtr() + offset, Bzp.dataPtr() + offset,
+                                ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr,
+                                np_to_gather, dx,
+                                xyzmin, lo, stagger_shift);
+        } else if (WarpX::nox == 3){
+            doGatherShapeN<3,1>(xp, yp, zp,
+                                Exp.dataPtr() + offset, Eyp.dataPtr() + offset,
+                                Ezp.dataPtr() + offset, Bxp.dataPtr() + offset,
+                                Byp.dataPtr() + offset, Bzp.dataPtr() + offset,
+                                ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr,
+                                np_to_gather, dx,
+                                xyzmin, lo, stagger_shift);
+        }
+    } else {
+        if        (WarpX::nox == 1){
+            doGatherShapeN<1,0>(xp, yp, zp,
+                                Exp.dataPtr() + offset, Eyp.dataPtr() + offset,
+                                Ezp.dataPtr() + offset, Bxp.dataPtr() + offset,
+                                Byp.dataPtr() + offset, Bzp.dataPtr() + offset,
+                                ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr,
+                                np_to_gather, dx,
+                                xyzmin, lo, stagger_shift);
+        } else if (WarpX::nox == 2){
+            doGatherShapeN<2,0>(xp, yp, zp,
+                                Exp.dataPtr() + offset, Eyp.dataPtr() + offset,
+                                Ezp.dataPtr() + offset, Bxp.dataPtr() + offset,
+                                Byp.dataPtr() + offset, Bzp.dataPtr() + offset,
+                                ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr,
+                                np_to_gather, dx,
+                                xyzmin, lo, stagger_shift);
+        } else if (WarpX::nox == 3){
+            doGatherShapeN<3,0>(xp, yp, zp,
+                                Exp.dataPtr() + offset, Eyp.dataPtr() + offset,
+                                Ezp.dataPtr() + offset, Bxp.dataPtr() + offset,
+                                Byp.dataPtr() + offset, Bzp.dataPtr() + offset,
+                                ex_arr, ey_arr, ez_arr, bx_arr, by_arr, bz_arr,
+                                np_to_gather, dx,
+                                xyzmin, lo, stagger_shift);
+        }
+    }
+}
diff --git a/Source/Particles/Pusher/GetAndSetPosition.H b/Source/Particles/Pusher/GetAndSetPosition.H
index 42c61343e..3c74baeb2 100644
--- a/Source/Particles/Pusher/GetAndSetPosition.H
+++ b/Source/Particles/Pusher/GetAndSetPosition.H
@@ -5,7 +5,7 @@
 #include <WarpXParticleContainer.H>
 #include <AMReX_REAL.H>
 
-#ifndef WARPX_RZ
+#ifndef WARPX_DIM_RZ
 
 /* \brief Extract the particle's coordinates from the ParticleType struct `p`,
  *        and stores them in the variables `x`, `y`, `z`. */
@@ -42,7 +42,7 @@ void SetPosition(
 #endif
 }
 
-# else // if WARPX_RZ is True
+# elif defined WARPX_DIM_RZ
 
 /* \brief Extract the particle's coordinates from `theta` and the attributes
  *         of the ParticleType struct `p` (which contains the radius),
@@ -71,6 +71,6 @@ void SetCylindricalPositionFromCartesian(
     p.pos(1) = z;
 }
 
-#endif // WARPX_RZ
+#endif // WARPX_DIM_RZ
 
 #endif // WARPX_PARTICLES_PUSHER_GETANDSETPOSITION_H_
diff --git a/Source/Particles/Pusher/Make.package b/Source/Particles/Pusher/Make.package
index 8c8e77905..95a38fa2d 100644
--- a/Source/Particles/Pusher/Make.package
+++ b/Source/Particles/Pusher/Make.package
@@ -1,4 +1,6 @@
 CEXE_headers += GetAndSetPosition.H
 CEXE_headers += UpdatePosition.H
+CEXE_headers += UpdateMomentumBoris.H
+CEXE_headers += UpdateMomentumVay.H
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Pusher
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles/Pusher
diff --git a/Source/Particles/Pusher/UpdateMomentumBoris.H b/Source/Particles/Pusher/UpdateMomentumBoris.H
new file mode 100644
index 000000000..71e9a8ed1
--- /dev/null
+++ b/Source/Particles/Pusher/UpdateMomentumBoris.H
@@ -0,0 +1,47 @@
+#ifndef WARPX_PARTICLES_PUSHER_UPDATEMOMENTUM_BORIS_H_
+#define WARPX_PARTICLES_PUSHER_UPDATEMOMENTUM_BORIS_H_
+
+#include <AMReX_REAL.H>
+
+/* \brief Push the particle's positions over one timestep,
+ *    given the value of its momenta `ux`, `uy`, `uz` */
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void UpdateMomentumBoris(
+    amrex::Real& ux, amrex::Real& uy, amrex::Real& uz, amrex::Real& gaminv,
+    const amrex::Real Ex, const amrex::Real Ey, const amrex::Real Ez,
+    const amrex::Real Bx, const amrex::Real By, const amrex::Real Bz,
+    const amrex::Real q, const amrex::Real m, const amrex::Real dt )
+{
+    const amrex::Real econst = 0.5*q*dt/m;
+
+    // First half-push for E
+    ux += econst*Ex;
+    uy += econst*Ey;
+    uz += econst*Ez;
+    // Compute temporary gamma factor
+    constexpr amrex::Real inv_c2 = 1./(PhysConst::c*PhysConst::c);
+    const amrex::Real inv_gamma = 1./std::sqrt(1. + (ux*ux + uy*uy + uz*uz)*inv_c2);
+    // Magnetic rotation
+    // - Compute temporary variables
+    const amrex::Real tx = econst*inv_gamma*Bx;
+    const amrex::Real ty = econst*inv_gamma*By;
+    const amrex::Real tz = econst*inv_gamma*Bz;
+    const amrex::Real tsqi = 2./(1. + tx*tx + ty*ty + tz*tz);
+    const amrex::Real sx = tx*tsqi;
+    const amrex::Real sy = ty*tsqi;
+    const amrex::Real sz = tz*tsqi;
+    const amrex::Real ux_p = ux + uy*tz - uz*ty;
+    const amrex::Real uy_p = uy + uz*tx - ux*tz;
+    const amrex::Real uz_p = uz + ux*ty - uy*tx;
+    // - Update momentum
+    ux += uy_p*sz - uz_p*sy;
+    uy += uz_p*sx - ux_p*sz;
+    uz += ux_p*sy - uy_p*sx;
+    // Second half-push for E
+    ux += econst*Ex;
+    uy += econst*Ey;
+    uz += econst*Ez;
+    gaminv = 1./std::sqrt(1. + (ux*ux + uy*uy + uz*uz)*inv_c2);
+}
+
+#endif // WARPX_PARTICLES_PUSHER_UPDATEMOMENTUM_BORIS_H_
diff --git a/Source/Particles/Pusher/UpdateMomentumVay.H b/Source/Particles/Pusher/UpdateMomentumVay.H
new file mode 100644
index 000000000..044297e22
--- /dev/null
+++ b/Source/Particles/Pusher/UpdateMomentumVay.H
@@ -0,0 +1,54 @@
+#ifndef WARPX_PARTICLES_PUSHER_UPDATEMOMENTUM_VAY_H_
+#define WARPX_PARTICLES_PUSHER_UPDATEMOMENTUM_VAY_H_
+
+#include <AMReX_FArrayBox.H>
+#include <WarpXConst.H>
+#include <AMReX_REAL.H>
+
+/* \brief Push the particle's positions over one timestep,
+ *    given the value of its momenta `ux`, `uy`, `uz` */
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void UpdateMomentumVay(
+    amrex::Real& ux, amrex::Real& uy, amrex::Real& uz, amrex::Real& gaminv,
+    const amrex::Real Ex, const amrex::Real Ey, const amrex::Real Ez,
+    const amrex::Real Bx, const amrex::Real By, const amrex::Real Bz,
+    const amrex::Real q, const amrex::Real m, const amrex::Real dt )
+{
+    // Constants
+    const amrex::Real econst = q*dt/m;
+    const amrex::Real bconst = 0.5*q*dt/m;
+    constexpr amrex::Real invclight = 1./PhysConst::c;
+    constexpr amrex::Real invclightsq = 1./(PhysConst::c*PhysConst::c);
+    // Compute initial gamma
+    const amrex::Real inv_gamma = 1./std::sqrt(1. + (ux*ux + uy*uy + uz*uz)*invclightsq);
+    // Get tau
+    const amrex::Real taux = bconst*Bx;
+    const amrex::Real tauy = bconst*By;
+    const amrex::Real tauz = bconst*Bz;
+    const amrex::Real tausq = taux*taux+tauy*tauy+tauz*tauz;
+    // Get U', gamma'^2
+    const amrex::Real uxpr = ux + econst*Ex + (uy*tauz-uz*tauy)*inv_gamma;
+    const amrex::Real uypr = uy + econst*Ey + (uz*taux-ux*tauz)*inv_gamma;
+    const amrex::Real uzpr = uz + econst*Ez + (ux*tauy-uy*taux)*inv_gamma;
+    const amrex::Real gprsq = (1. + (uxpr*uxpr + uypr*uypr + uzpr*uzpr)*invclightsq);
+    // Get u*
+    const amrex::Real ust = (uxpr*taux + uypr*tauy + uzpr*tauz)*invclight;
+    // Get new gamma
+    const amrex::Real sigma = gprsq-tausq;
+    const amrex::Real gisq = 2./(sigma + std::sqrt(sigma*sigma + 4.*(tausq + ust*ust)) );
+    // Get t, s
+    const amrex::Real bg = bconst*std::sqrt(gisq);
+    const amrex::Real tx = bg*Bx;
+    const amrex::Real ty = bg*By;
+    const amrex::Real tz = bg*Bz;
+    const amrex::Real s = 1./(1.+tausq*gisq);
+    // Get t.u'
+    const amrex::Real tu = tx*uxpr + ty*uypr + tz*uzpr;
+    // Get new U
+    ux = s*(uxpr+tx*tu+uypr*tz-uzpr*ty);
+    uy = s*(uypr+ty*tu+uzpr*tx-uxpr*tz);
+    uz = s*(uzpr+tz*tu+uxpr*ty-uypr*tx);
+    gaminv = 1./std::sqrt(1. + (ux*ux + uy*uy + uz*uz)*invclightsq);
+}
+
+#endif // WARPX_PARTICLES_PUSHER_UPDATEMOMENTUM_VAY_H_
diff --git a/Source/Particles/Pusher/UpdatePosition.H b/Source/Particles/Pusher/UpdatePosition.H
index 0a4f579f4..a9df63a30 100644
--- a/Source/Particles/Pusher/UpdatePosition.H
+++ b/Source/Particles/Pusher/UpdatePosition.H
@@ -20,7 +20,7 @@ void UpdatePosition(
     const amrex::Real inv_gamma = 1./std::sqrt(1. + (ux*ux + uy*uy + uz*uz)*inv_c2);
     // Update positions over one time step
     x += ux * inv_gamma * dt;
-#if (AMREX_SPACEDIM == 3) || (defined WARPX_RZ) // RZ pushes particles in 3D
+#if (AMREX_SPACEDIM == 3) || (defined WARPX_DIM_RZ) // RZ pushes particles in 3D
     y += uy * inv_gamma * dt;
 #endif
     z += uz * inv_gamma * dt;
diff --git a/Source/Particles/RigidInjectedParticleContainer.H b/Source/Particles/RigidInjectedParticleContainer.H
index 0b27a2f2f..b920ece0a 100644
--- a/Source/Particles/RigidInjectedParticleContainer.H
+++ b/Source/Particles/RigidInjectedParticleContainer.H
@@ -43,7 +43,7 @@ public:
                          amrex::Real dt) override;
 
     virtual void PushPX(WarpXParIter& pti,
-	                amrex::Cuda::ManagedDeviceVector<amrex::Real>& xp,
+                        amrex::Cuda::ManagedDeviceVector<amrex::Real>& xp,
                         amrex::Cuda::ManagedDeviceVector<amrex::Real>& yp,
                         amrex::Cuda::ManagedDeviceVector<amrex::Real>& zp,
                         amrex::Cuda::ManagedDeviceVector<amrex::Real>& giv,
@@ -77,7 +77,6 @@ private:
     // Temporary quantites
     amrex::Real zinject_plane_lev;
     amrex::Real zinject_plane_lev_previous;
-    amrex::Vector<int> done_injecting_temp;
     bool done_injecting_lev;
 
 };
diff --git a/Source/Particles/RigidInjectedParticleContainer.cpp b/Source/Particles/RigidInjectedParticleContainer.cpp
index 9bd4cb4fc..36cb9d224 100644
--- a/Source/Particles/RigidInjectedParticleContainer.cpp
+++ b/Source/Particles/RigidInjectedParticleContainer.cpp
@@ -10,6 +10,9 @@
 #include <WarpX_f.H>
 #include <WarpX.H>
 #include <WarpXConst.H>
+#include <WarpXAlgorithmSelection.H>
+#include <UpdateMomentumBoris.H>
+#include <UpdateMomentumVay.H>
 
 using namespace amrex;
 
@@ -204,48 +207,58 @@ RigidInjectedParticleContainer::BoostandRemapParticles()
 
 void
 RigidInjectedParticleContainer::PushPX(WarpXParIter& pti,
-	                               Cuda::ManagedDeviceVector<Real>& xp,
+                                       Cuda::ManagedDeviceVector<Real>& xp,
                                        Cuda::ManagedDeviceVector<Real>& yp,
                                        Cuda::ManagedDeviceVector<Real>& zp,
                                        Cuda::ManagedDeviceVector<Real>& giv,
                                        Real dt)
 {
 
-    if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)
-    {
-        copy_attribs(pti, xp.dataPtr(), yp.dataPtr(), zp.dataPtr());
-    }
-    
-    // This wraps the call to warpx_particle_pusher so that inheritors can modify the call.
+    // This wraps the momentum and position advance so that inheritors can modify the call.
     auto& attribs = pti.GetAttribs();
     auto& uxp = attribs[PIdx::ux];
     auto& uyp = attribs[PIdx::uy];
     auto& uzp = attribs[PIdx::uz];
-    auto& Exp = attribs[PIdx::Ex];
-    auto& Eyp = attribs[PIdx::Ey];
-    auto& Ezp = attribs[PIdx::Ez];
-    auto& Bxp = attribs[PIdx::Bx];
-    auto& Byp = attribs[PIdx::By];
-    auto& Bzp = attribs[PIdx::Bz];
-    const long np  = pti.numParticles();
 
     // Save the position and momenta, making copies
     Cuda::ManagedDeviceVector<Real> xp_save, yp_save, zp_save;
     RealVector uxp_save, uyp_save, uzp_save;
 
+    Real* const AMREX_RESTRICT x = xp.dataPtr();
+    Real* const AMREX_RESTRICT y = yp.dataPtr();
+    Real* const AMREX_RESTRICT z = zp.dataPtr();
+    Real* const AMREX_RESTRICT gi = giv.dataPtr();
+    Real* const AMREX_RESTRICT ux = uxp.dataPtr();
+    Real* const AMREX_RESTRICT uy = uyp.dataPtr();
+    Real* const AMREX_RESTRICT uz = uzp.dataPtr();
+    Real* const AMREX_RESTRICT Exp = attribs[PIdx::Ex].dataPtr();
+    Real* const AMREX_RESTRICT Eyp = attribs[PIdx::Ey].dataPtr();
+    Real* const AMREX_RESTRICT Ezp = attribs[PIdx::Ez].dataPtr();
+    Real* const AMREX_RESTRICT Bxp = attribs[PIdx::Bx].dataPtr();
+    Real* const AMREX_RESTRICT Byp = attribs[PIdx::By].dataPtr();
+    Real* const AMREX_RESTRICT Bzp = attribs[PIdx::Bz].dataPtr();
+
     if (!done_injecting_lev) {
-        xp_save = xp;
-        yp_save = yp;
-        zp_save = zp;
-        uxp_save = uxp;
-        uyp_save = uyp;
-        uzp_save = uzp;
+        if (!(WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)) {
+            // If the old values are not already saved, create copies here.
+            xp_save = xp;
+            yp_save = yp;
+            zp_save = zp;
+            uxp_save = uxp;
+            uyp_save = uyp;
+            uzp_save = uzp;
+        }
+
         // Scale the fields of particles about to cross the injection plane.
         // This only approximates what should be happening. The particles
         // should by advanced a fraction of a time step instead.
         // Scaling the fields is much easier and may be good enough.
-        for (int i=0 ; i < zp.size() ; i++) {
-            const Real dtscale = dt - (zinject_plane_lev_previous - zp[i])/(vzbeam_ave_boosted + WarpX::beta_boost*PhysConst::c);
+        const Real v_boost = WarpX::beta_boost*PhysConst::c;
+        const Real z_plane_previous = zinject_plane_lev_previous;
+        const Real vz_ave_boosted = vzbeam_ave_boosted;
+        amrex::ParallelFor( pti.numParticles(),
+            [=] AMREX_GPU_DEVICE (long i) {
+            const Real dtscale = dt - (z_plane_previous - z[i])/(vz_ave_boosted + v_boost);
             if (0. < dtscale && dtscale < dt) {
                 Exp[i] *= dtscale;
                 Eyp[i] *= dtscale;
@@ -255,46 +268,60 @@ RigidInjectedParticleContainer::PushPX(WarpXParIter& pti,
                 Bzp[i] *= dtscale;
             }
         }
+        );
     }
 
-    warpx_particle_pusher(&np,
-                          xp.dataPtr(),
-                          yp.dataPtr(),
-                          zp.dataPtr(),
-                          uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
-                          giv.dataPtr(),
-                          Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(),
-                          Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(),
-                          &this->charge, &this->mass, &dt,
-                          &WarpX::particle_pusher_algo);
+    PhysicalParticleContainer::PushPX(pti, xp, yp, zp, giv, dt);
 
     if (!done_injecting_lev) {
-#ifdef _OPENMP
-        const int tid = omp_get_thread_num();
-#else
-        const int tid = 0;
-#endif
+
+        Real* AMREX_RESTRICT x_save;
+        Real* AMREX_RESTRICT y_save;
+        Real* AMREX_RESTRICT z_save;
+        Real* AMREX_RESTRICT ux_save;
+        Real* AMREX_RESTRICT uy_save;
+        Real* AMREX_RESTRICT uz_save;
+        if (!(WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)) {
+            x_save = xp_save.dataPtr();
+            y_save = yp_save.dataPtr();
+            z_save = zp_save.dataPtr();
+            ux_save = uxp_save.dataPtr();
+            uy_save = uyp_save.dataPtr();
+            uz_save = uzp_save.dataPtr();
+        } else {
+            x_save = pti.GetAttribs(particle_comps["xold"]).dataPtr();
+            y_save = pti.GetAttribs(particle_comps["yold"]).dataPtr();
+            z_save = pti.GetAttribs(particle_comps["zold"]).dataPtr();
+            ux_save = pti.GetAttribs(particle_comps["uxold"]).dataPtr();
+            uy_save = pti.GetAttribs(particle_comps["uyold"]).dataPtr();
+            uz_save = pti.GetAttribs(particle_comps["uzold"]).dataPtr();
+        }
+
         // Undo the push for particles not injected yet.
         // The zp are advanced a fixed amount.
-        for (int i=0 ; i < zp.size() ; i++) {
-            if (zp[i] <= zinject_plane_lev) {
-                uxp[i] = uxp_save[i];
-                uyp[i] = uyp_save[i];
-                uzp[i] = uzp_save[i];
-                giv[i] = 1./std::sqrt(1. + (uxp[i]*uxp[i] + uyp[i]*uyp[i] + uzp[i]*uzp[i])/(PhysConst::c*PhysConst::c));
-                xp[i] = xp_save[i];
-                yp[i] = yp_save[i];
-                if (rigid_advance) {
-                    zp[i] = zp_save[i] + dt*vzbeam_ave_boosted;
+        const Real z_plane_lev = zinject_plane_lev;
+        const Real vz_ave_boosted = vzbeam_ave_boosted;
+        const bool rigid = rigid_advance;
+        const Real inv_csq = 1./(PhysConst::c*PhysConst::c);
+        amrex::ParallelFor( pti.numParticles(),
+            [=] AMREX_GPU_DEVICE (long i) {
+            if (z[i] <= z_plane_lev) {
+                ux[i] = ux_save[i];
+                uy[i] = uy_save[i];
+                uz[i] = uz_save[i];
+                gi[i] = 1./std::sqrt(1. + (ux[i]*ux[i] + uy[i]*uy[i] + uz[i]*uz[i])*inv_csq);
+                x[i] = x_save[i];
+                y[i] = y_save[i];
+                if (rigid) {
+                    z[i] = z_save[i] + dt*vz_ave_boosted;
                 }
                 else {
-                    zp[i] = zp_save[i] + dt*uzp[i]*giv[i];
+                    z[i] = z_save[i] + dt*uz[i]*gi[i];
                 }
-                done_injecting_temp[tid] = 0;
             }
         }
+        );
     }
-
 }
 
 void
@@ -314,28 +341,26 @@ RigidInjectedParticleContainer::Evolve (int lev,
     zinject_plane_levels[lev] -= dt*WarpX::beta_boost*PhysConst::c;
     zinject_plane_lev = zinject_plane_levels[lev];
 
-    // Setup check of whether more particles need to be injected
-#ifdef _OPENMP
-    const int nthreads = omp_get_max_threads();
-#else
-    const int nthreads = 1;
-#endif
-    done_injecting_temp.assign(nthreads, 1); // We do not use bool because vector<bool> is special.
+    // Set the done injecting flag whan the inject plane moves out of the
+    // simulation domain.
+    // It is much easier to do this check, rather than checking if all of the
+    // particles have crossed the inject plane.
+    const Real* plo = Geom(lev).ProbLo();
+    const Real* phi = Geom(lev).ProbHi();
+    const int zdir = AMREX_SPACEDIM-1;
+    done_injecting[lev] = ((zinject_plane_levels[lev] < plo[zdir] && WarpX::moving_window_v + WarpX::beta_boost*PhysConst::c >= 0.) ||
+                           (zinject_plane_levels[lev] > phi[zdir] && WarpX::moving_window_v + WarpX::beta_boost*PhysConst::c <= 0.));
     done_injecting_lev = done_injecting[lev];
 
     PhysicalParticleContainer::Evolve (lev,
-				       Ex, Ey, Ez,
-				       Bx, By, Bz,
-				       jx, jy, jz,
+                                       Ex, Ey, Ez,
+                                       Bx, By, Bz,
+                                       jx, jy, jz,
                                        cjx, cjy, cjz,
                                        rho, crho,
                                        cEx, cEy, cEz,
                                        cBx, cBy, cBz,
                                        t, dt);
-
-    // Check if all done_injecting_temp are still true.
-    done_injecting[lev] = std::all_of(done_injecting_temp.begin(), done_injecting_temp.end(),
-                                      [](int i) -> bool { return i; });
 }
 
 void
@@ -343,6 +368,8 @@ RigidInjectedParticleContainer::PushP (int lev, Real dt,
                                        const MultiFab& Ex, const MultiFab& Ey, const MultiFab& Ez,
                                        const MultiFab& Bx, const MultiFab& By, const MultiFab& Bz)
 {
+    BL_PROFILE("RigidInjectedParticleContainer::PushP");
+
     if (do_not_push) return;
 
     const std::array<Real,3>& dx = WarpX::CellSize(lev);
@@ -351,8 +378,11 @@ RigidInjectedParticleContainer::PushP (int lev, Real dt,
 #pragma omp parallel
 #endif
     {
-        Cuda::ManagedDeviceVector<Real> xp, yp, zp, giv;
-
+#ifdef _OPENMP
+        int thread_num = omp_get_thread_num();
+#else
+        int thread_num = 0;
+#endif
         for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti)
         {
             const Box& box = pti.validbox();
@@ -386,65 +416,74 @@ RigidInjectedParticleContainer::PushP (int lev, Real dt,
             Byp.assign(np,WarpX::B_external[1]);
             Bzp.assign(np,WarpX::B_external[2]);
 
-            giv.resize(np);
+            m_giv[thread_num].resize(np);
 
             //
             // copy data from particle container to temp arrays
             //
-            pti.GetPosition(xp, yp, zp);
+            pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
 
-            const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev);
-            const int* ixyzmin_grid = box.loVect();
-
-            const int ll4symtry          = false;
-            const int l_lower_order_in_v = true;
-            long lvect_fieldgathe = 64;
-            warpx_geteb_energy_conserving(
-                &np,
-                xp.dataPtr(),
-                yp.dataPtr(),
-                zp.dataPtr(),
-                Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(),
-                Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(),
-                ixyzmin_grid,
-                &xyzmin_grid[0], &xyzmin_grid[1], &xyzmin_grid[2],
-                &dx[0], &dx[1], &dx[2],
-                &WarpX::nox, &WarpX::noy, &WarpX::noz,
-                BL_TO_FORTRAN_ANYD(exfab),
-                BL_TO_FORTRAN_ANYD(eyfab),
-                BL_TO_FORTRAN_ANYD(ezfab),
-                BL_TO_FORTRAN_ANYD(bxfab),
-                BL_TO_FORTRAN_ANYD(byfab),
-                BL_TO_FORTRAN_ANYD(bzfab),
-                &ll4symtry, &l_lower_order_in_v, &WarpX::do_nodal,
-                &lvect_fieldgathe, &WarpX::field_gathering_algo);
+            int e_is_nodal = Ex.is_nodal() and Ey.is_nodal() and Ez.is_nodal();
+            FieldGather(pti, Exp, Eyp, Ezp, Bxp, Byp, Bzp,
+                        &exfab, &eyfab, &ezfab, &bxfab, &byfab, &bzfab,
+                        Ex.nGrow(), e_is_nodal, 0, np, thread_num, lev, lev);
 
             // Save the position and momenta, making copies
             auto uxp_save = uxp;
             auto uyp_save = uyp;
             auto uzp_save = uzp;
 
-            warpx_particle_pusher_momenta(&np,
-                                          xp.dataPtr(),
-                                          yp.dataPtr(),
-                                          zp.dataPtr(),
-                                          uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
-                                          giv.dataPtr(),
-                                          Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(),
-                                          Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(),
-                                          &this->charge, &this->mass, &dt,
-                                          &WarpX::particle_pusher_algo);
+            // This wraps the momentum advance so that inheritors can modify the call.
+            // Extract pointers to the different particle quantities
+            const Real* const AMREX_RESTRICT zp = m_zp[thread_num].dataPtr();
+            Real* const AMREX_RESTRICT gi = m_giv[thread_num].dataPtr();
+            Real* const AMREX_RESTRICT uxpp = uxp.dataPtr();
+            Real* const AMREX_RESTRICT uypp = uyp.dataPtr();
+            Real* const AMREX_RESTRICT uzpp = uzp.dataPtr();
+            const Real* const AMREX_RESTRICT Expp = Exp.dataPtr();
+            const Real* const AMREX_RESTRICT Eypp = Eyp.dataPtr();
+            const Real* const AMREX_RESTRICT Ezpp = Ezp.dataPtr();
+            const Real* const AMREX_RESTRICT Bxpp = Bxp.dataPtr();
+            const Real* const AMREX_RESTRICT Bypp = Byp.dataPtr();
+            const Real* const AMREX_RESTRICT Bzpp = Bzp.dataPtr();
+
+            // Loop over the particles and update their momentum
+            const Real q = this->charge;
+            const Real m = this->mass;
+            if (WarpX::particle_pusher_algo == ParticlePusherAlgo::Boris){
+                amrex::ParallelFor( pti.numParticles(),
+                    [=] AMREX_GPU_DEVICE (long i) {
+                        UpdateMomentumBoris( uxpp[i], uypp[i], uzpp[i], gi[i],
+                              Expp[i], Eypp[i], Ezpp[i], Bxpp[i], Bypp[i], Bzpp[i], q, m, dt);
+                    }
+                );
+            } else if (WarpX::particle_pusher_algo == ParticlePusherAlgo::Vay) {
+                amrex::ParallelFor( pti.numParticles(),
+                    [=] AMREX_GPU_DEVICE (long i) {
+                        UpdateMomentumVay( uxpp[i], uypp[i], uzpp[i], gi[i],
+                              Expp[i], Eypp[i], Ezpp[i], Bxpp[i], Bypp[i], Bzpp[i], q, m, dt);
+                    }
+                );
+            } else {
+              amrex::Abort("Unknown particle pusher");
+            };
 
             // Undo the push for particles not injected yet.
             // It is assumed that PushP will only be called on the first and last steps
             // and that no particles will cross zinject_plane.
-            for (int i=0 ; i < zp.size() ; i++) {
-                if (zp[i] <= zinject_plane_levels[lev]) {
-                    uxp[i] = uxp_save[i];
-                    uyp[i] = uyp_save[i];
-                    uzp[i] = uzp_save[i];
+            const Real* const AMREX_RESTRICT ux_save = uxp_save.dataPtr();
+            const Real* const AMREX_RESTRICT uy_save = uyp_save.dataPtr();
+            const Real* const AMREX_RESTRICT uz_save = uzp_save.dataPtr();
+            const Real zz = zinject_plane_levels[lev];
+            amrex::ParallelFor( pti.numParticles(),
+                [=] AMREX_GPU_DEVICE (long i) {
+                if (zp[i] <= zz) {
+                    uxpp[i] = ux_save[i];
+                    uypp[i] = uy_save[i];
+                    uzpp[i] = uz_save[i];
                 }
             }
+            );
 
         }
     }
diff --git a/Source/Particles/ShapeFactors.H b/Source/Particles/ShapeFactors.H
new file mode 100644
index 000000000..9d185714a
--- /dev/null
+++ b/Source/Particles/ShapeFactors.H
@@ -0,0 +1,117 @@
+#ifndef SHAPEFACTORS_H_
+#define SHAPEFACTORS_H_
+
+// Compute shape factor and return index of leftmost cell where 
+// particle writes.
+// Specialized templates are defined below for orders 0 to 3.
+template <int depos_order>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shape_factor(amrex::Real* const sx, amrex::Real xint)
+{
+    return 0;
+};
+
+// Compute shape factor for order 0.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shape_factor <0> (amrex::Real* const sx, amrex::Real xmid){
+    const int j = (int) (xmid+0.5);
+    sx[0] = 1.0;
+    return j;
+}
+
+// Compute shape factor for order 1.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shape_factor <1> (amrex::Real* const sx, amrex::Real xmid){
+    const int j = (int) xmid;
+    const amrex::Real xint = xmid-j;
+    sx[0] = 1.0 - xint;
+    sx[1] = xint;
+    return j;
+}
+
+// Compute shape factor for order 2.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shape_factor <2> (amrex::Real* const sx, amrex::Real xmid){
+    const int j = (int) (xmid+0.5);
+    const amrex::Real xint = xmid-j;
+    sx[0] = 0.5*(0.5-xint)*(0.5-xint);
+    sx[1] = 0.75-xint*xint;
+    sx[2] = 0.5*(0.5+xint)*(0.5+xint);
+    // index of the leftmost cell where particle deposits
+    return j-1;
+}
+
+// Compute shape factor for order 3.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shape_factor <3> (amrex::Real* const sx, amrex::Real xmid){
+    const int j = (int) xmid;
+    const amrex::Real xint = xmid-j;
+    sx[0] = 1.0/6.0*(1.0-xint)*(1.0-xint)*(1.0-xint);
+    sx[1] = 2.0/3.0-xint*xint*(1-xint/2.0);
+    sx[2] = 2.0/3.0-(1-xint)*(1-xint)*(1.0-0.5*(1-xint));
+    sx[3] = 1.0/6.0*xint*xint*xint;
+    // index of the leftmost cell where particle deposits
+    return j-1;
+}
+
+// Compute shifted shape factor and return index of leftmost cell where
+// particle writes, for Esirkepov algorithm.
+// Specialized templates are defined below for orders 1, 2 and 3.
+template <int depos_order>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor (amrex::Real* const sx,
+                                  const amrex::Real x_old,
+                                  const int i_new);
+
+// Compute shape factor for order 1.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <1> (amrex::Real* const sx,
+                                      const amrex::Real x_old,
+                                      const int i_new){
+    const int i = (int) x_old;
+    const int i_shift = i - i_new;
+    const amrex::Real xint = x_old - i;
+    sx[1+i_shift] = 1.0 - xint;
+    sx[2+i_shift] = xint;
+    return i;
+}
+
+// Compute shape factor for order 2.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <2> (amrex::Real* const sx,
+                                      const amrex::Real x_old,
+                                      const int i_new){
+    const int i = (int) (x_old+0.5);
+    const int i_shift = i - (i_new + 1);
+    const amrex::Real xint = x_old - i;
+    sx[1+i_shift] = 0.5*(0.5-xint)*(0.5-xint);
+    sx[2+i_shift] = 0.75-xint*xint;
+    sx[3+i_shift] = 0.5*(0.5+xint)*(0.5+xint);
+    // index of the leftmost cell where particle deposits
+    return i-1;
+}
+
+// Compute shape factor for order 3.
+template <>
+AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+int compute_shifted_shape_factor <3> (amrex::Real* const sx,
+                                      const amrex::Real x_old,
+                                      const int i_new){
+    const int i = (int) x_old;
+    const int i_shift = i - (i_new + 1);
+    const amrex::Real xint = x_old - i;
+    sx[1+i_shift] = 1.0/6.0*(1.0-xint)*(1.0-xint)*(1.0-xint);
+    sx[2+i_shift] = 2.0/3.0-xint*xint*(1-xint/2.0);
+    sx[3+i_shift] = 2.0/3.0-(1-xint)*(1-xint)*(1.0-0.5*(1-xint));
+    sx[4+i_shift] = 1.0/6.0*xint*xint*xint;
+    // index of the leftmost cell where particle deposits
+    return i-1;
+}
+
+#endif // SHAPEFACTORS_H_
diff --git a/Source/Particles/WarpXParticleContainer.H b/Source/Particles/WarpXParticleContainer.H
index 662b2e1b8..ac5b47ada 100644
--- a/Source/Particles/WarpXParticleContainer.H
+++ b/Source/Particles/WarpXParticleContainer.H
@@ -13,7 +13,7 @@ struct PIdx
     enum { // Particle Attributes stored in amrex::ParticleContainer's struct of array
 	w = 0,  // weight
 	ux, uy, uz, Ex, Ey, Ez, Bx, By, Bz,
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
         theta, // RZ needs all three position components
 #endif
 	nattribs
@@ -104,8 +104,9 @@ public:
                                 const amrex::Vector<std::unique_ptr<amrex::FabArray<amrex::BaseFab<int> > > >& masks) {}
 
     virtual void FieldGather (int lev,
-                              const amrex::MultiFab& Ex, const amrex::MultiFab& Ey, const amrex::MultiFab& Ez,
-                              const amrex::MultiFab& Bx, const amrex::MultiFab& By, const amrex::MultiFab& Bz) {}
+                              const amrex::MultiFab& Ex, const amrex::MultiFab& Ey,
+                              const amrex::MultiFab& Ez, const amrex::MultiFab& Bx,
+                              const amrex::MultiFab& By, const amrex::MultiFab& Bz) {}
 
 #ifdef WARPX_DO_ELECTROSTATIC    
     virtual void EvolveES (const amrex::Vector<std::array<std::unique_ptr<amrex::MultiFab>, 3> >& E,
@@ -166,13 +167,13 @@ public:
 
     virtual void DepositCharge(WarpXParIter& pti,
                                RealVector& wp,
-                               amrex::MultiFab* rhomf,
-                               amrex::MultiFab* crhomf,
+                               amrex::MultiFab* rho,
                                int icomp,
-                               const long np_current,
-                               const long np,
+                               const long offset,
+                               const long np_to_depose,
                                int thread_num,
-                               int lev );
+                               int lev,
+                               int depos_lev);
 
     virtual void DepositCurrent(WarpXParIter& pti,
                                 RealVector& wp,
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index a20f0035e..befa5cfed 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -12,6 +12,7 @@
 #include <GetAndSetPosition.H>
 #include <UpdatePosition.H>
 #include <CurrentDeposition.H>
+#include <ChargeDeposition.H>
 
 using namespace amrex;
 
@@ -27,7 +28,7 @@ void
 WarpXParIter::GetPosition (Cuda::ManagedDeviceVector<Real>& x, Cuda::ManagedDeviceVector<Real>& y, Cuda::ManagedDeviceVector<Real>& z) const
 {
     amrex::ParIter<0,0,PIdx::nattribs>::GetPosition(x, z);
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     const auto& attribs = GetAttribs();
     const auto& theta = attribs[PIdx::theta];
     y.resize(x.size());
@@ -44,10 +45,10 @@ WarpXParIter::GetPosition (Cuda::ManagedDeviceVector<Real>& x, Cuda::ManagedDevi
 void
 WarpXParIter::SetPosition (const Cuda::ManagedDeviceVector<Real>& x, const Cuda::ManagedDeviceVector<Real>& y, const Cuda::ManagedDeviceVector<Real>& z)
 {
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     auto& attribs = GetAttribs();
     auto& theta = attribs[PIdx::theta];
-    Cuda::DeviceVector<Real> r(x.size());
+    Cuda::ManagedDeviceVector<Real> r(x.size());
     for (unsigned int i=0 ; i < x.size() ; i++) {
         theta[i] = std::atan2(y[i], x[i]);
         r[i] = std::sqrt(x[i]*x[i] + y[i]*y[i]);
@@ -80,7 +81,7 @@ WarpXParticleContainer::WarpXParticleContainer (AmrCore* amr_core, int ispecies)
     particle_comps["Bx"] = PIdx::Bx;
     particle_comps["By"] = PIdx::By;
     particle_comps["Bz"] = PIdx::Bz;
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     particle_comps["theta"] = PIdx::theta;
 #endif
 
@@ -163,7 +164,7 @@ WarpXParticleContainer::AddOneParticle (ParticleTileType& particle_tile,
     p.pos(1) = y;
     p.pos(2) = z;
 #elif (AMREX_SPACEDIM == 2)
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     attribs[PIdx::theta] = std::atan2(y, x);
     x = std::sqrt(x*x + y*y);
 #endif
@@ -209,7 +210,7 @@ WarpXParticleContainer::AddNParticles (int lev,
 
     std::size_t np = iend-ibegin;
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     Vector<Real> theta(np);
 #endif
 
@@ -228,7 +229,7 @@ WarpXParticleContainer::AddNParticles (int lev,
         p.pos(1) = y[i];
         p.pos(2) = z[i];
 #elif (AMREX_SPACEDIM == 2)
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
         theta[i-ibegin] = std::atan2(y[i], x[i]);
         p.pos(0) = std::sqrt(x[i]*x[i] + y[i]*y[i]);
 #else
@@ -265,7 +266,7 @@ WarpXParticleContainer::AddNParticles (int lev,
 
         for (int comp = PIdx::uz+1; comp < PIdx::nattribs; ++comp)
         {
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
             if (comp == PIdx::theta) {
                 particle_tile.push_back_real(comp, theta.front(), theta.back());
             }
@@ -394,14 +395,6 @@ WarpXParticleContainer::DepositCurrentFortran(WarpXParIter& pti,
         &WarpX::nox,&WarpX::noy,&WarpX::noz, &j_is_nodal,
         &lvect,&WarpX::current_deposition_algo);
 
-#ifdef WARPX_RZ
-    // Rescale current in r-z mode
-    warpx_current_deposition_rz_volume_scaling(
-        jx_ptr, &ngJ, jxntot.getVect(),
-        jy_ptr, &ngJ, jyntot.getVect(),
-        jz_ptr, &ngJ, jzntot.getVect(),
-        &xyzmin[0], &dx[0]);
-#endif
     BL_PROFILE_VAR_STOP(blp_pxr_cd);
 
 #ifndef AMREX_USE_GPU
@@ -503,7 +496,8 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
     Real* AMREX_RESTRICT yp = m_yp[thread_num].dataPtr() + offset;
 
     // Lower corner of tile box physical domain
-    const std::array<Real, 3>& xyzmin = WarpX::LowerCorner(tilebox, depos_lev);;
+    // Note that this includes guard cells since it is after tilebox.ngrow
+    const std::array<Real, 3>& xyzmin = WarpX::LowerCorner(tilebox, depos_lev);
     // xyzmin is built on pti.tilebox(), so it does 
     // not include staggering, so the stagger_shift has to be done by hand.
     // Alternatively, we could define xyzminx from tbx (and the same for 3 
@@ -513,36 +507,36 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
 
     if (WarpX::current_deposition_algo == CurrentDepositionAlgo::Esirkepov) {
         if        (WarpX::nox == 1){
-            doEsirkepovDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+            doEsirkepovDepositionShapeN<1>(xp, yp, zp, wp.dataPtr() + offset, uxp.dataPtr() + offset, 
+                                           uyp.dataPtr() + offset, uzp.dataPtr() + offset, jx_arr, jy_arr, 
                                            jz_arr, np_to_depose, dt, dx,
                                            xyzmin, lo, q);
         } else if (WarpX::nox == 2){
-            doEsirkepovDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+            doEsirkepovDepositionShapeN<2>(xp, yp, zp, wp.dataPtr() + offset, uxp.dataPtr() + offset, 
+                                           uyp.dataPtr() + offset, uzp.dataPtr() + offset, jx_arr, jy_arr, 
                                            jz_arr, np_to_depose, dt, dx,
                                            xyzmin, lo, q);
         } else if (WarpX::nox == 3){
-            doEsirkepovDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                                           uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
+            doEsirkepovDepositionShapeN<3>(xp, yp, zp, wp.dataPtr() + offset, uxp.dataPtr() + offset, 
+                                           uyp.dataPtr() + offset, uzp.dataPtr() + offset, jx_arr, jy_arr, 
                                            jz_arr, np_to_depose, dt, dx,
                                            xyzmin, lo, q);
         }
     } else {
         if        (WarpX::nox == 1){
-            doDepositionShapeN<1>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                                  jz_arr, offset, np_to_depose, dt, dx,
+            doDepositionShapeN<1>(xp, yp, zp, wp.dataPtr() + offset, uxp.dataPtr() + offset, 
+                                  uyp.dataPtr() + offset, uzp.dataPtr() + offset, jx_arr, jy_arr, 
+                                  jz_arr, np_to_depose, dt, dx,
                                   xyzmin, lo, stagger_shift, q);
         } else if (WarpX::nox == 2){
-            doDepositionShapeN<2>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                                  jz_arr, offset, np_to_depose, dt, dx,
+            doDepositionShapeN<2>(xp, yp, zp, wp.dataPtr() + offset, uxp.dataPtr() + offset, 
+                                  uyp.dataPtr() + offset, uzp.dataPtr() + offset, jx_arr, jy_arr, 
+                                  jz_arr, np_to_depose, dt, dx,
                                   xyzmin, lo, stagger_shift, q);
         } else if (WarpX::nox == 3){
-            doDepositionShapeN<3>(xp, yp, zp, wp.dataPtr(), uxp.dataPtr(), 
-                                  uyp.dataPtr(), uzp.dataPtr(), jx_arr, jy_arr, 
-                                  jz_arr, offset, np_to_depose, dt, dx,
+            doDepositionShapeN<3>(xp, yp, zp, wp.dataPtr() + offset, uxp.dataPtr() + offset, 
+                                  uyp.dataPtr() + offset, uzp.dataPtr() + offset, jx_arr, jy_arr, 
+                                  jz_arr, np_to_depose, dt, dx,
                                   xyzmin, lo, stagger_shift, q);
         }
     }
@@ -559,140 +553,87 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
 }
 
 void
-WarpXParticleContainer::DepositCharge ( WarpXParIter& pti, RealVector& wp,
-                                        MultiFab* rhomf, MultiFab* crhomf, int icomp,
-                                        const long np_current,
-                                        const long np, int thread_num, int lev )
+WarpXParticleContainer::DepositCharge (WarpXParIter& pti, RealVector& wp,
+                                       MultiFab* rho, int icomp,
+                                       const long offset, const long np_to_depose,
+                                       int thread_num, int lev, int depos_lev)
 {
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE((depos_lev==(lev-1)) ||
+                                     (depos_lev==(lev  )),
+                                     "Deposition buffers only work for lev-1");
 
-  BL_PROFILE_VAR_NS("PICSAR::ChargeDeposition", blp_pxr_chd);
-  BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate);
-
-  const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev);
-  const long lvect = 8;
+    // If no particles, do not do anything
+    if (np_to_depose == 0) return;
 
-  long ngRho = rhomf->nGrow();
-  Real* data_ptr;
-  Box tile_box = convert(pti.tilebox(), IntVect::TheUnitVector());
+    const long ngRho = rho->nGrow();
+    const std::array<Real,3>& dx = WarpX::CellSize(std::max(depos_lev,0));
+    const Real q = this->charge;
 
-  const std::array<Real,3>& dx = WarpX::CellSize(lev);
-  const std::array<Real,3>& cdx = WarpX::CellSize(std::max(lev-1,0));
+    BL_PROFILE_VAR_NS("PPC::ChargeDeposition", blp_ppc_chd);
+    BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate);
 
-  // Deposit charge for particles that are not in the current buffers
-  if (np_current > 0)
-  {
-      const std::array<Real, 3>& xyzmin = xyzmin_tile;
+    // Get tile box where charge is deposited.
+    // The tile box is different when depositing in the buffers (depos_lev<lev)
+    // or when depositing inside the level (depos_lev=lev)
+    Box tilebox;
+    if (lev == depos_lev) {
+        tilebox = pti.tilebox();
+    } else {
+        const IntVect& ref_ratio = WarpX::RefRatio(depos_lev);
+        tilebox = amrex::coarsen(pti.tilebox(),ref_ratio);
+    }
+    
+    tilebox.grow(ngRho);
 
 #ifdef AMREX_USE_GPU
-      data_ptr = (*rhomf)[pti].dataPtr(icomp);
-      auto rholen = (*rhomf)[pti].length();
+    // No tiling on GPU: rho_arr points to the full rho array.
+    MultiFab rhoi(*rho, amrex::make_alias, icomp, 1);
+    Array4<Real> const& rho_arr = rhoi.array(pti);
 #else
-      tile_box.grow(ngRho);
-      local_rho[thread_num].resize(tile_box);
+    // Tiling is on: rho_arr points to local_rho[thread_num]
+    const Box tb = amrex::convert(tilebox, IntVect::TheUnitVector());
 
-      data_ptr = local_rho[thread_num].dataPtr();
-      auto rholen = local_rho[thread_num].length();
+    local_rho[thread_num].resize(tb);
 
-      local_rho[thread_num].setVal(0.0);
-#endif
+    // local_rho[thread_num] is set to zero
+    local_rho[thread_num].setVal(0.0);
 
-#if (AMREX_SPACEDIM == 3)
-      const long nx = rholen[0]-1-2*ngRho;
-      const long ny = rholen[1]-1-2*ngRho;
-      const long nz = rholen[2]-1-2*ngRho;
-#else
-      const long nx = rholen[0]-1-2*ngRho;
-      const long ny = 0;
-      const long nz = rholen[1]-1-2*ngRho;
+    Array4<Real> const& rho_arr = local_rho[thread_num].array();
 #endif
-      BL_PROFILE_VAR_START(blp_pxr_chd);
-      warpx_charge_deposition(data_ptr, &np_current,
-                              m_xp[thread_num].dataPtr(),
-                              m_yp[thread_num].dataPtr(),
-                              m_zp[thread_num].dataPtr(),
-                              wp.dataPtr(),
-                              &this->charge,
-                              &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                              &dx[0], &dx[1], &dx[2], &nx, &ny, &nz,
-                              &ngRho, &ngRho, &ngRho,
-                              &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                              &lvect, &WarpX::charge_deposition_algo);
-#ifdef WARPX_RZ
-      warpx_charge_deposition_rz_volume_scaling(
-                               data_ptr, &ngRho, rholen.getVect(),
-                               &xyzmin[0], &dx[0]);
-#endif
-      BL_PROFILE_VAR_STOP(blp_pxr_chd);
-
-#ifndef AMREX_USE_GPU
-      BL_PROFILE_VAR_START(blp_accumulate);
-
-      (*rhomf)[pti].atomicAdd(local_rho[thread_num], tile_box, tile_box, 0, icomp, 1);
-
-      BL_PROFILE_VAR_STOP(blp_accumulate);
-#endif
-  }
-
-  // Deposit charge for particles that are in the current buffers
-  if (np_current < np)
-  {
-      const IntVect& ref_ratio = WarpX::RefRatio(lev-1);
-      const Box& ctilebox = amrex::coarsen(pti.tilebox(), ref_ratio);
-      const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1);
-
-#ifdef AMREX_USE_GPU
-      data_ptr = (*crhomf)[pti].dataPtr(icomp);
-      auto rholen = (*crhomf)[pti].length();
-#else
-      tile_box = amrex::convert(ctilebox, IntVect::TheUnitVector());
-      tile_box.grow(ngRho);
-      local_rho[thread_num].resize(tile_box);
-
-      data_ptr = local_rho[thread_num].dataPtr();
-      auto rholen = local_rho[thread_num].length();
+    // GPU, no tiling: deposit directly in rho
+    // CPU, tiling: deposit into local_rho
 
-      local_rho[thread_num].setVal(0.0);
-#endif
+    Real* AMREX_RESTRICT xp = m_xp[thread_num].dataPtr() + offset;
+    Real* AMREX_RESTRICT zp = m_zp[thread_num].dataPtr() + offset;
+    Real* AMREX_RESTRICT yp = m_yp[thread_num].dataPtr() + offset;
 
-#if (AMREX_SPACEDIM == 3)
-      const long nx = rholen[0]-1-2*ngRho;
-      const long ny = rholen[1]-1-2*ngRho;
-      const long nz = rholen[2]-1-2*ngRho;
-#else
-      const long nx = rholen[0]-1-2*ngRho;
-      const long ny = 0;
-      const long nz = rholen[1]-1-2*ngRho;
-#endif
+    // Lower corner of tile box physical domain
+    // Note that this includes guard cells since it is after tilebox.ngrow
+    const std::array<Real, 3>& xyzmin = WarpX::LowerCorner(tilebox, depos_lev);
+    // Indices of the lower bound
+    const Dim3 lo = lbound(tilebox);
 
-      long ncrse = np - np_current;
-      BL_PROFILE_VAR_START(blp_pxr_chd);
-      warpx_charge_deposition(data_ptr, &ncrse,
-                              m_xp[thread_num].dataPtr() + np_current,
-                              m_yp[thread_num].dataPtr() + np_current,
-                              m_zp[thread_num].dataPtr() + np_current,
-                              wp.dataPtr() + np_current,
-                              &this->charge,
-                              &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
-                              &cdx[0], &cdx[1], &cdx[2], &nx, &ny, &nz,
-                              &ngRho, &ngRho, &ngRho,
-                              &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                              &lvect, &WarpX::charge_deposition_algo);
-#ifdef WARPX_RZ
-      warpx_charge_deposition_rz_volume_scaling(
-                               data_ptr, &ngRho, rholen.getVect(),
-                               &cxyzmin_tile[0], &cdx[0]);
-#endif
-      BL_PROFILE_VAR_STOP(blp_pxr_chd);
+    BL_PROFILE_VAR_START(blp_ppc_chd);
+    if        (WarpX::nox == 1){
+        doChargeDepositionShapeN<1>(xp, yp, zp, wp.dataPtr()+offset, rho_arr,
+                                    np_to_depose, dx, xyzmin, lo, q);
+    } else if (WarpX::nox == 2){
+        doChargeDepositionShapeN<2>(xp, yp, zp, wp.dataPtr()+offset, rho_arr,
+                                    np_to_depose, dx, xyzmin, lo, q);
+    } else if (WarpX::nox == 3){
+        doChargeDepositionShapeN<3>(xp, yp, zp, wp.dataPtr()+offset, rho_arr,
+                                    np_to_depose, dx, xyzmin, lo, q);
+    }
+    BL_PROFILE_VAR_STOP(blp_ppc_chd);
 
 #ifndef AMREX_USE_GPU
-      BL_PROFILE_VAR_START(blp_accumulate);
+    BL_PROFILE_VAR_START(blp_accumulate);
 
-      (*crhomf)[pti].atomicAdd(local_rho[thread_num], tile_box, tile_box, 0, icomp, 1);
+    (*rho)[pti].atomicAdd(local_rho[thread_num], tb, tb, 0, icomp, 1);
 
-      BL_PROFILE_VAR_STOP(blp_accumulate);
+    BL_PROFILE_VAR_STOP(blp_accumulate);
 #endif
-    }
-};
+}
 
 void
 WarpXParticleContainer::DepositCharge (Vector<std::unique_ptr<MultiFab> >& rho, bool local)
@@ -769,8 +710,6 @@ WarpXParticleContainer::GetChargeDensity (int lev, bool local)
     BoxArray nba = ba;
     nba.surroundingNodes();
 
-    const std::array<Real,3>& dx = WarpX::CellSize(lev);
-
     const int ng = WarpX::nox;
 
     auto rho = std::unique_ptr<MultiFab>(new MultiFab(nba,dm,1,ng));
@@ -780,75 +719,28 @@ WarpXParticleContainer::GetChargeDensity (int lev, bool local)
 #pragma omp parallel
     {
 #endif
-        Cuda::ManagedDeviceVector<Real> xp, yp, zp;
 #ifdef _OPENMP
-        FArrayBox rho_loc;
+        int thread_num = omp_get_thread_num();
+#else
+        int thread_num = 0;
 #endif
 
         for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti)
         {
+            const long np = pti.numParticles();
             auto& wp = pti.GetAttribs(PIdx::w);
 
-            const long np  = pti.numParticles();
-
-            pti.GetPosition(xp, yp, zp);
+            pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]);
 
-            // Data on the grid
-            Real* data_ptr;
-            FArrayBox& rhofab = (*rho)[pti];
+            DepositCharge(pti, wp, rho.get(), 0, 0, np, thread_num, lev, lev);
+        }
 #ifdef _OPENMP
-            const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev);
-            Box tile_box = convert(pti.tilebox(), IntVect::TheUnitVector());
-            const std::array<Real, 3>& xyzmin = xyzmin_tile;
-            tile_box.grow(ng);
-            rho_loc.resize(tile_box);
-            rho_loc = 0.0;
-            data_ptr = rho_loc.dataPtr();
-            auto rholen = rho_loc.length();
-#else
-            const Box& box = pti.validbox();
-            const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev);
-            const std::array<Real, 3>& xyzmin = xyzmin_grid;
-            data_ptr = rhofab.dataPtr();
-            auto rholen = rhofab.length();
-#endif
-
-#if (AMREX_SPACEDIM == 3)
-            const long nx = rholen[0]-1-2*ng;
-            const long ny = rholen[1]-1-2*ng;
-            const long nz = rholen[2]-1-2*ng;
-#else
-            const long nx = rholen[0]-1-2*ng;
-            const long ny = 0;
-            const long nz = rholen[1]-1-2*ng;
-#endif
-
-            long nxg = ng;
-            long nyg = ng;
-            long nzg = ng;
-            long lvect = 8;
-
-            warpx_charge_deposition(data_ptr,
-                                    &np,
-                                    xp.dataPtr(),
-                                    yp.dataPtr(),
-                                    zp.dataPtr(), wp.dataPtr(),
-                                    &this->charge, &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                                    &dx[0], &dx[1], &dx[2], &nx, &ny, &nz,
-                                    &nxg, &nyg, &nzg, &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                    &lvect, &WarpX::charge_deposition_algo);
-#ifdef WARPX_RZ
-            long ngRho = WarpX::nox;
-            warpx_charge_deposition_rz_volume_scaling(
-                                     data_ptr, &ngRho, rholen.getVect(),
-                                     &xyzmin[0], &dx[0]);
+    }
 #endif
 
-#ifdef _OPENMP
-            rhofab.atomicAdd(rho_loc);
-        }
+#ifdef WARPX_DIM_RZ
+    WarpX::GetInstance().ApplyInverseVolumeScalingToChargeDensity(rho.get(), lev);
 #endif
-    }
 
     if (!local) rho->SumBoundary(gm.periodicity());
 
@@ -1022,7 +914,7 @@ WarpXParticleContainer::PushX (int lev, Real dt)
             Real* AMREX_RESTRICT ux = attribs[PIdx::ux].dataPtr();
             Real* AMREX_RESTRICT uy = attribs[PIdx::uy].dataPtr();
             Real* AMREX_RESTRICT uz = attribs[PIdx::uz].dataPtr();
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
             Real* AMREX_RESTRICT theta = attribs[PIdx::theta].dataPtr();
 #endif
             // Loop over the particles and update their position
@@ -1030,12 +922,12 @@ WarpXParticleContainer::PushX (int lev, Real dt)
                 [=] AMREX_GPU_DEVICE (long i) {
                     ParticleType& p = pstructs[i]; // Particle object that gets updated
                     Real x, y, z; // Temporary variables
-#ifndef WARPX_RZ
+#ifndef WARPX_DIM_RZ
                     GetPosition( x, y, z, p ); // Initialize x, y, z
                     UpdatePosition( x, y, z, ux[i], uy[i], uz[i], dt);
                     SetPosition( p, x, y, z ); // Update the object p
 #else
-                    // For WARPX_RZ, the particles are still pushed in 3D Cartesian
+                    // For WARPX_DIM_RZ, the particles are still pushed in 3D Cartesian
                     GetCartesianPositionFromCylindrical( x, y, z, p, theta[i] );
                     UpdatePosition( x, y, z, ux[i], uy[i], uz[i], dt);
                     SetCylindricalPositionFromCartesian( p, theta[i], x, y, z );
diff --git a/Source/Utils/WarpXAlgorithmSelection.H b/Source/Utils/WarpXAlgorithmSelection.H
index 3fb23698a..6a32513b7 100644
--- a/Source/Utils/WarpXAlgorithmSelection.H
+++ b/Source/Utils/WarpXAlgorithmSelection.H
@@ -34,11 +34,9 @@ struct CurrentDepositionAlgo {
 };
 
 struct ChargeDepositionAlgo {
-    // These numbers corresponds to the algorithm code in WarpX's
-    // `warpx_charge_deposition` function
+    // Only the Standard algorithm is implemented
     enum {
-         Vectorized = 0,
-         Standard = 1
+         Standard = 0
     };
 };
 
diff --git a/Source/Utils/WarpXAlgorithmSelection.cpp b/Source/Utils/WarpXAlgorithmSelection.cpp
index 2c8038ccd..842085a36 100644
--- a/Source/Utils/WarpXAlgorithmSelection.cpp
+++ b/Source/Utils/WarpXAlgorithmSelection.cpp
@@ -8,7 +8,7 @@
 
 const std::map<std::string, int> maxwell_solver_algo_to_int = {
     {"yee",     MaxwellSolverAlgo::Yee },
-#ifndef WARPX_RZ // Not available in RZ
+#ifndef WARPX_DIM_RZ // Not available in RZ
     {"ckc",     MaxwellSolverAlgo::CKC },
 #endif
     {"default", MaxwellSolverAlgo::Yee }
@@ -31,12 +31,7 @@ const std::map<std::string, int> current_deposition_algo_to_int = {
 
 const std::map<std::string, int> charge_deposition_algo_to_int = {
     {"standard",   ChargeDepositionAlgo::Standard },
-#if (!defined AMREX_USE_GPU)&&(AMREX_SPACEDIM == 3) // Only available on CPU and 3D
-    {"vectorized", ChargeDepositionAlgo::Vectorized },
-    {"default",    ChargeDepositionAlgo::Vectorized }
-#else
     {"default",    ChargeDepositionAlgo::Standard }
-#endif
 };
 
 const std::map<std::string, int> gathering_algo_to_int = {
diff --git a/Source/WarpX.H b/Source/WarpX.H
index a25eef9e4..927cc1f32 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -152,12 +152,12 @@ public:
     BilinearFilter bilinear_filter;
     amrex::Vector< std::unique_ptr<NCIGodfreyFilter> > nci_godfrey_filter_exeybz;
     amrex::Vector< std::unique_ptr<NCIGodfreyFilter> > nci_godfrey_filter_bxbyez;
-    
+
     static int num_mirrors;
     amrex::Vector<amrex::Real> mirror_z;
     amrex::Vector<amrex::Real> mirror_z_width;
     amrex::Vector<int> mirror_z_npoints;
-    
+
     void applyMirrors(amrex::Real time);
 
     void ComputeDt ();
@@ -178,6 +178,16 @@ public:
     void EvolveE (int lev, PatchType patch_type, amrex::Real dt);
     void EvolveF (int lev, PatchType patch_type, amrex::Real dt, DtType dt_type);
 
+#ifdef WARPX_DIM_RZ
+    void ApplyInverseVolumeScalingToCurrentDensity(amrex::MultiFab* Jx,
+                                                   amrex::MultiFab* Jy,
+                                                   amrex::MultiFab* Jz,
+                                                   int lev);
+
+    void ApplyInverseVolumeScalingToChargeDensity(amrex::MultiFab* Rho,
+                                                  int lev);
+#endif
+
     void DampPML ();
     void DampPML (int lev);
     void DampPML (int lev, PatchType patch_type);
@@ -247,6 +257,7 @@ public:
 
     static int do_moving_window;
     static int moving_window_dir;
+    static amrex::Real moving_window_v;
 
     // slice generation //
     void InitializeSliceMultiFabs ();
@@ -489,17 +500,18 @@ private:
     int do_pml = 1;
     int pml_ncell = 10;
     int pml_delta = 10;
+    amrex::IntVect do_pml_Lo = amrex::IntVect::TheUnitVector();
+    amrex::IntVect do_pml_Hi = amrex::IntVect::TheUnitVector();
     amrex::Vector<std::unique_ptr<PML> > pml;
 
     amrex::Real moving_window_x = std::numeric_limits<amrex::Real>::max();
-    amrex::Real moving_window_v = std::numeric_limits<amrex::Real>::max();
     amrex::Real current_injection_position = 0;
 
     // Plasma injection parameters
     int warpx_do_continuous_injection = 0;
     int num_injected_species = -1;
     amrex::Vector<int> injected_plasma_species;
-    
+
     int do_electrostatic = 0;
     int n_buffer = 4;
     amrex::Real const_dt = 0.5e-11;
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index 1f5ade13a..1b653fd7f 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -30,6 +30,7 @@ Vector<Real> WarpX::B_external(3, 0.0);
 
 int WarpX::do_moving_window = 0;
 int WarpX::moving_window_dir = -1;
+Real WarpX::moving_window_v = std::numeric_limits<amrex::Real>::max();
 
 Real WarpX::gamma_boost = 1.;
 Real WarpX::beta_boost = 0.;
@@ -334,7 +335,19 @@ WarpX::ReadParameters ()
                "The boosted frame diagnostic currently only works if the boost is in the z direction.");
 
         pp.get("num_snapshots_lab", num_snapshots_lab);
-        pp.get("dt_snapshots_lab", dt_snapshots_lab);
+
+        // Read either dz_snapshots_lab or dt_snapshots_lab
+        bool snapshot_interval_is_specified = 0;
+        Real dz_snapshots_lab = 0;
+        snapshot_interval_is_specified += pp.query("dt_snapshots_lab", dt_snapshots_lab);
+        if ( pp.query("dz_snapshots_lab", dz_snapshots_lab) ){
+            dt_snapshots_lab = dz_snapshots_lab/PhysConst::c;
+            snapshot_interval_is_specified = 1;
+        }
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
+            snapshot_interval_is_specified,
+            "When using back-transformed diagnostics, user should specify either dz_snapshots_lab or dt_snapshots_lab.");
+
         pp.get("gamma_boost", gamma_boost);
 
         pp.query("do_boosted_frame_fields", do_boosted_frame_fields);
@@ -383,6 +396,22 @@ WarpX::ReadParameters ()
         pp.query("pml_ncell", pml_ncell);
         pp.query("pml_delta", pml_delta);
 
+        Vector<int> parse_do_pml_Lo(AMREX_SPACEDIM,1);
+        pp.queryarr("do_pml_Lo", parse_do_pml_Lo);
+        do_pml_Lo[0] = parse_do_pml_Lo[0];
+        do_pml_Lo[1] = parse_do_pml_Lo[1];
+#if (AMREX_SPACEDIM == 3)
+        do_pml_Lo[2] = parse_do_pml_Lo[2];
+#endif
+        Vector<int> parse_do_pml_Hi(AMREX_SPACEDIM,1);
+        pp.queryarr("do_pml_Hi", parse_do_pml_Hi);
+        do_pml_Hi[0] = parse_do_pml_Hi[0];
+        do_pml_Hi[1] = parse_do_pml_Hi[1];
+#if (AMREX_SPACEDIM == 3)
+        do_pml_Hi[2] = parse_do_pml_Hi[2];
+#endif
+
+
         pp.query("dump_openpmd", dump_openpmd);
         pp.query("dump_plotfiles", dump_plotfiles);
         pp.query("plot_raw_fields", plot_raw_fields);
@@ -393,7 +422,7 @@ WarpX::ReadParameters ()
         if (not user_fields_to_plot){
             // If not specified, set default values
             fields_to_plot = {"Ex", "Ey", "Ez", "Bx", "By",
-                              "Bz", "jx", "jy", "jz", 
+                              "Bz", "jx", "jy", "jz",
                               "part_per_cell"};
         }
         // set plot_rho to true of the users requests it, so that
@@ -411,9 +440,9 @@ WarpX::ReadParameters ()
         // If user requests to plot proc_number for a serial run,
         // delete proc_number from fields_to_plot
         if (ParallelDescriptor::NProcs() == 1){
-            fields_to_plot.erase(std::remove(fields_to_plot.begin(), 
-                                             fields_to_plot.end(), 
-                                             "proc_number"), 
+            fields_to_plot.erase(std::remove(fields_to_plot.begin(),
+                                             fields_to_plot.end(),
+                                             "proc_number"),
                                  fields_to_plot.end());
         }
 
@@ -497,11 +526,9 @@ WarpX::ReadParameters ()
     {
         ParmParse pp("algo");
         // If not in RZ mode, read use_picsar_deposition
-        // In RZ mode, use_picsar_deposition is on, as the C++ version 
+        // In RZ mode, use_picsar_deposition is on, as the C++ version
         // of the deposition does not support RZ
-#ifndef WARPX_RZ
         pp.query("use_picsar_deposition", use_picsar_deposition);
-#endif
         current_deposition_algo = GetAlgorithmInteger(pp, "current_deposition");
         charge_deposition_algo = GetAlgorithmInteger(pp, "charge_deposition");
         field_gathering_algo = GetAlgorithmInteger(pp, "field_gathering");
@@ -876,6 +903,21 @@ WarpX::AllocLevelMFs (int lev, const BoxArray& ba, const DistributionMapping& dm
             rho_cp[lev].reset(new MultiFab(amrex::convert(cba,IntVect::TheUnitVector()),dm,2,ngRho));
             rho_cp_owner_masks[lev] = std::move(rho_cp[lev]->OwnerMask(cperiod));
         }
+        if (fft_hybrid_mpi_decomposition == false){
+            // Allocate and initialize the spectral solver
+            std::array<Real,3> cdx = CellSize(lev-1);
+    #if (AMREX_SPACEDIM == 3)
+            RealVect cdx_vect(cdx[0], cdx[1], cdx[2]);
+    #elif (AMREX_SPACEDIM == 2)
+            RealVect cdx_vect(cdx[0], cdx[2]);
+    #endif
+            // Get the cell-centered box, with guard cells
+            BoxArray realspace_ba = cba;  // Copy box
+            realspace_ba.enclosedCells().grow(ngE); // cell-centered + guard cells
+            // Define spectral solver
+            spectral_solver_cp[lev].reset( new SpectralSolver( realspace_ba, dm,
+                nox_fft, noy_fft, noz_fft, do_nodal, cdx_vect, dt[lev] ) );
+        }
 #endif
     }
 
@@ -907,7 +949,7 @@ WarpX::AllocLevelMFs (int lev, const BoxArray& ba, const DistributionMapping& dm
             current_buf[lev][0].reset( new MultiFab(amrex::convert(cba,jx_nodal_flag),dm,1,ngJ));
             current_buf[lev][1].reset( new MultiFab(amrex::convert(cba,jy_nodal_flag),dm,1,ngJ));
             current_buf[lev][2].reset( new MultiFab(amrex::convert(cba,jz_nodal_flag),dm,1,ngJ));
-            if (do_dive_cleaning || plot_rho) {
+            if (rho_cp[lev]) {
                 charge_buf[lev].reset( new MultiFab(amrex::convert(cba,IntVect::TheUnitVector()),dm,2,ngRho));
             }
             current_buffer_masks[lev].reset( new iMultiFab(ba, dm, 1, 1) );
@@ -995,7 +1037,7 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp,
 {
     Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     const Real rmin = GetInstance().Geom(0).ProbLo(0);
 #endif
 
@@ -1014,7 +1056,7 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp,
         [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
         {
             warpx_computedivb(i, j, k, dcomp, divBfab, Bxfab, Byfab, Bzfab, dxinv, dyinv, dzinv
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
                               ,rmin
 #endif
                               );
@@ -1029,7 +1071,7 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp,
 {
     Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     const Real rmin = GetInstance().Geom(0).ProbLo(0);
 #endif
 
@@ -1048,7 +1090,7 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp,
         [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
         {
             warpx_computedivb(i, j, k, dcomp, divBfab, Bxfab, Byfab, Bzfab, dxinv, dyinv, dzinv
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
                               ,rmin
 #endif
                               );
@@ -1063,7 +1105,7 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp,
 {
     Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     const Real rmin = GetInstance().Geom(0).ProbLo(0);
 #endif
 
@@ -1082,7 +1124,7 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp,
         [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
         {
             warpx_computedive(i, j, k, dcomp, divEfab, Exfab, Eyfab, Ezfab, dxinv, dyinv, dzinv
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
                               ,rmin
 #endif
                               );
@@ -1097,7 +1139,7 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp,
 {
     Real dxinv = 1./dx[0], dyinv = 1./dx[1], dzinv = 1./dx[2];
 
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
     const Real rmin = GetInstance().Geom(0).ProbLo(0);
 #endif
 
@@ -1116,7 +1158,7 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp,
         [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
         {
             warpx_computedive(i, j, k, dcomp, divEfab, Exfab, Eyfab, Ezfab, dxinv, dyinv, dzinv
-#ifdef WARPX_RZ
+#ifdef WARPX_DIM_RZ
                               ,rmin
 #endif
                               );