90 files changed, 1525 insertions, 917 deletions
diff --git a/.travis.yml b/.travis.yml
index 891bef8f8..3ee3605aa 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,3 +1,4 @@
+dist: xenial
 language: c++
 sudo: true
 
diff --git a/Docs/source/building/building.rst b/Docs/source/building/building.rst
index a9ea4e755..140659ec8 100644
--- a/Docs/source/building/building.rst
+++ b/Docs/source/building/building.rst
@@ -24,7 +24,9 @@ single directory (e.g. ``warpx_directory``):
 Basic compilation
 -----------------
 
-``cd`` into the directory ``WarpX`` and type
+WarpX requires a C/C++ and Fortran compiler (e.g., GCC or Intel) and an
+MPI implementation (e.g., OpenMPI or MPICH). Then ``cd`` into the directory
+``WarpX`` and type
 
 ::
 
@@ -73,8 +75,8 @@ Advanced building instructions
    python
    spack
 
-Building for specific plateforms
---------------------------------
+Building for specific platforms
+-------------------------------
 
 .. toctree::
    :maxdepth: 1
diff --git a/Docs/source/running_cpp/parameters.rst b/Docs/source/running_cpp/parameters.rst
index b6461db65..c38ef472d 100644
--- a/Docs/source/running_cpp/parameters.rst
+++ b/Docs/source/running_cpp/parameters.rst
@@ -29,12 +29,12 @@ Overall simulation parameters
     (The direction ``y`` cannot be used in 2D simulations.)
 
 * ``warpx.zmax_plasma_to_compute_max_step`` (`float`) optional
-    Can be useful when running in a boosted frame. If specified, automatically 
-    calculates the number of iterations required in the boosted frame for the 
-    lower `z` end of the simulation domain to reach 
-    ``warpx.zmax_plasma_to_compute_max_step`` (typically the plasma end, 
-    given in the lab frame). The value of ``max_step`` is overwritten, and 
-    printed to standard output. Currently only works if the Lorentz boost and 
+    Can be useful when running in a boosted frame. If specified, automatically
+    calculates the number of iterations required in the boosted frame for the
+    lower `z` end of the simulation domain to reach
+    ``warpx.zmax_plasma_to_compute_max_step`` (typically the plasma end,
+    given in the lab frame). The value of ``max_step`` is overwritten, and
+    printed to standard output. Currently only works if the Lorentz boost and
     the moving window are along the z direction.
 
 * ``warpx.verbose`` (`0` or `1`)
@@ -187,9 +187,18 @@ Particle initialization
     * ``NRandomPerCell``: injection with a fixed number of randomly-distributed particles per cell.
       This requires the additional parameter ``<species_name>.num_particles_per_cell``.
 
+    * ``gaussian_beam``: Inject particle beam with gaussian distribution in 
+      space in all directions. This requires additional parameters: 
+      ``<species_name>.q_tot`` (beam charge), 
+      ``<species_name>.npart`` (number of particles in the beam), 
+      ``<species_name>.x/y/z_m`` (average position in `x/y/z`), 
+      ``<species_name>.x/y/z_rms`` (standard deviation in `x/y/z`), 
+      and optional argument ``<species_name>.do_symmetrize`` (whether to 
+      symmetrize the beam in the x and y directions).
+
 * ``<species_name>.do_continuous_injection`` (`0` or `1`)
-    Whether to inject particles during the simulation, and not only at 
-    initialization. This can be required whith a moving window and/or when 
+    Whether to inject particles during the simulation, and not only at
+    initialization. This can be required whith a moving window and/or when
     running in a boosted frame.
 
 * ``<species_name>.profile`` (`string`)
@@ -295,20 +304,20 @@ Particle initialization
 * ``<species>.plot_species`` (`0` or `1` optional; default `1`)
     Whether to plot particle quantities for this species.
 
-* ``<species>.plot_vars`` (list of `strings` separated by spaces, optional) 
-    List of particle quantities to write to `plotfiles`. By defaults, all 
-    quantities are written to file. Choices are 
+* ``<species>.plot_vars`` (list of `strings` separated by spaces, optional)
+    List of particle quantities to write to `plotfiles`. By defaults, all
+    quantities are written to file. Choices are
     * ``w`` for the particle weight,
-    * ``ux`` ``uy`` ``uz`` for the particle momentum, 
+    * ``ux`` ``uy`` ``uz`` for the particle momentum,
     * ``Ex`` ``Ey`` ``Ez`` for the electric field on particles,
     * ``Bx`` ``By`` ``Bz`` for the magnetic field on particles.
-    The particle positions are always included. Use 
-    ``<species>.plot_vars = none`` to plot no particle data, except 
+    The particle positions are always included. Use
+    ``<species>.plot_vars = none`` to plot no particle data, except
     particle position.
 
 * ``<species>.do_boosted_frame_diags`` (`0` or `1` optional, default `1`)
-    Only used when ``warpx.do_boosted_frame_diagnostic=1``. When running in a 
-    boosted frame, whether or not to plot back-transformed diagnostics for 
+    Only used when ``warpx.do_boosted_frame_diagnostic=1``. When running in a
+    boosted frame, whether or not to plot back-transformed diagnostics for
     this species.
 
 * ``warpx.serialize_ics`` (`0 or 1`)
@@ -466,7 +475,7 @@ Laser initialization
     See definition in Akturk et al., Opt Express, vol 12, no 19 (2014).
 
 * ``<laser_name>.do_continuous_injection`` (`0` or `1`) optional (default `0`).
-    Whether or not to use continuous injection (`0` or not `0`).
+    Whether or not to use continuous injection.
     If the antenna starts outside of the simulation domain but enters it 
     at some point (due to moving window or moving antenna in the boosted 
     frame), use this so that the laser antenna is injected when it reaches 
@@ -512,53 +521,60 @@ Numerics and algorithms
     Number of passes along each direction for the bilinear filter.
     In 2D simulations, only the first two values are read.
 
-* ``algo.current_deposition`` (`integer`)
-    The algorithm for current deposition:
+* ``algo.current_deposition`` (`string`, optional)
+    The algorithm for current deposition. Available options are:
 
-     - ``0``: Esirkepov deposition, vectorized
-     - ``1``: Esirkepov deposition, non-optimized
-     - ``2``: Direct deposition, vectorized
-     - ``3``: Direct deposition, non-optimized
+     - ``esirkepov``: the charge-conserving Esirkepov algorithm
+       (see `Esirkepov, Comp. Phys. Comm. (2001) <https://www.sciencedirect.com/science/article/pii/S0010465500002289>`__)
+     - ``direct``: simpler current deposition algorithm, described in
+       the section :doc:`../theory/picsar_theory`. Note that this algorithm is not strictly charge-conserving.
+     - ``direct-vectorized`` (only available in 3D, and when running on CPU/KNL - as opposed to GPU):
+       mathematically equivalent to ``direct``, but uses an optimized algorithm
+       for vectorization on CPU/KNL (see `Vincenti, Comp. Phys. Comm. (2017)
+       <https://www.sciencedirect.com/science/article/pii/S0010465516302764>`__)
 
-    .. warning::
+    If ``algo.current_deposition`` is not specified, the default is ``esirkepov``.
 
-        On GPU, use ``algo.current_deposition=0`` for Esirkepov
-	or ``3`` for direct deposition.
+* ``algo.charge_deposition`` (`string`, optional)
+    The algorithm for the charge density deposition. Available options are:
 
-* ``algo.charge_deposition`` (`integer`)
-    The algorithm for the charge density deposition:
+     - ``standard``: standard charge deposition algorithm, described in
+       the section :doc:`../theory/picsar_theory`.
+     - ``vectorized`` (only available in 3D, and when running on CPU/KNL - as opposed to GPU):
+       mathematically equivalent to ``standard``, but uses an optimized algorithm
+       for vectorization on CPU/KNL (see `Vincenti, Comp. Phys. Comm. (2017)
+       <https://www.sciencedirect.com/science/article/pii/S0010465516302764>`__)
 
-     - ``0``: Vectorized version
-     - ``1``: Non-optimized version
+    If ``algo.charge_deposition`` is not specified, ``vectorized`` is the default
+    whenever it is available ; ``standard`` is the default otherwise.
 
-    .. warning::
+* ``algo.field_gathering`` (`string`, optional)
+    The algorithm for field gathering. Available options are:
 
-        The vectorized version does not run on GPU. Use
-		``algo.charge_deposition=1`` when running on GPU.
-       
-* ``algo.field_gathering`` (`integer`)
-    The algorithm for field gathering:
+     - ``standard``: gathers directly from the grid points (either staggered
+       or nodal gridpoints depending on ``warpx.do_nodal``).
+     - ``vectorized`` (not available when running on GPU): mathematically
+       equivalent to ``standard``, but uses optimized vector instructions for CPU/KNL.
 
-     - ``0``: Vectorized version
-     - ``1``: Non-optimized version
+    If ``algo.field_gathering`` is not specified, ``vectorized`` is the default
+    on CPU/KNL ; ``standard`` is the default on GPU.
 
-    .. warning::
+* ``algo.particle_pusher`` (`string`, optional)
+    The algorithm for the particle pusher. Available options are:
 
-        The vectorized version does not run on GPU. Use
-		``algo.field_gather=1`` when running on GPU.
+     - ``boris``: Boris pusher.
+     - ``vay``: Vay pusher (see `Vay, Phys. Plasmas (2008) <https://aip.scitation.org/doi/10.1063/1.2837054>`__)
 
-* ``algo.particle_pusher`` (`integer`)
-    The algorithm for the particle pusher:
+     If ``algo.particle_pusher`` is not specified, ``boris`` is the default.
 
-     - ``0``: Boris pusher
-     - ``1``: Vay pusher
+* ``algo.maxwell_fdtd_solver`` (`string`, optional)
+    The algorithm for the FDTD Maxwell field solver. Available options are:
 
-* ``algo.maxwell_fdtd_solver`` (`string`)
-    The algorithm for the FDTD Maxwell field solver:
+     - ``yee``: Yee FDTD solver.
+     - ``ckc``: (not available in ``RZ`` geometry) Cole-Karkkainen solver with Cowan
+       coefficients (see `Cowan, PRSTAB 16 (2013) <https://journals.aps.org/prab/abstract/10.1103/PhysRevSTAB.16.041303>`__)
 
-     - ``yee``: Yee FDTD solver
-     - ``ckc``: Cole-Karkkainen solver with Cowan
-       coefficients (see Cowan - PRST-AB 16, 041303 (2013))
+     If ``algo.maxwell_fdtd_solver`` is not specified, ``yee`` is the default.
 
 * ``interpolation.nox``, ``interpolation.noy``, ``interpolation.noz`` (`integer`)
     The order of the shape factors for the macroparticles, for the 3 dimensions of space.
@@ -581,17 +597,17 @@ Numerics and algorithms
     fields are defined at different points in space)
 
 * ``warpx.do_subcycling`` (`0` or `1`; default: 0)
-    Whether or not to use sub-cycling. Different refinement levels have a 
-    different cell size, which results in different Courant–Friedrichs–Lewy 
-    (CFL) limits for the time step. By default, when using mesh refinement, 
-    the same time step is used for all levels. This time step is 
-    taken as the CFL limit of the finest level. Hence, for coarser 
-    levels, the timestep is only a fraction of the CFL limit for this 
-    level, which may lead to numerical artifacts. With sub-cycling, each level 
-    evolves with its own time step, set to its own CFL limit. In practice, it 
-    means that when level 0 performs one iteration, level 1 performs two 
-    iterations. Currently, this option is only supported when 
-    ``amr.max_level = 1``. More information can be found at 
+    Whether or not to use sub-cycling. Different refinement levels have a
+    different cell size, which results in different Courant–Friedrichs–Lewy
+    (CFL) limits for the time step. By default, when using mesh refinement,
+    the same time step is used for all levels. This time step is
+    taken as the CFL limit of the finest level. Hence, for coarser
+    levels, the timestep is only a fraction of the CFL limit for this
+    level, which may lead to numerical artifacts. With sub-cycling, each level
+    evolves with its own time step, set to its own CFL limit. In practice, it
+    means that when level 0 performs one iteration, level 1 performs two
+    iterations. Currently, this option is only supported when
+    ``amr.max_level = 1``. More information can be found at
     https://ieeexplore.ieee.org/document/8659392.
 
 * ``psatd.nox``, ``psatd.noy``, ``pstad.noz`` (`integer`) optional (default `16` for all)
@@ -658,7 +674,7 @@ Diagnostics and output
     The directory in which to save the lab frame data when using the
     **back-transformed diagnostics**. If not specified, the default is
     is `lab_frame_data`.
-    
+
 * ``warpx.num_snapshots_lab`` (`integer`)
     Only used when ``warpx.do_boosted_frame_diagnostic`` is ``1``.
     The number of lab-frame snapshots that will be written.
@@ -672,9 +688,9 @@ Diagnostics and output
     Whether to use the **back-transformed diagnostics** for the fields.
 
 * ``warpx.boosted_frame_diag_fields`` (space-separated list of `string`)
-    Which fields to dumped in back-transformed diagnostics. Choices are 
-    'Ex', 'Ey', Ez', 'Bx', 'By', Bz', 'jx', 'jy', jz' and 'rho'. Example: 
-    ``warpx.boosted_frame_diag_fields = Ex Ez By``. By default, all fields 
+    Which fields to dumped in back-transformed diagnostics. Choices are
+    'Ex', 'Ey', Ez', 'Bx', 'By', Bz', 'jx', 'jy', jz' and 'rho'. Example:
+    ``warpx.boosted_frame_diag_fields = Ex Ez By``. By default, all fields
     are dumped.
 
 * ``warpx.plot_raw_fields`` (`0` or `1`) optional (default `0`)
@@ -715,13 +731,13 @@ Diagnostics and output
     The extent of the slice are defined by the co-ordinates of the lower corner (``slice.dom_lo``) and upper corner (``slice.dom_hi``). The slice could be 1D, 2D, or 3D, aligned with the co-ordinate axes and the first axis of the coordinates is x. For example: if for a 3D simulation, an x-z slice is to be extracted at y = 0.0, then the y-value of slice.dom_lo and slice.dom_hi must be equal to 0.0
 
 * ``slice.coarsening_ratio`` (`2 integers in 2D`, `3 integers in 3D`; default `1`)
-    The coarsening ratio input must be greater than 0. Default is 1 in all directions. 
-    In the directions that is reduced, i.e., for an x-z slice in 3D, the reduced y-dimension has a default coarsening ratio equal to 1. 
+    The coarsening ratio input must be greater than 0. Default is 1 in all directions.
+    In the directions that is reduced, i.e., for an x-z slice in 3D, the reduced y-dimension has a default coarsening ratio equal to 1.
 
 * ``slice.plot_int`` (`integer`)
     The number of PIC cycles inbetween two consecutive data dumps for the slice. Use a
     negative number to disable slice generation and slice data dumping.
-    
+
 
 
 Checkpoints and restart
diff --git a/Docs/source/theory/picsar_theory.rst b/Docs/source/theory/picsar_theory.rst
index 7338d5c36..135d78dea 100644
--- a/Docs/source/theory/picsar_theory.rst
+++ b/Docs/source/theory/picsar_theory.rst
@@ -328,6 +328,8 @@ a collocated and a staggered formulation is application-dependent.
 
 Spectral solvers used to be very popular in the years 1970s to early 1990s, before being replaced by finite-difference methods with the advent of parallel supercomputers that favored local methods. However, it was shown recently that standard domain decomposition with Fast Fourier Transforms that are local to each subdomain could be used effectively with PIC spectral methods (Jean-Luc Vay, Haber, and Godfrey 2013), at the cost of truncation errors in the guard cells that could be neglected. A detailed analysis of the effectiveness of the method with exact evaluation of the magnitude of the effect of the truncation error is given in (Vincenti and Vay 2016) for stencils of arbitrary order (up-to the infinite “spectral” order).
 
+.. _current_deposition:
+
 Current deposition
 ------------------
 
diff --git a/Examples/Modules/RigidInjection/inputs b/Examples/Modules/RigidInjection/inputs
index d7e3b77e8..3c0d52ed0 100644
--- a/Examples/Modules/RigidInjection/inputs
+++ b/Examples/Modules/RigidInjection/inputs
@@ -23,10 +23,6 @@ geometry.prob_hi     =  2.    2.    4.
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 # interpolation
 interpolation.nox = 3
diff --git a/Examples/Modules/boosted_diags/inputs.2d b/Examples/Modules/boosted_diags/inputs.2d
index 33eee5392..528eb6cd9 100644
--- a/Examples/Modules/boosted_diags/inputs.2d
+++ b/Examples/Modules/boosted_diags/inputs.2d
@@ -24,10 +24,7 @@ geometry.prob_hi     =  150.e-6    150.e-6    0.
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 
 # Numerics
 interpolation.nox = 3
diff --git a/Examples/Modules/boosted_diags/inputs.3d b/Examples/Modules/boosted_diags/inputs.3d
index 33eee5392..528eb6cd9 100644
--- a/Examples/Modules/boosted_diags/inputs.3d
+++ b/Examples/Modules/boosted_diags/inputs.3d
@@ -24,10 +24,7 @@ geometry.prob_hi     =  150.e-6    150.e-6    0.
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 
 # Numerics
 interpolation.nox = 3
diff --git a/Examples/Modules/charged_beam/inputs b/Examples/Modules/charged_beam/inputs
index 36b991c3e..18b645281 100644
--- a/Examples/Modules/charged_beam/inputs
+++ b/Examples/Modules/charged_beam/inputs
@@ -23,10 +23,7 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Modules/gaussian_beam/inputs b/Examples/Modules/gaussian_beam/inputs
index 0cab8d140..46cd785f2 100644
--- a/Examples/Modules/gaussian_beam/inputs
+++ b/Examples/Modules/gaussian_beam/inputs
@@ -23,10 +23,6 @@ geometry.prob_hi     =  2.    2.    2.
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 # interpolation
 interpolation.nox = 3
diff --git a/Examples/Modules/laser_injection/inputs b/Examples/Modules/laser_injection/inputs
index 4186f2682..91ea135d1 100644
--- a/Examples/Modules/laser_injection/inputs
+++ b/Examples/Modules/laser_injection/inputs
@@ -21,10 +21,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    12.e-6
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Modules/laser_injection/inputs.2d.rt b/Examples/Modules/laser_injection/inputs.2d.rt
index 1c40525f8..db726a98c 100644
--- a/Examples/Modules/laser_injection/inputs.2d.rt
+++ b/Examples/Modules/laser_injection/inputs.2d.rt
@@ -23,10 +23,7 @@ warpx.serialize_ics = 1
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 1
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = esirkepov
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Modules/laser_injection/inputs.rt b/Examples/Modules/laser_injection/inputs.rt
index b6f49d661..cec710403 100644
--- a/Examples/Modules/laser_injection/inputs.rt
+++ b/Examples/Modules/laser_injection/inputs.rt
@@ -23,10 +23,6 @@ warpx.serialize_ics = 1
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Modules/nci_corrector/inputs2d b/Examples/Modules/nci_corrector/inputs2d
index 2f022f4cf..8db312fc6 100644
--- a/Examples/Modules/nci_corrector/inputs2d
+++ b/Examples/Modules/nci_corrector/inputs2d
@@ -25,10 +25,7 @@ warpx.verbose = 1
 warpx.use_filter = 1
 
 # Algorithms
-algo.current_deposition = 1
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = esirkepov
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Examples/Physics_applications/laser_acceleration/inputs.2d b/Examples/Physics_applications/laser_acceleration/inputs.2d
index 11ddc6b0b..1a76feacf 100644
--- a/Examples/Physics_applications/laser_acceleration/inputs.2d
+++ b/Examples/Physics_applications/laser_acceleration/inputs.2d
@@ -17,10 +17,6 @@ warpx.fine_tag_hi =  5.e-6   -25.e-6
 #################################
 ############ NUMERICS ###########
 #################################
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3 # Particle interpolation order. Must be the same in x, y, and z
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Examples/Physics_applications/laser_acceleration/inputs.2d.boost b/Examples/Physics_applications/laser_acceleration/inputs.2d.boost
index 18c354634..fb03ae3f6 100644
--- a/Examples/Physics_applications/laser_acceleration/inputs.2d.boost
+++ b/Examples/Physics_applications/laser_acceleration/inputs.2d.boost
@@ -1,8 +1,8 @@
 #################################
 ######### BOX PARAMETERS ########
 #################################
-# max_step = 2700
-stop_time = 1.9e-12
+max_step = 1000
+# stop_time = 1.9e-12
 amr.n_cell =  128 1024
 amr.max_grid_size = 64
 amr.blocking_factor = 32
@@ -19,10 +19,8 @@ geometry.prob_hi     =  128.e-6     0.96e-6
 #################################
 warpx.verbose = 1
 amrex.v = 1
-algo.current_deposition = 2
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 1
+algo.current_deposition = direct-vectorized
+algo.particle_pusher = vay
 algo.maxwell_fdtd_solver = ckc
 interpolation.nox = 3
 interpolation.noy = 3
@@ -39,7 +37,7 @@ warpx.serialize_ics = 1
 #################################
 ####### BOOST PARAMETERS ########
 #################################
-warpx.gamma_boost = 10.
+warpx.gamma_boost = 30.
 warpx.boost_direction = z
 warpx.do_boosted_frame_diagnostic = 1
 warpx.num_snapshots_lab = 7
@@ -62,11 +60,11 @@ electrons.momentum_distribution_type = "gaussian"
 electrons.xmin = -120.e-6
 electrons.xmax =  120.e-6
 electrons.zmin = 0.5e-3
-electrons.zmax = .0035
+electrons.zmax = 1.
 electrons.profile                 = "predefined"
 electrons.predefined_profile_name = "parabolic_channel"
 #         predefined_profile_params = z_start   ramp_up   plateau   ramp_down   rc       n0
-electrons.predefined_profile_params = .5e-3     .5e-3     2.e-3     .5e-3       50.e-6   3.5e24
+electrons.predefined_profile_params = .5e-3     .5e-3     2.e-3     .5e-3       50.e-6   3.5e25
 electrons.do_continuous_injection = 1
 
 ions.charge = q_e
@@ -77,11 +75,11 @@ ions.momentum_distribution_type = "gaussian"
 ions.xmin = -120.e-6
 ions.xmax =  120.e-6
 ions.zmin = 0.5e-3
-ions.zmax = .0035
+ions.zmax = 1.
 ions.profile                 = "predefined"
 ions.predefined_profile_name = "parabolic_channel"
 #    predefined_profile_params = z_start   ramp_up   plateau   ramp_down   rc       n0
-ions.predefined_profile_params = .5e-3     .5e-3     2.e-3     .5e-3       50.e-6   3.5e24
+ions.predefined_profile_params = .5e-3     .5e-3     2.e-3     .5e-3       50.e-6   3.5e25
 ions.do_continuous_injection = 1
 
 beam.charge = -q_e
diff --git a/Examples/Physics_applications/laser_acceleration/inputs.3d b/Examples/Physics_applications/laser_acceleration/inputs.3d
index 8e854cbf4..f055708d1 100644
--- a/Examples/Physics_applications/laser_acceleration/inputs.3d
+++ b/Examples/Physics_applications/laser_acceleration/inputs.3d
@@ -17,10 +17,6 @@ warpx.fine_tag_hi =  5.e-6    5.e-6   -30.e-6
 #################################
 ############ NUMERICS ###########
 #################################
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3 # Particle interpolation order. Must be the same in x, y, and z
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Examples/Physics_applications/plasma_acceleration/inputs.2d b/Examples/Physics_applications/plasma_acceleration/inputs.2d
index e092cf932..9d0edbc7e 100644
--- a/Examples/Physics_applications/plasma_acceleration/inputs.2d
+++ b/Examples/Physics_applications/plasma_acceleration/inputs.2d
@@ -18,10 +18,6 @@ warpx.fine_tag_hi    =   12.e-6 -100.e-6
 #################################
 ############ NUMERICS ###########
 #################################
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 algo.maxwell_fdtd_solver = ckc
 warpx.use_filter = 1
 warpx.do_pml = 1
diff --git a/Examples/Physics_applications/plasma_acceleration/inputs.2d.boost b/Examples/Physics_applications/plasma_acceleration/inputs.2d.boost
index 59e7c432b..ba9166dee 100644
--- a/Examples/Physics_applications/plasma_acceleration/inputs.2d.boost
+++ b/Examples/Physics_applications/plasma_acceleration/inputs.2d.boost
@@ -16,10 +16,6 @@ geometry.prob_hi     =  125.e-6    1.e-6
 #################################
 ############ NUMERICS ###########
 #################################
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 algo.maxwell_fdtd_solver = ckc
 warpx.verbose = 1
 warpx.plot_raw_fields = 0
diff --git a/Examples/Physics_applications/plasma_acceleration/inputs.3d.boost b/Examples/Physics_applications/plasma_acceleration/inputs.3d.boost
index 1d58ef940..1af4fac86 100644
--- a/Examples/Physics_applications/plasma_acceleration/inputs.3d.boost
+++ b/Examples/Physics_applications/plasma_acceleration/inputs.3d.boost
@@ -16,10 +16,6 @@ geometry.prob_hi     = 0.00015 0.00015 1.e-06
 #################################
 ############ NUMERICS ###########
 #################################
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 algo.maxwell_fdtd_solver = ckc
 warpx.verbose = 1
 warpx.plot_raw_fields = 0
diff --git a/Examples/Physics_applications/plasma_mirror/inputs.2d b/Examples/Physics_applications/plasma_mirror/inputs.2d
index e16313b42..dc0a48ebf 100644
--- a/Examples/Physics_applications/plasma_mirror/inputs.2d
+++ b/Examples/Physics_applications/plasma_mirror/inputs.2d
@@ -18,10 +18,6 @@ warpx.serialize_ics = 1
 #################################
 ############ NUMERICS ###########
 #################################
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Examples/Physics_applications/uniform_plasma/inputs.2d b/Examples/Physics_applications/uniform_plasma/inputs.2d
index f3eb2decb..4304638dd 100644
--- a/Examples/Physics_applications/uniform_plasma/inputs.2d
+++ b/Examples/Physics_applications/uniform_plasma/inputs.2d
@@ -16,10 +16,6 @@ geometry.prob_hi     =  20.e-6    20.e-6
 #################################
 warpx.serialize_ics = 1
 warpx.verbose = 1
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 warpx.cfl = 1.0
 amr.plot_int = 10
 
diff --git a/Examples/Physics_applications/uniform_plasma/inputs.3d b/Examples/Physics_applications/uniform_plasma/inputs.3d
index 245f3204c..6792dac2c 100644
--- a/Examples/Physics_applications/uniform_plasma/inputs.3d
+++ b/Examples/Physics_applications/uniform_plasma/inputs.3d
@@ -16,10 +16,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 #################################
 warpx.serialize_ics = 1
 warpx.verbose = 1
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 warpx.cfl = 1.0
 amr.plot_int = 10
 
diff --git a/Examples/Tests/Langmuir/inputs b/Examples/Tests/Langmuir/inputs
index cd13aebbd..a989f2249 100644
--- a/Examples/Tests/Langmuir/inputs
+++ b/Examples/Tests/Langmuir/inputs
@@ -26,10 +26,7 @@ warpx.moving_window_dir = x
 warpx.moving_window_v = 0.0 # in units of the speed of light
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/Langmuir/inputs.2d.rz b/Examples/Tests/Langmuir/inputs.2d.rz
index c4132d493..a4464fe1d 100644
--- a/Examples/Tests/Langmuir/inputs.2d.rz
+++ b/Examples/Tests/Langmuir/inputs.2d.rz
@@ -27,10 +27,7 @@ warpx.moving_window_dir = z
 warpx.moving_window_v = 0.0 # in units of the speed of light
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 interpolation.nox = 1
 interpolation.noy = 1
 interpolation.noz = 1
diff --git a/Examples/Tests/Langmuir/inputs.lb b/Examples/Tests/Langmuir/inputs.lb
index 5900cb1bb..73ededb23 100644
--- a/Examples/Tests/Langmuir/inputs.lb
+++ b/Examples/Tests/Langmuir/inputs.lb
@@ -32,10 +32,7 @@ warpx.moving_window_v = 0.0 # in units of the speed of light
 warpx.regrid_int = 1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/Langmuir/inputs.multi.2d.rt b/Examples/Tests/Langmuir/inputs.multi.2d.rt
index 79bc2b383..54d4e2c16 100644
--- a/Examples/Tests/Langmuir/inputs.multi.2d.rt
+++ b/Examples/Tests/Langmuir/inputs.multi.2d.rt
@@ -25,10 +25,7 @@ warpx.serialize_ics = 1
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.field_gathering = standard
 
 # Interpolation
 interpolation.nox = 1
diff --git a/Examples/Tests/Langmuir/inputs.multi.rt b/Examples/Tests/Langmuir/inputs.multi.rt
index c9969d39a..8e6d6d8e1 100644
--- a/Examples/Tests/Langmuir/inputs.multi.rt
+++ b/Examples/Tests/Langmuir/inputs.multi.rt
@@ -26,10 +26,8 @@ warpx.serialize_ics = 1
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.current_deposition = direct
+algo.field_gathering = standard
 
 # Interpolation
 interpolation.nox = 1
diff --git a/Examples/Tests/Langmuir/inputs.multi.rz.rt b/Examples/Tests/Langmuir/inputs.multi.rz.rt
index 6bc84a434..e4099f9c5 100644
--- a/Examples/Tests/Langmuir/inputs.multi.rz.rt
+++ b/Examples/Tests/Langmuir/inputs.multi.rz.rt
@@ -25,10 +25,7 @@ warpx.serialize_ics = 1
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.field_gathering = standard
 
 # Interpolation
 interpolation.nox = 1
diff --git a/Examples/Tests/Langmuir/inputs.nolb b/Examples/Tests/Langmuir/inputs.nolb
index cd2900b29..6ac0629cb 100644
--- a/Examples/Tests/Langmuir/inputs.nolb
+++ b/Examples/Tests/Langmuir/inputs.nolb
@@ -32,10 +32,7 @@ warpx.moving_window_v = 0.0 # in units of the speed of light
 warpx.regrid_int = -1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
+algo.current_deposition = direct
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/Langmuir/inputs.rt b/Examples/Tests/Langmuir/inputs.rt
index 3ec8fc47d..be5a730c2 100644
--- a/Examples/Tests/Langmuir/inputs.rt
+++ b/Examples/Tests/Langmuir/inputs.rt
@@ -25,10 +25,7 @@ warpx.serialize_ics = 1
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.field_gathering = standard
 
 # Interpolation
 interpolation.nox = 1
diff --git a/Examples/Tests/Larmor/inputs b/Examples/Tests/Larmor/inputs
index 729d315e9..da8fb5bae 100644
--- a/Examples/Tests/Larmor/inputs
+++ b/Examples/Tests/Larmor/inputs
@@ -41,10 +41,6 @@ warpx.B_external = 0.0  0.00078110417851950768  0.0
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/Larmor/inputs.ml b/Examples/Tests/Larmor/inputs.ml
index 9a0c9eb3d..3cfb978d7 100644
--- a/Examples/Tests/Larmor/inputs.ml
+++ b/Examples/Tests/Larmor/inputs.ml
@@ -41,10 +41,6 @@ warpx.B_external = 0.0  0.00078110417851950768  0.0
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/PML/inputs2d b/Examples/Tests/PML/inputs2d
index c84cf0555..5b936a333 100644
--- a/Examples/Tests/PML/inputs2d
+++ b/Examples/Tests/PML/inputs2d
@@ -23,10 +23,6 @@ geometry.prob_hi     =  30.e-6  120.e-6
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 
 warpx.cfl = 1.0
 warpx.do_pml = 1
diff --git a/Examples/Tests/SingleParticle/bilinear_filter_analysis.py b/Examples/Tests/SingleParticle/bilinear_filter_analysis.py
new file mode 100755
index 000000000..1a47796fb
--- /dev/null
+++ b/Examples/Tests/SingleParticle/bilinear_filter_analysis.py
@@ -0,0 +1,38 @@
+#! /usr/bin/env python
+
+import sys
+import yt ; yt.funcs.mylog.setLevel(0)
+import numpy as np
+from scipy import signal
+
+# Build Jx without filter (from other simulation)
+my_F_nofilter = np.zeros([16,16])
+my_F_nofilter[8,8] = -1.601068065642412e-11
+my_F_nofilter[8,7] = -1.601068065642412e-11
+
+# Build 2D filter
+filter0 = np.array([.25,.5,.25])
+my_order = [1,5]
+my_filterx = filter0
+my_filtery = filter0
+while my_order[0]>1:
+    my_filterx = np.convolve(my_filterx,filter0)
+    my_order[0] -= 1
+while my_order[1]>1:
+    my_filtery = np.convolve(my_filtery,filter0)
+    my_order[1] -= 1
+my_filter = my_filterx[:,None]*my_filtery
+
+# Apply filter. my_F_filetered is the theoretical value for filtered field
+my_F_filtered = signal.convolve2d(my_F_nofilter, my_filter, boundary='symm', mode='same')
+
+# Get simulation result for F_filtered
+filename = sys.argv[1]
+ds = yt.load( filename )
+sl = yt.SlicePlot(ds, 2, 'jx', aspect=1)
+all_data_level_0 = ds.covering_grid(level=0,left_edge=ds.domain_left_edge, dims=ds.domain_dimensions)
+F_filtered = all_data_level_0['boxlib', 'jx'].v.squeeze()
+
+# Compare theory and PIC for filtered value
+error = np.sum( np.abs(F_filtered - my_F_filtered) ) / np.sum( np.abs(my_F_filtered) )
+assert( error < 1.e-14 )
diff --git a/Examples/Tests/SingleParticle/inputs b/Examples/Tests/SingleParticle/inputs
new file mode 100644
index 000000000..548848d79
--- /dev/null
+++ b/Examples/Tests/SingleParticle/inputs
@@ -0,0 +1,23 @@
+max_step = 1
+amr.n_cell = 16 16
+amr.max_level = 0
+amr.blocking_factor = 8
+amr.max_grid_size = 8
+amr.plot_int = 1
+geometry.coord_sys   = 0
+geometry.is_periodic = 0 0
+geometry.prob_lo     = -8 -12
+geometry.prob_hi     =  8  12
+warpx.do_pml = 0
+algo.charge_deposition = standard
+algo.field_gathering = standard
+warpx.cfl = 1.0
+
+particles.nspecies = 1
+particles.species_names = electron
+electron.charge = -q_e
+electron.mass = m_e
+electron.injection_style = "SingleParticle"
+electron.single_particle_pos = 0.0   0.0 0.0
+electron.single_particle_vel = 1.e20 0.0 0.0  # gamma*beta
+electron.single_particle_weight = 1.0
diff --git a/Examples/Tests/gpu_test/inputs b/Examples/Tests/gpu_test/inputs
index 2b719655d..e4ae27469 100644
--- a/Examples/Tests/gpu_test/inputs
+++ b/Examples/Tests/gpu_test/inputs
@@ -24,10 +24,8 @@ warpx.do_pml = 0
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 3
-algo.charge_deposition = 0
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.current_deposition = direct
+algo.field_gathering = standard
 
 interpolation.nox = 1
 interpolation.noy = 1
diff --git a/Examples/Tests/laser_on_fine/inputs b/Examples/Tests/laser_on_fine/inputs
index 8fc8876cb..9d3133dd7 100644
--- a/Examples/Tests/laser_on_fine/inputs
+++ b/Examples/Tests/laser_on_fine/inputs
@@ -38,10 +38,9 @@ warpx.pml_ncell = 10
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 1
-algo.charge_deposition = 1
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.current_deposition = esirkepov
+algo.charge_deposition = standard
+algo.field_gathering = standard
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/laser_on_fine/inputs.2d b/Examples/Tests/laser_on_fine/inputs.2d
index c4af2df56..d011ff4ee 100644
--- a/Examples/Tests/laser_on_fine/inputs.2d
+++ b/Examples/Tests/laser_on_fine/inputs.2d
@@ -38,10 +38,9 @@ warpx.pml_ncell = 10
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 1
-algo.charge_deposition = 1
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.current_deposition = esirkepov
+algo.charge_deposition = standard
+algo.field_gathering = standard
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/self_force_test/inputs b/Examples/Tests/self_force_test/inputs
index 1e55c1174..5e7bcc4f8 100644
--- a/Examples/Tests/self_force_test/inputs
+++ b/Examples/Tests/self_force_test/inputs
@@ -42,10 +42,9 @@ warpx.pml_ncell = 10
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 1
-algo.charge_deposition = 1
-algo.field_gathering = 1
-algo.particle_pusher = 0
+algo.current_deposition = esirkepov
+algo.charge_deposition = standard
+algo.field_gathering = standard
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Examples/Tests/subcycling/inputs.2d b/Examples/Tests/subcycling/inputs.2d
index 9790ee973..15034be52 100644
--- a/Examples/Tests/subcycling/inputs.2d
+++ b/Examples/Tests/subcycling/inputs.2d
@@ -37,10 +37,6 @@ warpx.n_current_deposition_buffer = 0
 warpx.n_field_gather_buffer = 0
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 algo.maxwell_fdtd_solver = "ckc"
 
 # CFL
diff --git a/GNUmakefile b/GNUmakefile
index 1acd53be7..5575a558e 100644
--- a/GNUmakefile
+++ b/GNUmakefile
@@ -20,6 +20,8 @@ TINY_PROFILE   = TRUE
 USE_OMP   = TRUE
 USE_GPU   = FALSE
 
+FAB_IS_MANAGED = FALSE
+
 EBASE     = main
 
 USE_PYTHON_MAIN = FALSE
diff --git a/Regression/WarpX-tests.ini b/Regression/WarpX-tests.ini
index 34fbc2769..80580ee42 100644
--- a/Regression/WarpX-tests.ini
+++ b/Regression/WarpX-tests.ini
@@ -102,6 +102,21 @@ compileTest = 0
 doVis = 0
 analysisRoutine = Examples/Modules/nci_corrector/ncicorr_analysis.py
 
+[bilinear_filter]
+buildDir = .
+inputFile = Examples/Tests/SingleParticle/inputs
+runtime_params = warpx.use_filter=1 warpx.filter_npass_each_dir=1 5
+dim = 2
+addToCompileString =
+restartTest = 0
+useMPI = 1
+numprocs = 2
+useOMP = 1
+numthreads = 2
+compileTest = 0
+doVis = 0
+analysisRoutine = Examples/Tests/SingleParticle/bilinear_filter_analysis.py
+
 [Langmuir_2d]
 buildDir = .
 inputFile = Examples/Tests/Langmuir/inputs.rt
@@ -241,7 +256,7 @@ numthreads = 1
 compileTest = 0
 doVis = 0
 compareParticles = 1
-runtime_params = warpx.do_nodal=1
+runtime_params = warpx.do_nodal=1 algo.current_deposition=direct
 particleTypes = electrons positrons
 analysisRoutine = Examples/Tests/Langmuir/langmuir_multi_2d_analysis.py
 analysisOutputImage = langmuir_multi_2d_analysis.png
diff --git a/Source/Diagnostics/BoostedFrameDiagnostic.cpp b/Source/Diagnostics/BoostedFrameDiagnostic.cpp
index 709b7cb48..5e7d4e0ad 100644
--- a/Source/Diagnostics/BoostedFrameDiagnostic.cpp
+++ b/Source/Diagnostics/BoostedFrameDiagnostic.cpp
@@ -754,9 +754,6 @@ writeLabFrameData(const MultiFab* cell_centered_data,
             MultiFab tmp(slice_ba, data_buffer_[i]->DistributionMap(), ncomp, 0);
             tmp.copy(*slice, 0, 0, ncomp);
             
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
             // Copy data from MultiFab tmp to MultiDab data_buffer[i]
             CopySlice(tmp, *data_buffer_[i], i_lab, map_actual_fields_to_dump);
         }
diff --git a/Source/Diagnostics/FieldIO.cpp b/Source/Diagnostics/FieldIO.cpp
index e3d44d1fc..b1181f22f 100644
--- a/Source/Diagnostics/FieldIO.cpp
+++ b/Source/Diagnostics/FieldIO.cpp
@@ -677,7 +677,7 @@ getInterpolatedScalar(
             if ( F_fp.is_nodal() ){
                 IntVect refinement_vector{AMREX_D_DECL(r_ratio, r_ratio, r_ratio)};
                 node_bilinear_interp.interp(cfab, 0, ffab, 0, 1,
-                        finebx, refinement_vector, {}, {}, {}, 0, 0);
+                        finebx, refinement_vector, {}, {}, {}, 0, 0, RunOn::Cpu);
             } else {
                 amrex::Abort("Unknown field staggering.");
             }
diff --git a/Source/Diagnostics/ParticleIO.cpp b/Source/Diagnostics/ParticleIO.cpp
index ba413dcae..f2a543ed5 100644
--- a/Source/Diagnostics/ParticleIO.cpp
+++ b/Source/Diagnostics/ParticleIO.cpp
@@ -5,6 +5,53 @@
 using namespace amrex;
 
 void
+RigidInjectedParticleContainer::ReadHeader (std::istream& is)
+{
+    is >> charge >> mass;
+    WarpX::GotoNextLine(is);
+
+    int nlevs;
+    is >> nlevs;
+    WarpX::GotoNextLine(is);
+
+    AMREX_ASSERT(zinject_plane_levels.size() == 0);
+    AMREX_ASSERT(done_injecting.size() == 0);
+
+    for (int i = 0; i < nlevs; ++i)
+    {
+        int zinject_plane_tmp;
+        is >> zinject_plane_tmp;
+        zinject_plane_levels.push_back(zinject_plane_tmp);
+        WarpX::GotoNextLine(is);        
+    } 
+
+    for (int i = 0; i < nlevs; ++i)
+    {
+        int done_injecting_tmp;
+        is >> done_injecting_tmp;
+        done_injecting.push_back(done_injecting_tmp);
+        WarpX::GotoNextLine(is);
+    }     
+}
+
+void
+RigidInjectedParticleContainer::WriteHeader (std::ostream& os) const
+{
+    // no need to write species_id
+    os << charge << " " << mass << "\n";
+    int nlevs = zinject_plane_levels.size();
+    os << nlevs << "\n";
+    for (int i = 0; i < nlevs; ++i)
+    {
+        os << zinject_plane_levels[i] << "\n";
+    }
+    for (int i = 0; i < nlevs; ++i)
+    {
+        os << done_injecting[i] << "\n";
+    }
+}
+
+void
 WarpXParticleContainer::ReadHeader (std::istream& is)
 {
     is >> charge >> mass;
@@ -27,15 +74,45 @@ MultiParticleContainer::Checkpoint (const std::string& dir) const
 }
 
 void
-MultiParticleContainer::WritePlotFile (const std::string& dir,
-                                       const Vector<std::string>& real_names) const
+MultiParticleContainer::WritePlotFile (const std::string& dir) const
 {
     Vector<std::string> int_names;    
     Vector<int> int_flags;
-    
+
     for (unsigned i = 0, n = species_names.size(); i < n; ++i) {
-        auto& pc = allcontainers[i];
+        auto& pc = allcontainers[i];                
         if (pc->plot_species) {
+
+            Vector<std::string> real_names;
+            real_names.push_back("weight");
+
+            real_names.push_back("momentum_x");
+            real_names.push_back("momentum_y");
+            real_names.push_back("momentum_z");
+            
+            real_names.push_back("Ex");
+            real_names.push_back("Ey");
+            real_names.push_back("Ez");
+            
+            real_names.push_back("Bx");
+            real_names.push_back("By");
+            real_names.push_back("Bz");
+            
+#ifdef WARPX_RZ
+            real_names.push_back("theta");
+#endif
+            
+            if (WarpX::do_boosted_frame_diagnostic && pc->DoBoostedFrameDiags())
+            {
+                real_names.push_back("xold");
+                real_names.push_back("yold");
+                real_names.push_back("zold");
+                
+                real_names.push_back("uxold");
+                real_names.push_back("uyold");
+                real_names.push_back("uzold");
+            }
+                        
             // Convert momentum to SI
             pc->ConvertUnits(ConvertDirection::WarpX_to_SI);
             // real_names contains a list of all particle attributes.
diff --git a/Source/Diagnostics/WarpXIO.cpp b/Source/Diagnostics/WarpXIO.cpp
index 65201ba33..48a066cce 100644
--- a/Source/Diagnostics/WarpXIO.cpp
+++ b/Source/Diagnostics/WarpXIO.cpp
@@ -614,37 +614,7 @@ WarpX::WritePlotFile () const
         }
     }
 
-    Vector<std::string> particle_varnames;
-    particle_varnames.push_back("weight");
-
-    particle_varnames.push_back("momentum_x");
-    particle_varnames.push_back("momentum_y");
-    particle_varnames.push_back("momentum_z");
-
-    particle_varnames.push_back("Ex");
-    particle_varnames.push_back("Ey");
-    particle_varnames.push_back("Ez");
-
-    particle_varnames.push_back("Bx");
-    particle_varnames.push_back("By");
-    particle_varnames.push_back("Bz");
-
-#ifdef WARPX_RZ
-    particle_varnames.push_back("theta");
-#endif
-
-    if (WarpX::do_boosted_frame_diagnostic && WarpX::do_boosted_frame_particles)
-    {
-        particle_varnames.push_back("xold");
-        particle_varnames.push_back("yold");
-        particle_varnames.push_back("zold");
-
-        particle_varnames.push_back("uxold");
-        particle_varnames.push_back("uyold");
-        particle_varnames.push_back("uzold");
-    }
-
-    mypc->WritePlotFile(plotfilename, particle_varnames);
+    mypc->WritePlotFile(plotfilename);
 
     WriteJobInfo(plotfilename);
 
@@ -856,7 +826,7 @@ WarpX::InitializeSliceMultiFabs ()
     current_slice.resize(nlevels);
     Efield_slice.resize(nlevels);
     Bfield_slice.resize(nlevels);
-
+ 
 }
 
 
diff --git a/Source/Evolve/WarpXEvolveEM.cpp b/Source/Evolve/WarpXEvolveEM.cpp
index a5d68e4f9..6d6b68351 100644
--- a/Source/Evolve/WarpXEvolveEM.cpp
+++ b/Source/Evolve/WarpXEvolveEM.cpp
@@ -84,12 +84,14 @@ WarpX::EvolveEM (int numsteps)
                             *Bfield_aux[lev][0],*Bfield_aux[lev][1],*Bfield_aux[lev][2]);
             }
             is_synchronized = false;
+
         } else {
             // Beyond one step, we have E^{n} and B^{n}.
             // Particles have p^{n-1/2} and x^{n}.
             FillBoundaryE();
             FillBoundaryB();
             UpdateAuxilaryData();
+
         }
 
         if (do_subcycling == 0 || finest_level == 0) {
@@ -133,7 +135,8 @@ WarpX::EvolveEM (int numsteps)
         bool to_make_plot = (plot_int > 0) && ((step+1) % plot_int == 0);
 
         // slice generation //
-        bool to_make_slice_plot = (slice_plot_int > 0) && ( (step+1)% slice_plot_int == 0);        
+        bool to_make_slice_plot = (slice_plot_int > 0) && ( (step+1)% slice_plot_int == 0); 
+
         bool do_insitu = ((step+1) >= insitu_start) &&
             (insitu_int > 0) && ((step+1) % insitu_int == 0);
 
@@ -283,6 +286,7 @@ WarpX::OneStep_nosub (Real cur_time)
     if (warpx_py_beforedeposition) warpx_py_beforedeposition();
 #endif
     PushParticlesandDepose(cur_time);
+
 #ifdef WARPX_USE_PY
     if (warpx_py_afterdeposition) warpx_py_afterdeposition();
 #endif
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
index 0487e5226..12718e38b 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
@@ -8,14 +8,18 @@
  */
 class PsatdAlgorithm : public SpectralBaseAlgorithm
 {
+
     public:
         PsatdAlgorithm(const SpectralKSpace& spectral_kspace,
                          const amrex::DistributionMapping& dm,
                          const int norder_x, const int norder_y,
-                         const int norder_z, const bool nodal,
-                         const amrex::Real dt);
-        // Redefine update equation from base class
-        virtual void pushSpectralFields(SpectralFieldData& f) const override final;
+                         const int norder_z, const bool nodal, const amrex::Real dt);
+
+        void InitializeSpectralCoefficients(const SpectralKSpace& spectral_kspace,
+                                    const amrex::DistributionMapping& dm, 
+                                    const amrex::Real dt);
+
+        void pushSpectralFields(SpectralFieldData& f) const override final;
 
     private:
         SpectralCoefficients C_coef, S_ck_coef, X1_coef, X2_coef, X3_coef;
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp
index 37892d35a..d45b01bda 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp
@@ -22,59 +22,8 @@ PsatdAlgorithm::PsatdAlgorithm(const SpectralKSpace& spectral_kspace,
     X2_coef = SpectralCoefficients(ba, dm, 1, 0);
     X3_coef = SpectralCoefficients(ba, dm, 1, 0);
 
-    // Fill them with the right values:
-    // Loop over boxes and allocate the corresponding coefficients
-    // for each box owned by the local MPI proc
-    for (MFIter mfi(ba, dm); mfi.isValid(); ++mfi){
-
-        const Box& bx = ba[mfi];
-
-        // Extract pointers for the k vectors
-        const Real* modified_kx = modified_kx_vec[mfi].dataPtr();
-#if (AMREX_SPACEDIM==3)
-        const Real* modified_ky = modified_ky_vec[mfi].dataPtr();
-#endif
-        const Real* modified_kz = modified_kz_vec[mfi].dataPtr();
-        // Extract arrays for the coefficients
-        Array4<Real> C = C_coef[mfi].array();
-        Array4<Real> S_ck = S_ck_coef[mfi].array();
-        Array4<Real> X1 = X1_coef[mfi].array();
-        Array4<Real> X2 = X2_coef[mfi].array();
-        Array4<Real> X3 = X3_coef[mfi].array();
-
-        // Loop over indices within one box
-        ParallelFor(bx,
-        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
-        {
-            // Calculate norm of vector
-            const Real k_norm = std::sqrt(
-                std::pow(modified_kx[i], 2) +
-#if (AMREX_SPACEDIM==3)
-                std::pow(modified_ky[j], 2) +
-                std::pow(modified_kz[k], 2));
-#else
-                std::pow(modified_kz[j], 2));
-#endif
-
-            // Calculate coefficients
-            constexpr Real c = PhysConst::c;
-            constexpr Real ep0 = PhysConst::ep0;
-            if (k_norm != 0){
-                C(i,j,k) = std::cos(c*k_norm*dt);
-                S_ck(i,j,k) = std::sin(c*k_norm*dt)/(c*k_norm);
-                X1(i,j,k) = (1. - C(i,j,k))/(ep0 * c*c * k_norm*k_norm);
-                X2(i,j,k) = (1. - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
-                X3(i,j,k) = (C(i,j,k) - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
-            } else { // Handle k_norm = 0, by using the analytical limit
-                C(i,j,k) = 1.;
-                S_ck(i,j,k) = dt;
-                X1(i,j,k) = 0.5 * dt*dt / ep0;
-                X2(i,j,k) = c*c * dt*dt / (6.*ep0);
-                X3(i,j,k) = - c*c * dt*dt / (3.*ep0);
-            }
-        });
-    }
-};
+    InitializeSpectralCoefficients(spectral_kspace, dm, dt);
+}
 
 /* Advance the E and B field in spectral space (stored in `f`)
  * over one time step */
@@ -130,13 +79,14 @@ PsatdAlgorithm::pushSpectralFields(SpectralFieldData& f) const{
 #endif
             constexpr Real c2 = PhysConst::c*PhysConst::c;
             constexpr Real inv_ep0 = 1./PhysConst::ep0;
-            constexpr Complex I = Complex{0,1};
+            const Complex I = Complex{0,1};
             const Real C = C_arr(i,j,k);
             const Real S_ck = S_ck_arr(i,j,k);
             const Real X1 = X1_arr(i,j,k);
             const Real X2 = X2_arr(i,j,k);
             const Real X3 = X3_arr(i,j,k);
 
+
             // Update E (see WarpX online documentation: theory section)
             fields(i,j,k,Idx::Ex) = C*Ex_old
                         + S_ck*(c2*I*(ky*Bz_old - kz*By_old) - inv_ep0*Jx)
@@ -160,3 +110,63 @@ PsatdAlgorithm::pushSpectralFields(SpectralFieldData& f) const{
         });
     }
 };
+
+void PsatdAlgorithm::InitializeSpectralCoefficients(const SpectralKSpace& spectral_kspace,
+                                    const amrex::DistributionMapping& dm,
+                                    const amrex::Real dt)
+{
+    const BoxArray& ba = spectral_kspace.spectralspace_ba;
+    // Fill them with the right values:
+    // Loop over boxes and allocate the corresponding coefficients
+    // for each box owned by the local MPI proc
+    for (MFIter mfi(ba, dm); mfi.isValid(); ++mfi){
+
+        const Box& bx = ba[mfi];
+
+        // Extract pointers for the k vectors
+        const Real* modified_kx = modified_kx_vec[mfi].dataPtr();
+#if (AMREX_SPACEDIM==3)
+        const Real* modified_ky = modified_ky_vec[mfi].dataPtr();
+#endif
+        const Real* modified_kz = modified_kz_vec[mfi].dataPtr();
+        // Extract arrays for the coefficients
+        Array4<Real> C = C_coef[mfi].array();
+        Array4<Real> S_ck = S_ck_coef[mfi].array();
+        Array4<Real> X1 = X1_coef[mfi].array();
+        Array4<Real> X2 = X2_coef[mfi].array();
+        Array4<Real> X3 = X3_coef[mfi].array();
+
+        // Loop over indices within one box
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            // Calculate norm of vector
+            const Real k_norm = std::sqrt(
+                std::pow(modified_kx[i], 2) +
+#if (AMREX_SPACEDIM==3)
+                std::pow(modified_ky[j], 2) +
+                std::pow(modified_kz[k], 2));
+#else
+                std::pow(modified_kz[j], 2));
+#endif
+
+
+            // Calculate coefficients
+            constexpr Real c = PhysConst::c;
+            constexpr Real ep0 = PhysConst::ep0;
+            if (k_norm != 0){
+                C(i,j,k) = std::cos(c*k_norm*dt);
+                S_ck(i,j,k) = std::sin(c*k_norm*dt)/(c*k_norm);
+                X1(i,j,k) = (1. - C(i,j,k))/(ep0 * c*c * k_norm*k_norm);
+                X2(i,j,k) = (1. - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
+                X3(i,j,k) = (C(i,j,k) - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
+            } else { // Handle k_norm = 0, by using the analytical limit
+                C(i,j,k) = 1.;
+                S_ck(i,j,k) = dt;
+                X1(i,j,k) = 0.5 * dt*dt / ep0;
+                X2(i,j,k) = c*c * dt*dt / (6.*ep0);
+                X3(i,j,k) = - c*c * dt*dt / (3.*ep0);
+            }
+        });
+     }
+}
diff --git a/Source/FieldSolver/SpectralSolver/SpectralFieldData.H b/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
index 8e58aa1d8..7954414b8 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
@@ -25,7 +25,7 @@ class SpectralFieldData
     // (plans are only initialized for the boxes that are owned by
     // the local MPI rank)
 #ifdef AMREX_USE_GPU
-    // Add cuFFT-specific code
+    using FFTplans = amrex::LayoutData<cufftHandle>;
 #else
     using FFTplans = amrex::LayoutData<fftw_plan>;
 #endif
diff --git a/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp b/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
index 02fa2015f..a2b695568 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
@@ -53,7 +53,38 @@ SpectralFieldData::SpectralFieldData( const BoxArray& realspace_ba,
         // the FFT plan, the valid dimensions are those of the real-space box.
         IntVect fft_size = realspace_ba[mfi].length();
 #ifdef AMREX_USE_GPU
-        // Add cuFFT-specific code
+        // Create cuFFT plans
+        // Creating 3D plan for real to complex -- double precision
+        // Assuming CUDA is used for programming GPU 
+        // Note that D2Z is inherently forward plan 
+        // and  Z2D is inherently backward plan 
+        cufftResult result;
+#if (AMREX_SPACEDIM == 3)
+        result = cufftPlan3d( &forward_plan[mfi], fft_size[2], 
+                              fft_size[1],fft_size[0], CUFFT_D2Z);
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan3d forward failed! \n";
+        }
+
+        result = cufftPlan3d( &backward_plan[mfi], fft_size[2], 
+                              fft_size[1], fft_size[0], CUFFT_Z2D);
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan3d backward failed! \n";
+        }
+#else
+        result = cufftPlan2d( &forward_plan[mfi], fft_size[1], 
+                              fft_size[0], CUFFT_D2Z );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan2d forward failed! \n";
+        }
+
+        result = cufftPlan2d( &backward_plan[mfi], fft_size[1], 
+                               fft_size[0], CUFFT_Z2D );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan2d backward failed! \n";
+        }
+#endif
+
 #else
         // Create FFTW plans
         forward_plan[mfi] =
@@ -86,7 +117,9 @@ SpectralFieldData::~SpectralFieldData()
     if (tmpRealField.size() > 0){
         for ( MFIter mfi(tmpRealField); mfi.isValid(); ++mfi ){
 #ifdef AMREX_USE_GPU
-            // Add cuFFT-specific code
+            // Destroy cuFFT plans
+            cufftDestroy( forward_plan[mfi] );
+            cufftDestroy( backward_plan[mfi] );
 #else
             // Destroy FFTW plans
             fftw_destroy_plan( forward_plan[mfi] );
@@ -129,14 +162,25 @@ SpectralFieldData::ForwardTransform( const MultiFab& mf,
             Array4<Real> tmp_arr = tmpRealField[mfi].array();
             ParallelFor( realspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
-                tmp_arr(i,j,k) = mf_arr(i,j,k,i_comp);
+                tmp_arr(i,j,k) = mf_arr(i,j,k,i_comp);                
             });
         }
 
         // Perform Fourier transform from `tmpRealField` to `tmpSpectralField`
 #ifdef AMREX_USE_GPU
-        // Add cuFFT-specific code ; make sure that this is done on the same
-        // GPU stream as the above copy
+        // Perform Fast Fourier Transform on GPU using cuFFT 
+        // make sure that this is done on the same 
+        // GPU stream as the above copy        
+        cufftResult result;
+        cudaStream_t stream = amrex::Gpu::Device::cudaStream(); 
+        cufftSetStream ( forward_plan[mfi], stream);
+        result = cufftExecD2Z( forward_plan[mfi], 
+                               tmpRealField[mfi].dataPtr(), 
+                               reinterpret_cast<cuDoubleComplex*>(
+                               tmpSpectralField[mfi].dataPtr()) );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " forward transform using cufftExecD2Z failed ! \n";
+        }
 #else
         fftw_execute( forward_plan[mfi] );
 #endif
@@ -155,6 +199,7 @@ SpectralFieldData::ForwardTransform( const MultiFab& mf,
             const Complex* zshift_arr = zshift_FFTfromCell[mfi].dataPtr();
             // Loop over indices within one box
             const Box spectralspace_bx = tmpSpectralField[mfi].box();
+
             ParallelFor( spectralspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
                 Complex spectral_field_value = tmp_arr(i,j,k);
@@ -207,6 +252,7 @@ SpectralFieldData::BackwardTransform( MultiFab& mf,
             const Complex* zshift_arr = zshift_FFTtoCell[mfi].dataPtr();
             // Loop over indices within one box
             const Box spectralspace_bx = tmpSpectralField[mfi].box();
+
             ParallelFor( spectralspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
                 Complex spectral_field_value = field_arr(i,j,k,field_index);
@@ -225,8 +271,19 @@ SpectralFieldData::BackwardTransform( MultiFab& mf,
 
         // Perform Fourier transform from `tmpSpectralField` to `tmpRealField`
 #ifdef AMREX_USE_GPU
-        // Add cuFFT-specific code ; make sure that this is done on the same
+        // Perform Fast Fourier Transform on GPU using cuFFT. 
+        // make sure that this is done on the same 
         // GPU stream as the above copy
+        cufftResult result;
+        cudaStream_t stream = amrex::Gpu::Device::cudaStream(); 
+        cufftSetStream ( backward_plan[mfi], stream);
+        result = cufftExecZ2D( backward_plan[mfi], 
+                               reinterpret_cast<cuDoubleComplex*>(
+                               tmpSpectralField[mfi].dataPtr()),
+                               tmpRealField[mfi].dataPtr() );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " Backward transform using cufftexecZ2D failed! \n";
+        }
 #else
         fftw_execute( backward_plan[mfi] );
 #endif
@@ -240,6 +297,7 @@ SpectralFieldData::BackwardTransform( MultiFab& mf,
             Array4<const Real> tmp_arr = tmpRealField[mfi].array();
             // Normalization: divide by the number of points in realspace
             const Real inv_N = 1./realspace_bx.numPts();
+
             ParallelFor( realspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
                 // Copy and normalize field
diff --git a/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp b/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp
index 2fe78cedd..6fe5e3939 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp
@@ -142,9 +142,14 @@ SpectralKSpace::getSpectralShiftFactor( const DistributionMapping& dm,
             case ShiftType::TransformFromCellCentered: sign = -1.; break;
             case ShiftType::TransformToCellCentered: sign = 1.;
         }
-        constexpr Complex I{0,1};
+        const Complex I{0,1};
         for (int i=0; i<k.size(); i++ ){
+#ifdef AMREX_USE_GPU
+            shift[i] = thrust::exp( I*sign*k[i]*0.5*dx[i_dim] );
+#else
             shift[i] = std::exp( I*sign*k[i]*0.5*dx[i_dim] );
+#endif
+        
         }
     }
     return shift_factor;
diff --git a/Source/FieldSolver/WarpXFFT.cpp b/Source/FieldSolver/WarpXFFT.cpp
index 1cf5460f2..13d92f6f3 100644
--- a/Source/FieldSolver/WarpXFFT.cpp
+++ b/Source/FieldSolver/WarpXFFT.cpp
@@ -56,6 +56,7 @@ BuildFFTOwnerMask (const MultiFab& mf, const Geometry& geom)
         for (const auto& b : bl) {
             fab.setVal(nonowner, b, 0, 1);
         }
+
     }
 
     return mask;
@@ -89,7 +90,7 @@ CopyDataFromFFTToValid (MultiFab& mf, const MultiFab& mf_fft, const BoxArray& ba
 
         const FArrayBox& srcfab = mf_fft[mfi];
         const Box& srcbox = srcfab.box();
-
+ 
         if (srcbox.contains(bx))
         {
             // Copy the interior region (without guard cells)
@@ -107,6 +108,8 @@ CopyDataFromFFTToValid (MultiFab& mf, const MultiFab& mf_fft, const BoxArray& ba
     // the cell that has non-zero mask is the one which is retained.
     mf.setVal(0.0, 0);
     mf.ParallelAdd(mftmp);
+
+  
 }
 
 }
@@ -407,6 +410,7 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
 
     BL_PROFILE_VAR_START(blp_push_eb);
     if (fft_hybrid_mpi_decomposition){
+#ifndef AMREX_USE_CUDA // When running on CPU: use PICSAR code
         if (Efield_fp_fft[lev][0]->local_size() == 1)
            //Only one FFT patch on this MPI
         {
@@ -447,6 +451,9 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
         {
     	amrex::Abort("WarpX::PushPSATD: TODO");
         }
+#else // AMREX_USE_CUDA is defined ; running on GPU
+        amrex::Abort("The option `psatd.fft_hybrid_mpi_decomposition` does not work on GPU.");
+#endif
     } else {
         // Not using the hybrid decomposition
         auto& solver = *spectral_solver_fp[lev];
@@ -474,6 +481,7 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
         solver.BackwardTransform(*Bfield_fp_fft[lev][0], SpectralFieldIndex::Bx);
         solver.BackwardTransform(*Bfield_fp_fft[lev][1], SpectralFieldIndex::By);
         solver.BackwardTransform(*Bfield_fp_fft[lev][2], SpectralFieldIndex::Bz);
+
     }
     BL_PROFILE_VAR_STOP(blp_push_eb);
 
@@ -490,4 +498,7 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
     {
         amrex::Abort("WarpX::PushPSATD: TODO");
     }
+
 }
+
+
diff --git a/Source/Filter/BilinearFilter.H b/Source/Filter/BilinearFilter.H
index 9bade3c77..4fcfec14b 100644
--- a/Source/Filter/BilinearFilter.H
+++ b/Source/Filter/BilinearFilter.H
@@ -1,30 +1,17 @@
-#include <AMReX_MultiFab.H>
-#include <AMReX_CudaContainers.H>
+#include <Filter.H>
 
-#ifndef WARPX_FILTER_H_
-#define WARPX_FILTER_H_
+#ifndef WARPX_BILIN_FILTER_H_
+#define WARPX_BILIN_FILTER_H_
 
-class BilinearFilter
+class BilinearFilter : public Filter
 {
 public:
-    BilinearFilter () = default;
+
+    BilinearFilter() = default;
     
     void ComputeStencils();
-    void ApplyStencil(amrex::MultiFab& dstmf, const amrex::MultiFab& srcmf, int scomp=0, int dcomp=0, int ncomp=10000);
 
     amrex::IntVect npass_each_dir;
-    amrex::IntVect stencil_length_each_dir;
-
-    // public for cuda
-    void Filter(const amrex::Box& tbx,
-                amrex::Array4<amrex::Real const> const& tmp,
-                amrex::Array4<amrex::Real      > const& dst,
-                int scomp, int dcomp, int ncomp);
-
-private:
-
-    amrex::Gpu::ManagedVector<amrex::Real> stencil_x, stencil_y, stencil_z;
 
-    amrex::Dim3 slen;
 };
-#endif
+#endif // #ifndef WARPX_BILIN_FILTER_H_
diff --git a/Source/Filter/BilinearFilter.cpp b/Source/Filter/BilinearFilter.cpp
index f6acaa5df..68ebde687 100644
--- a/Source/Filter/BilinearFilter.cpp
+++ b/Source/Filter/BilinearFilter.cpp
@@ -68,173 +68,3 @@ void BilinearFilter::ComputeStencils(){
     slen.z = 1;
 #endif
 }
-
-
-#ifdef AMREX_USE_CUDA
-
-void
-BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
-{
-    BL_PROFILE("BilinearFilter::ApplyStencil()");
-    ncomp = std::min(ncomp, srcmf.nComp());
-
-    for (MFIter mfi(dstmf); mfi.isValid(); ++mfi)
-    {
-        const auto& src = srcmf.array(mfi);
-        const auto& dst = dstmf.array(mfi);
-        const Box& tbx = mfi.growntilebox();
-        const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
-
-        // tmpfab has enough ghost cells for the stencil
-        FArrayBox tmp_fab(gbx,ncomp);
-        Elixir tmp_eli = tmp_fab.elixir();  // Prevent the tmp data from being deleted too early
-        auto const& tmp = tmp_fab.array();
-
-        // Copy values in srcfab into tmpfab
-        const Box& ibx = gbx & srcmf[mfi].box();
-        AMREX_PARALLEL_FOR_4D ( gbx, ncomp, i, j, k, n,
-        {
-            if (ibx.contains(IntVect(AMREX_D_DECL(i,j,k)))) {
-                tmp(i,j,k,n) = src(i,j,k,n+scomp);
-            } else {
-                tmp(i,j,k,n) = 0.0;
-            }
-        });
-
-        // Apply filter
-        Filter(tbx, tmp, dst, 0, dcomp, ncomp);
-    }
-}
-
-void BilinearFilter::Filter (const Box& tbx,
-                             Array4<Real const> const& tmp,
-                             Array4<Real      > const& dst,
-                             int scomp, int dcomp, int ncomp)
-{
-    amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
-    amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
-    amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
-    Dim3 slen_local = slen;
-    AMREX_PARALLEL_FOR_4D ( tbx, ncomp, i, j, k, n,
-    {
-        Real d = 0.0;
-
-        for         (int iz=0; iz < slen_local.z; ++iz){
-            for     (int iy=0; iy < slen_local.y; ++iy){
-                for (int ix=0; ix < slen_local.x; ++ix){
-#if (AMREX_SPACEDIM == 3)        
-                    Real sss = sx[ix]*sy[iy]*sz[iz];
-#else
-                    Real sss = sx[ix]*sz[iy];
-#endif                        
-#if (AMREX_SPACEDIM == 3)
-                    d += sss*( tmp(i-ix,j-iy,k-iz,scomp+n)
-                              +tmp(i+ix,j-iy,k-iz,scomp+n)
-                              +tmp(i-ix,j+iy,k-iz,scomp+n)
-                              +tmp(i+ix,j+iy,k-iz,scomp+n)
-                              +tmp(i-ix,j-iy,k+iz,scomp+n)
-                              +tmp(i+ix,j-iy,k+iz,scomp+n)
-                              +tmp(i-ix,j+iy,k+iz,scomp+n)
-                              +tmp(i+ix,j+iy,k+iz,scomp+n));
-#else
-                    d += sss*( tmp(i-ix,j-iy,k,scomp+n)
-                              +tmp(i+ix,j-iy,k,scomp+n)
-                              +tmp(i-ix,j+iy,k,scomp+n)
-                              +tmp(i+ix,j+iy,k,scomp+n));
-#endif
-                }
-            }
-        }
-
-        dst(i,j,k,dcomp+n) = d;
-    });
-}
-
-#else
-
-void
-BilinearFilter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
-{
-    BL_PROFILE("BilinearFilter::ApplyStencil()");
-    ncomp = std::min(ncomp, srcmf.nComp());
-#ifdef _OPENMP
-#pragma omp parallel
-#endif
-    {
-        FArrayBox tmpfab;
-        for (MFIter mfi(dstmf,true); mfi.isValid(); ++mfi){
-            const auto& srcfab = srcmf[mfi];
-            auto& dstfab = dstmf[mfi];
-            const Box& tbx = mfi.growntilebox();
-            const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
-            // tmpfab has enough ghost cells for the stencil
-            tmpfab.resize(gbx,ncomp);
-            tmpfab.setVal(0.0, gbx, 0, ncomp);
-            // Copy values in srcfab into tmpfab
-            const Box& ibx = gbx & srcfab.box();
-            tmpfab.copy(srcfab, ibx, scomp, ibx, 0, ncomp);
-            // Apply filter
-            Filter(tbx, tmpfab.array(), dstfab.array(), 0, dcomp, ncomp);
-        }
-    }
-}
-
-void BilinearFilter::Filter (const Box& tbx,
-                             Array4<Real const> const& tmp,
-                             Array4<Real      > const& dst,
-                             int scomp, int dcomp, int ncomp)
-{
-    const auto lo = amrex::lbound(tbx);
-    const auto hi = amrex::ubound(tbx);
-    // tmp and dst are of type Array4 (Fortran ordering)
-    amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
-    amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
-    amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
-    for (int n = 0; n < ncomp; ++n) {
-        // Set dst value to 0.
-        for         (int k = lo.z; k <= hi.z; ++k) {
-            for     (int j = lo.y; j <= hi.y; ++j) {
-                for (int i = lo.x; i <= hi.x; ++i) {
-                    dst(i,j,k,dcomp+n) = 0.0;
-                }
-            }
-        }
-        // 3 nested loop on 3D stencil
-        for         (int iz=0; iz < slen.z; ++iz){
-            for     (int iy=0; iy < slen.y; ++iy){
-                for (int ix=0; ix < slen.x; ++ix){
-#if (AMREX_SPACEDIM == 3)        
-                    Real sss = sx[ix]*sy[iy]*sz[iz];
-#else
-                    Real sss = sx[ix]*sz[iy];
-#endif
-                    // 3 nested loop on 3D array
-                    for         (int k = lo.z; k <= hi.z; ++k) {
-                        for     (int j = lo.y; j <= hi.y; ++j) {
-                            AMREX_PRAGMA_SIMD
-                            for (int i = lo.x; i <= hi.x; ++i) {
-#if (AMREX_SPACEDIM == 3)
-                                dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
-                                                          +tmp(i+ix,j-iy,k-iz,scomp+n)
-                                                          +tmp(i-ix,j+iy,k-iz,scomp+n)
-                                                          +tmp(i+ix,j+iy,k-iz,scomp+n)
-                                                          +tmp(i-ix,j-iy,k+iz,scomp+n)
-                                                          +tmp(i+ix,j-iy,k+iz,scomp+n)
-                                                          +tmp(i-ix,j+iy,k+iz,scomp+n)
-                                                          +tmp(i+ix,j+iy,k+iz,scomp+n));
-#else
-                                dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
-                                                          +tmp(i+ix,j-iy,k,scomp+n)
-                                                          +tmp(i-ix,j+iy,k,scomp+n)
-                                                          +tmp(i+ix,j+iy,k,scomp+n));
-#endif
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-#endif
diff --git a/Source/Filter/Filter.H b/Source/Filter/Filter.H
new file mode 100644
index 000000000..eaa0498c9
--- /dev/null
+++ b/Source/Filter/Filter.H
@@ -0,0 +1,43 @@
+#include <AMReX_MultiFab.H>
+#include <AMReX_CudaContainers.H>
+
+#ifndef WARPX_FILTER_H_
+#define WARPX_FILTER_H_
+
+class Filter
+{
+public:
+    Filter () = default;
+
+    // Apply stencil on MultiFab.
+    // Guard cells are handled inside this function
+    void ApplyStencil(amrex::MultiFab& dstmf, 
+                              const amrex::MultiFab& srcmf, int scomp=0, 
+                              int dcomp=0, int ncomp=10000);
+
+    // Apply stencil on a FabArray.
+    void ApplyStencil (amrex::FArrayBox& dstfab, 
+                       const amrex::FArrayBox& srcfab, const amrex::Box& tbx, 
+                       int scomp=0, int dcomp=0, int ncomp=10000);
+
+    // public for cuda
+    void DoFilter(const amrex::Box& tbx,
+                          amrex::Array4<amrex::Real const> const& tmp,
+                          amrex::Array4<amrex::Real      > const& dst,
+                          int scomp, int dcomp, int ncomp);
+
+    // In 2D, stencil_length_each_dir = {length(stencil_x), length(stencil_z)}
+    amrex::IntVect stencil_length_each_dir;
+
+protected:
+    // Stencil along each direction.
+    // in 2D, stencil_y is not initialized.
+    amrex::Gpu::ManagedVector<amrex::Real> stencil_x, stencil_y, stencil_z;
+    // Length of each stencil.
+    // In 2D, slen = {length(stencil_x), length(stencil_z), 1}
+    amrex::Dim3 slen;
+
+private:
+
+};
+#endif // #ifndef WARPX_FILTER_H_
diff --git a/Source/Filter/Filter.cpp b/Source/Filter/Filter.cpp
new file mode 100644
index 000000000..f8a2dd6c2
--- /dev/null
+++ b/Source/Filter/Filter.cpp
@@ -0,0 +1,257 @@
+#include <WarpX.H>
+#include <Filter.H>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+using namespace amrex;
+
+#ifdef AMREX_USE_CUDA
+
+/* \brief Apply stencil on MultiFab (GPU version, 2D/3D).
+ * \param dstmf Destination MultiFab
+ * \param srcmf source MultiFab
+ * \param scomp first component of srcmf on which the filter is applied
+ * \param dcomp first component of dstmf on which the filter is applied
+ * \param ncomp Number of components on which the filter is applied.
+ */
+void
+Filter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
+{
+    BL_PROFILE("BilinearFilter::ApplyStencil(MultiFab)");
+    ncomp = std::min(ncomp, srcmf.nComp());
+
+    for (MFIter mfi(dstmf); mfi.isValid(); ++mfi)
+    {
+        const auto& src = srcmf.array(mfi);
+        const auto& dst = dstmf.array(mfi);
+        const Box& tbx = mfi.growntilebox();
+        const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
+
+        // tmpfab has enough ghost cells for the stencil
+        FArrayBox tmp_fab(gbx,ncomp);
+        Elixir tmp_eli = tmp_fab.elixir();  // Prevent the tmp data from being deleted too early
+        auto const& tmp = tmp_fab.array();
+
+        // Copy values in srcfab into tmpfab
+        const Box& ibx = gbx & srcmf[mfi].box();
+        AMREX_PARALLEL_FOR_4D ( gbx, ncomp, i, j, k, n,
+        {
+            if (ibx.contains(IntVect(AMREX_D_DECL(i,j,k)))) {
+                tmp(i,j,k,n) = src(i,j,k,n+scomp);
+            } else {
+                tmp(i,j,k,n) = 0.0;
+            }
+        });
+
+        // Apply filter
+        DoFilter(tbx, tmp, dst, 0, dcomp, ncomp);
+    }
+}
+
+/* \brief Apply stencil on FArrayBox (GPU version, 2D/3D).
+ * \param dstfab Destination FArrayBox
+ * \param srcmf source FArrayBox
+ * \param tbx Grown box on which srcfab is defined.
+ * \param scomp first component of srcfab on which the filter is applied
+ * \param dcomp first component of dstfab on which the filter is applied
+ * \param ncomp Number of components on which the filter is applied.
+ */
+void
+Filter::ApplyStencil (FArrayBox& dstfab, const FArrayBox& srcfab, 
+                      const Box& tbx, int scomp, int dcomp, int ncomp)
+{
+    BL_PROFILE("BilinearFilter::ApplyStencil(FArrayBox)");
+    ncomp = std::min(ncomp, srcfab.nComp());
+    const auto& src = srcfab.array();
+    const auto& dst = dstfab.array();
+    const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
+
+    // tmpfab has enough ghost cells for the stencil
+    FArrayBox tmp_fab(gbx,ncomp);
+    Elixir tmp_eli = tmp_fab.elixir();  // Prevent the tmp data from being deleted too early
+    auto const& tmp = tmp_fab.array();
+
+    // Copy values in srcfab into tmpfab
+    const Box& ibx = gbx & srcfab.box();
+    AMREX_PARALLEL_FOR_4D ( gbx, ncomp, i, j, k, n,
+        {
+            if (ibx.contains(IntVect(AMREX_D_DECL(i,j,k)))) {
+                tmp(i,j,k,n) = src(i,j,k,n+scomp);
+            } else {
+                tmp(i,j,k,n) = 0.0;
+            }
+        });
+
+    // Apply filter
+    DoFilter(tbx, tmp, dst, 0, dcomp, ncomp);
+}
+
+/* \brief Apply stencil (2D/3D, CPU/GPU)
+ */
+void Filter::DoFilter (const Box& tbx,
+                       Array4<Real const> const& tmp,
+                       Array4<Real      > const& dst,
+                       int scomp, int dcomp, int ncomp)
+{
+    amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
+    amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
+    amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
+    Dim3 slen_local = slen;
+    AMREX_PARALLEL_FOR_4D ( tbx, ncomp, i, j, k, n,
+    {
+        Real d = 0.0;
+
+        for         (int iz=0; iz < slen_local.z; ++iz){
+            for     (int iy=0; iy < slen_local.y; ++iy){
+                for (int ix=0; ix < slen_local.x; ++ix){
+#if (AMREX_SPACEDIM == 3)        
+                    Real sss = sx[ix]*sy[iy]*sz[iz];
+#else
+                    Real sss = sx[ix]*sz[iy];
+#endif                        
+#if (AMREX_SPACEDIM == 3)
+                    d += sss*( tmp(i-ix,j-iy,k-iz,scomp+n)
+                              +tmp(i+ix,j-iy,k-iz,scomp+n)
+                              +tmp(i-ix,j+iy,k-iz,scomp+n)
+                              +tmp(i+ix,j+iy,k-iz,scomp+n)
+                              +tmp(i-ix,j-iy,k+iz,scomp+n)
+                              +tmp(i+ix,j-iy,k+iz,scomp+n)
+                              +tmp(i-ix,j+iy,k+iz,scomp+n)
+                              +tmp(i+ix,j+iy,k+iz,scomp+n));
+#else
+                    d += sss*( tmp(i-ix,j-iy,k,scomp+n)
+                              +tmp(i+ix,j-iy,k,scomp+n)
+                              +tmp(i-ix,j+iy,k,scomp+n)
+                              +tmp(i+ix,j+iy,k,scomp+n));
+#endif
+                }
+            }
+        }
+
+        dst(i,j,k,dcomp+n) = d;
+    });
+}
+
+#else
+
+/* \brief Apply stencil on MultiFab (CPU version, 2D/3D).
+ * \param dstmf Destination MultiFab
+ * \param srcmf source MultiFab
+ * \param scomp first component of srcmf on which the filter is applied
+ * \param dcomp first component of dstmf on which the filter is applied
+ * \param ncomp Number of components on which the filter is applied.
+ */
+void
+Filter::ApplyStencil (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp)
+{
+    BL_PROFILE("BilinearFilter::ApplyStencil()");
+    ncomp = std::min(ncomp, srcmf.nComp());
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    {
+        FArrayBox tmpfab;
+        for (MFIter mfi(dstmf,true); mfi.isValid(); ++mfi){
+            const auto& srcfab = srcmf[mfi];
+            auto& dstfab = dstmf[mfi];
+            const Box& tbx = mfi.growntilebox();
+            const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
+            // tmpfab has enough ghost cells for the stencil
+            tmpfab.resize(gbx,ncomp);
+            tmpfab.setVal(0.0, gbx, 0, ncomp);
+            // Copy values in srcfab into tmpfab
+            const Box& ibx = gbx & srcfab.box();
+            tmpfab.copy(srcfab, ibx, scomp, ibx, 0, ncomp);
+            // Apply filter
+            DoFilter(tbx, tmpfab.array(), dstfab.array(), 0, dcomp, ncomp);
+        }
+    }
+}
+
+/* \brief Apply stencil on FArrayBox (CPU version, 2D/3D).
+ * \param dstfab Destination FArrayBox
+ * \param srcmf source FArrayBox
+ * \param tbx Grown box on which srcfab is defined.
+ * \param scomp first component of srcfab on which the filter is applied
+ * \param dcomp first component of dstfab on which the filter is applied
+ * \param ncomp Number of components on which the filter is applied.
+ */
+void
+Filter::ApplyStencil (FArrayBox& dstfab, const FArrayBox& srcfab, 
+                      const Box& tbx, int scomp, int dcomp, int ncomp)
+{
+    BL_PROFILE("BilinearFilter::ApplyStencil(FArrayBox)");
+    ncomp = std::min(ncomp, srcfab.nComp());
+    FArrayBox tmpfab;
+    const Box& gbx = amrex::grow(tbx,stencil_length_each_dir-1);
+    // tmpfab has enough ghost cells for the stencil
+    tmpfab.resize(gbx,ncomp);
+    tmpfab.setVal(0.0, gbx, 0, ncomp);
+    // Copy values in srcfab into tmpfab
+    const Box& ibx = gbx & srcfab.box();
+    tmpfab.copy(srcfab, ibx, scomp, ibx, 0, ncomp);
+    // Apply filter
+    DoFilter(tbx, tmpfab.array(), dstfab.array(), 0, dcomp, ncomp);
+}
+
+void Filter::DoFilter (const Box& tbx,
+                       Array4<Real const> const& tmp,
+                       Array4<Real      > const& dst,
+                       int scomp, int dcomp, int ncomp)
+{
+    const auto lo = amrex::lbound(tbx);
+    const auto hi = amrex::ubound(tbx);
+    // tmp and dst are of type Array4 (Fortran ordering)
+    amrex::Real const* AMREX_RESTRICT sx = stencil_x.data();
+    amrex::Real const* AMREX_RESTRICT sy = stencil_y.data();
+    amrex::Real const* AMREX_RESTRICT sz = stencil_z.data();
+    for (int n = 0; n < ncomp; ++n) {
+        // Set dst value to 0.
+        for         (int k = lo.z; k <= hi.z; ++k) {
+            for     (int j = lo.y; j <= hi.y; ++j) {
+                for (int i = lo.x; i <= hi.x; ++i) {
+                    dst(i,j,k,dcomp+n) = 0.0;
+                }
+            }
+        }
+        // 3 nested loop on 3D stencil
+        for         (int iz=0; iz < slen.z; ++iz){
+            for     (int iy=0; iy < slen.y; ++iy){
+                for (int ix=0; ix < slen.x; ++ix){
+#if (AMREX_SPACEDIM == 3)        
+                    Real sss = sx[ix]*sy[iy]*sz[iz];
+#else
+                    Real sss = sx[ix]*sz[iy];
+#endif
+                    // 3 nested loop on 3D array
+                    for         (int k = lo.z; k <= hi.z; ++k) {
+                        for     (int j = lo.y; j <= hi.y; ++j) {
+                            AMREX_PRAGMA_SIMD
+                            for (int i = lo.x; i <= hi.x; ++i) {
+#if (AMREX_SPACEDIM == 3)
+                                dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k-iz,scomp+n)
+                                                          +tmp(i+ix,j-iy,k-iz,scomp+n)
+                                                          +tmp(i-ix,j+iy,k-iz,scomp+n)
+                                                          +tmp(i+ix,j+iy,k-iz,scomp+n)
+                                                          +tmp(i-ix,j-iy,k+iz,scomp+n)
+                                                          +tmp(i+ix,j-iy,k+iz,scomp+n)
+                                                          +tmp(i-ix,j+iy,k+iz,scomp+n)
+                                                          +tmp(i+ix,j+iy,k+iz,scomp+n));
+#else
+                                dst(i,j,k,dcomp+n) += sss*(tmp(i-ix,j-iy,k,scomp+n)
+                                                          +tmp(i+ix,j-iy,k,scomp+n)
+                                                          +tmp(i-ix,j+iy,k,scomp+n)
+                                                          +tmp(i+ix,j+iy,k,scomp+n));
+#endif
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+#endif // #ifdef AMREX_USE_CUDA
diff --git a/Source/Filter/Make.package b/Source/Filter/Make.package
index 36e74bfb0..8b8e9b50b 100644
--- a/Source/Filter/Make.package
+++ b/Source/Filter/Make.package
@@ -1,5 +1,9 @@
+CEXE_headers += Filter.H
+CEXE_sources += Filter.cpp
 CEXE_sources += BilinearFilter.cpp
 CEXE_headers += BilinearFilter.H
+CEXE_sources += NCIGodfreyFilter.cpp
+CEXE_headers += NCIGodfreyFilter.H
 
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Filter
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Filter
diff --git a/Source/Filter/NCIGodfreyFilter.H b/Source/Filter/NCIGodfreyFilter.H
new file mode 100644
index 000000000..a53039dfa
--- /dev/null
+++ b/Source/Filter/NCIGodfreyFilter.H
@@ -0,0 +1,30 @@
+#include <Filter.H>
+
+#ifndef WARPX_GODFREY_FILTER_H_
+#define WARPX_GODFREY_FILTER_H_
+
+enum class godfrey_coeff_set { Ex_Ey_Bz=0, Bx_By_Ez=1 };
+
+class NCIGodfreyFilter : public Filter
+{
+public:
+
+    NCIGodfreyFilter () = default;
+
+    NCIGodfreyFilter(godfrey_coeff_set coeff_set_, amrex::Real cdtodz_, amrex::Real l_lower_order_in_v_);
+
+    void ComputeStencils();
+
+    void getGodfreyCoeffs(godfrey_coeff_set coeff_set_in);
+
+    static constexpr int stencil_width = 4;
+
+private:
+    
+    godfrey_coeff_set coeff_set;
+    amrex::Real cdtodz;
+    int l_lower_order_in_v;
+
+};
+
+#endif // #ifndef WARPX_GODFREY_FILTER_H_
diff --git a/Source/Filter/NCIGodfreyFilter.cpp b/Source/Filter/NCIGodfreyFilter.cpp
new file mode 100644
index 000000000..3f589260a
--- /dev/null
+++ b/Source/Filter/NCIGodfreyFilter.cpp
@@ -0,0 +1,78 @@
+#include <WarpX.H>
+#include <NCIGodfreyFilter.H>
+#include <NCIGodfreyTables.H>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+using namespace amrex;
+
+NCIGodfreyFilter::NCIGodfreyFilter(godfrey_coeff_set coeff_set_, amrex::Real cdtodz_, amrex::Real l_lower_order_in_v_){
+    // Store parameters into class data members
+    coeff_set = coeff_set_;
+    cdtodz = cdtodz_;
+    l_lower_order_in_v = l_lower_order_in_v_;
+    // NCI Godfrey filter has fixed size, and is applied along z only.
+#if (AMREX_SPACEDIM == 3)
+    stencil_length_each_dir = {1,1,5};
+    slen = {1,1,5};
+#else
+    stencil_length_each_dir = {1,5};
+    slen = {1,5,1};
+#endif    
+}
+
+void NCIGodfreyFilter::ComputeStencils(){
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
+#if ( AMREX_SPACEDIM == 3 )
+        slen.z==5,
+#else
+        slen.y==5, 
+#endif
+        "ERROR: NCI filter requires 5 points in z");
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(l_lower_order_in_v==1, 
+        "ERROR: NCI corrector requires l_lower_order_in_v=1, i.e., Galerkin scheme");
+    
+    // Interpolate coefficients from the table, and store into prestencil.
+    int index = tab_length*cdtodz;
+    index = min(index, tab_length-2);
+    index = max(index, 0);
+    Real weight_right = cdtodz - index/tab_length;
+    Real prestencil[4];
+    for(int i=0; i<tab_width; i++){
+        if        (coeff_set == godfrey_coeff_set::Ex_Ey_Bz) {
+            prestencil[i] = (1-weight_right)*table_nci_godfrey_Ex_Ey_Bz[index  ][i] + 
+                                weight_right*table_nci_godfrey_Ex_Ey_Bz[index+1][i];
+        } else if (coeff_set == godfrey_coeff_set::Bx_By_Ez) {
+            prestencil[i] = (1-weight_right)*table_nci_godfrey_Bx_By_Ez[index  ][i] + 
+                                weight_right*table_nci_godfrey_Bx_By_Ez[index+1][i];
+        }
+    }
+
+    // Compute stencil_z
+    stencil_z.resize( 5 );
+    stencil_z[0] =  (256 + 128*prestencil[0] + 96*prestencil[1] + 80*prestencil[2] + 70*prestencil[3]) / 256;
+    stencil_z[1] = -(       64*prestencil[0] + 64*prestencil[1] + 60*prestencil[2] + 56*prestencil[3]) / 256;
+    stencil_z[2] =  (                          16*prestencil[1] + 24*prestencil[2] + 28*prestencil[3]) / 256;
+    stencil_z[3] = -(                                              4*prestencil[2] +  8*prestencil[3]) / 256;
+    stencil_z[4] =  (                                                                 1*prestencil[3]) / 256;
+
+    // Compute stencil_x and stencil_y (no filter in these directions, 
+    // so only 1 coeff, equal to 1)
+    stencil_x.resize(1);
+    stencil_x[0] = 1.;
+#if (AMREX_SPACEDIM == 3)
+    stencil_y.resize(1);
+    stencil_y[0] = 1.;
+#endif
+
+    // Due to the way Filter::DoFilter() is written, 
+    // coefficient 0 has to be /2
+    stencil_x[0] /= 2.;
+#if (AMREX_SPACEDIM == 3)
+    stencil_y[0] /= 2.;
+#endif
+    stencil_z[0] /= 2.;
+    
+}
diff --git a/Source/FortranInterface/WarpX_f.H b/Source/FortranInterface/WarpX_f.H
index 52996a60a..98dd8685d 100644
--- a/Source/FortranInterface/WarpX_f.H
+++ b/Source/FortranInterface/WarpX_f.H
@@ -39,10 +39,6 @@
 #define WRPX_PUSH_LEAPFROG               warpx_push_leapfrog_3d
 #define WRPX_PUSH_LEAPFROG_POSITIONS     warpx_push_leapfrog_positions_3d
 
-#define WRPX_PXR_NCI_CORR_INIT           init_godfrey_filter_coeffs
-#define WRPX_PXR_GODFREY_FILTER          apply_godfrey_filter_z_3d
-
-
 #elif (AMREX_SPACEDIM == 2)
 
 #define WRPX_COMPUTE_DIVB                warpx_compute_divb_2d
@@ -66,9 +62,6 @@
 #define WRPX_PUSH_LEAPFROG               warpx_push_leapfrog_2d
 #define WRPX_PUSH_LEAPFROG_POSITIONS     warpx_push_leapfrog_positions_2d
 
-#define WRPX_PXR_NCI_CORR_INIT           init_godfrey_filter_coeffs
-#define WRPX_PXR_GODFREY_FILTER          apply_godfrey_filter_z_2d
-
 #ifdef WARPX_RZ
 #define WRPX_COMPUTE_DIVE                warpx_compute_dive_rz
 #else
@@ -87,7 +80,7 @@ extern "C"
                             const amrex_real* uxp, const amrex_real* uyp, const amrex_real* uzp,
                             amrex_real* xpold, amrex_real* ypold, amrex_real* zpold,
                             amrex_real* uxpold, amrex_real* uypold, amrex_real* uzpold);
-    
+
 	// Charge deposition
 	void warpx_charge_deposition(amrex::Real* rho,
             const long* np,	const amrex::Real* xp, const amrex::Real* yp, const amrex::Real* zp,	const amrex::Real* w,
@@ -117,7 +110,7 @@ extern "C"
 			const amrex::Real* dt,
 			const amrex::Real* dx, const amrex::Real* dy, const amrex::Real* dz,
 			const long* nox, const long* noy,const long* noz,
-			const long* lvect, const long* current_depo_algo);
+                        const int* l_nodal, const long* lvect, const long* current_depo_algo);
 
         // Current deposition finalize for RZ
         void warpx_current_deposition_rz_volume_scaling(
@@ -185,7 +178,7 @@ extern "C"
        amrex::Real* u_Xx, amrex::Real* u_Xy, amrex::Real* u_Xz,
        amrex::Real* u_Yx, amrex::Real* u_Yy, amrex::Real* u_Yz,
        amrex::Real* positionx, amrex::Real* positiony, amrex::Real* positionz );
-  
+
   void update_laser_particle( const long* np,
        amrex::Real* xp, amrex::Real* yp, amrex::Real* zp,
        amrex::Real* uxp, amrex::Real* uyp, amrex::Real* uzp,
@@ -194,7 +187,7 @@ extern "C"
        amrex::Real* nvecx, amrex::Real* nvecy, amrex::Real* nvecz,
        amrex::Real* mobility, amrex::Real* dt,  const amrex::Real* c,
        const amrex::Real* beta_boost, const amrex::Real* gamma_boost );
-  
+
     // Maxwell solver
 
         void warpx_push_evec(
@@ -455,14 +448,6 @@ extern "C"
                         const BL_FORT_FAB_ARG_ANYD(fine),
                         const int* ncomp);
 
-    void WRPX_PXR_NCI_CORR_INIT(amrex::Real*, amrex::Real*, const int,
-                                const amrex::Real, const int);
-
-    void WRPX_PXR_GODFREY_FILTER (const int* lo, const int* hi,
-                                  amrex_real* fou, const int* olo, const int* ohi,
-                                  const amrex_real* fin, const int* ilo, const int* ihi,
-                                  const amrex_real* stencil, const int* nsten);
-
 #ifdef WARPX_USE_PSATD
     void warpx_fft_mpi_init (int fcomm);
     void warpx_fft_domain_decomp (int* warpx_local_nz, int* warpx_local_z0,
diff --git a/Source/FortranInterface/WarpX_picsar.F90 b/Source/FortranInterface/WarpX_picsar.F90
index c17e8861b..33f85c633 100644
--- a/Source/FortranInterface/WarpX_picsar.F90
+++ b/Source/FortranInterface/WarpX_picsar.F90
@@ -117,7 +117,7 @@ contains
     pxr_ll4symtry = ll4symtry .eq. 1
     pxr_l_lower_order_in_v = l_lower_order_in_v .eq. 1
     pxr_l_nodal = l_nodal .eq. 1
-    
+
     exg_nguards = ixyzmin - exg_lo
     eyg_nguards = ixyzmin - eyg_lo
     ezg_nguards = ixyzmin - ezg_lo
@@ -218,6 +218,11 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
     IF ((nox.eq.1).and.(noy.eq.1).and.(noz.eq.1)) THEN
       CALL depose_rho_vecHVv2_1_1_1(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
                nxguard,nyguard,nzguard,lvect)
+
+    ELSE IF ((nox.eq.2).and.(noy.eq.2).and.(noz.eq.2)) THEN
+      CALL depose_rho_vecHVv2_2_2_2(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
+                 nxguard,nyguard,nzguard,lvect)
+
     ELSE
       CALL pxr_depose_rho_n(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,nx,ny,nz,&
                   nxguard,nyguard,nzguard,nox,noy,noz, &
@@ -268,7 +273,7 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
 #ifdef WARPX_RZ
     integer(c_long) :: type_rz_depose = 1
 #endif
-    
+
     ! Compute the number of valid cells and guard cells
     integer(c_long) :: rho_nvalid(AMREX_SPACEDIM), rho_nguards(AMREX_SPACEDIM)
     rho_nvalid = rho_ntot - 2*rho_ng
@@ -310,14 +315,14 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
   subroutine warpx_current_deposition( &
     jx,jx_ng,jx_ntot,jy,jy_ng,jy_ntot,jz,jz_ng,jz_ntot, &
     np,xp,yp,zp,uxp,uyp,uzp,gaminv,w,q,xmin,ymin,zmin,dt,dx,dy,dz,nox,noy,noz,&
-    lvect,current_depo_algo) &
+    l_nodal,lvect,current_depo_algo) &
     bind(C, name="warpx_current_deposition")
 
     integer, intent(in) :: jx_ntot(AMREX_SPACEDIM), jy_ntot(AMREX_SPACEDIM), jz_ntot(AMREX_SPACEDIM)
     integer(c_long), intent(in) :: jx_ng, jy_ng, jz_ng
     integer(c_long), intent(IN)                                  :: np
     integer(c_long), intent(IN)                                  :: nox,noy,noz
-
+    integer(c_int), intent(in)                                   :: l_nodal
     real(amrex_real), intent(IN OUT):: jx(*), jy(*), jz(*)
     real(amrex_real), intent(IN)                                     :: q
     real(amrex_real), intent(IN)                                     :: dx,dy,dz
@@ -328,6 +333,7 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
     real(amrex_real), intent(IN),  dimension(np)                     :: gaminv
     integer(c_long), intent(IN)                                   :: lvect
     integer(c_long), intent(IN)                                   :: current_depo_algo
+    logical(pxr_logical)                                          :: pxr_l_nodal
 
     ! Compute the number of valid cells and guard cells
     integer(c_long) :: jx_nvalid(AMREX_SPACEDIM), jy_nvalid(AMREX_SPACEDIM), jz_nvalid(AMREX_SPACEDIM), &
@@ -338,6 +344,7 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
     jx_nguards = jx_ng
     jy_nguards = jy_ng
     jz_nguards = jz_ng
+    pxr_l_nodal = l_nodal .eq. 1
 
 ! Dimension 3
 #if (AMREX_SPACEDIM==3)
@@ -347,7 +354,7 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
         jz,jz_nguards,jz_nvalid,            &
         np,xp,yp,zp,uxp,uyp,uzp,gaminv,w,q, &
         xmin,ymin,zmin,dt,dx,dy,dz,         &
-        nox,noy,noz,current_depo_algo)
+        nox,noy,noz,pxr_l_nodal,current_depo_algo)
 ! Dimension 2
 #elif (AMREX_SPACEDIM==2)
         CALL WRPX_PXR_CURRENT_DEPOSITION(   &
@@ -355,8 +362,8 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
         jy,jy_nguards,jy_nvalid,            &
         jz,jz_nguards,jz_nvalid,            &
         np,xp,yp,zp,uxp,uyp,uzp,gaminv,w,q, &
-        xmin,zmin,dt,dx,dz,nox,noz,lvect,   &
-        current_depo_algo)
+        xmin,zmin,dt,dx,dz,nox,noz,pxr_l_nodal, &
+        lvect,current_depo_algo)
 #endif
 
   end subroutine warpx_current_deposition
@@ -387,7 +394,7 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n
     real(amrex_real), intent(IN OUT):: jx(*), jy(*), jz(*)
     real(amrex_real), intent(IN) :: rmin, dr
 
-#ifdef WARPX_RZ    
+#ifdef WARPX_RZ
     integer(c_long) :: type_rz_depose = 1
 #endif
     ! Compute the number of valid cells and guard cells
diff --git a/Source/Initialization/PlasmaInjector.H b/Source/Initialization/PlasmaInjector.H
index cd5802a55..f998e217e 100644
--- a/Source/Initialization/PlasmaInjector.H
+++ b/Source/Initialization/PlasmaInjector.H
@@ -293,6 +293,7 @@ public:
     amrex::Real z_rms;
     amrex::Real q_tot;
     long npart;
+    int do_symmetrize = 0;
 
     bool radially_weighted = true;
 
diff --git a/Source/Initialization/PlasmaInjector.cpp b/Source/Initialization/PlasmaInjector.cpp
index 52b5d8b61..f9642d1b6 100644
--- a/Source/Initialization/PlasmaInjector.cpp
+++ b/Source/Initialization/PlasmaInjector.cpp
@@ -286,6 +286,7 @@ PlasmaInjector::PlasmaInjector(int ispecies, const std::string& name)
         pp.get("z_rms", z_rms);
         pp.get("q_tot", q_tot);
         pp.get("npart", npart);
+        pp.query("do_symmetrize", do_symmetrize);
         gaussian_beam = true;
         parseMomentum(pp);
     }
diff --git a/Source/Initialization/WarpXInitData.cpp b/Source/Initialization/WarpXInitData.cpp
index 23637ec97..d583b2b0f 100644
--- a/Source/Initialization/WarpXInitData.cpp
+++ b/Source/Initialization/WarpXInitData.cpp
@@ -7,6 +7,7 @@
 #include <WarpX.H>
 #include <WarpX_f.H>
 #include <BilinearFilter.H>
+#include <NCIGodfreyFilter.H>
 
 #ifdef BL_USE_SENSEI_INSITU
 #include <AMReX_AmrMeshInSituBridge.H>
@@ -161,8 +162,6 @@ WarpX::InitNCICorrector ()
 {
     if (WarpX::use_fdtd_nci_corr)
     {
-        mypc->fdtd_nci_stencilz_ex.resize(max_level+1);
-        mypc->fdtd_nci_stencilz_by.resize(max_level+1);
         for (int lev = 0; lev <= max_level; ++lev)
         {
             const Geometry& gm = Geom(lev);
@@ -174,10 +173,15 @@ WarpX::InitNCICorrector ()
                 dz = dx[1];
             }
             cdtodz = PhysConst::c * dt[lev] / dz;
-            WRPX_PXR_NCI_CORR_INIT( (mypc->fdtd_nci_stencilz_ex)[lev].data(),
-                                    (mypc->fdtd_nci_stencilz_by)[lev].data(),
-                                    mypc->nstencilz_fdtd_nci_corr, cdtodz,
-                                    WarpX::l_lower_order_in_v);
+
+            // Initialize Godfrey filters
+            // Same filter for fields Ex, Ey and Bz
+            nci_godfrey_filter_exeybz[lev].reset( new NCIGodfreyFilter(godfrey_coeff_set::Ex_Ey_Bz, cdtodz, WarpX::l_lower_order_in_v) );
+            // Same filter for fields Bx, By and Ez
+            nci_godfrey_filter_bxbyez[lev].reset( new NCIGodfreyFilter(godfrey_coeff_set::Bx_By_Ez, cdtodz, WarpX::l_lower_order_in_v) );
+            // Compute Godfrey filters stencils
+            nci_godfrey_filter_exeybz[lev]->ComputeStencils();
+            nci_godfrey_filter_bxbyez[lev]->ComputeStencils();
         }
     }
 }
@@ -318,6 +322,7 @@ WarpX::InitLevelData (int lev, Real time)
 void
 WarpX::InitLevelDataFFT (int lev, Real time)
 {
+ 
     Efield_fp_fft[lev][0]->setVal(0.0);
     Efield_fp_fft[lev][1]->setVal(0.0);
     Efield_fp_fft[lev][2]->setVal(0.0);
@@ -342,6 +347,7 @@ WarpX::InitLevelDataFFT (int lev, Real time)
         current_cp_fft[lev][2]->setVal(0.0);
         rho_cp_fft[lev]->setVal(0.0);
     }
+
 }
 
 #endif
diff --git a/Source/Laser/LaserParticleContainer.cpp b/Source/Laser/LaserParticleContainer.cpp
index 2f964b6f3..de410b31f 100644
--- a/Source/Laser/LaserParticleContainer.cpp
+++ b/Source/Laser/LaserParticleContainer.cpp
@@ -518,10 +518,11 @@ LaserParticleContainer::Evolve (int lev,
             if (cost) {
                 const Box& tbx = pti.tilebox();
                 wt = (amrex::second() - wt) / tbx.d_numPts();
-                FArrayBox* costfab = cost->fabPtr(pti);
-                AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box,
+                Array4<Real> const& costarr = cost->array(pti);
+                amrex::ParallelFor(tbx,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                 {
-                    costfab->plus(wt, work_box);
+                    costarr(i,j,k) += wt;
                 });
             }
         }
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 5c9fa144f..00dcb85d0 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -246,7 +246,7 @@ void
 WarpX::FillBoundaryE (int lev, PatchType patch_type)
 {
     if (patch_type == PatchType::fine)
-    {
+    {        
         if (do_pml && pml[lev]->ok())
         {
     	    pml[lev]->ExchangeE(patch_type,
diff --git a/Source/Particles/MultiParticleContainer.H b/Source/Particles/MultiParticleContainer.H
index 513f30b99..869126fef 100644
--- a/Source/Particles/MultiParticleContainer.H
+++ b/Source/Particles/MultiParticleContainer.H
@@ -129,8 +129,7 @@ public:
 
     void Checkpoint (const std::string& dir) const;
 
-    void WritePlotFile( const std::string& dir,
-                        const amrex::Vector<std::string>& real_names) const;
+    void WritePlotFile (const std::string& dir) const;
     
     void Restart (const std::string& dir);
 
@@ -181,17 +180,6 @@ public:
     void UpdateContinuousInjectionPosition(amrex::Real dt) const;
     int doContinuousInjection() const;
 
-    // 
-    // Parameters for the Cherenkov corrector in the FDTD solver.
-    // Both stencils are calculated ar runtime.
-    // 
-    // Number of coefficients for the stencil of the NCI corrector.
-    // The stencil is applied in the z direction only.
-    static constexpr int nstencilz_fdtd_nci_corr=5;
-
-    amrex::Vector<amrex::Array<amrex::Real, nstencilz_fdtd_nci_corr> > fdtd_nci_stencilz_ex;
-    amrex::Vector<amrex::Array<amrex::Real, nstencilz_fdtd_nci_corr> > fdtd_nci_stencilz_by;
-
     std::vector<std::string> GetSpeciesNames() const { return species_names; }
 
     PhysicalParticleContainer& GetPCtmp () { return *pc_tmp; }
diff --git a/Source/Particles/MultiParticleContainer.cpp b/Source/Particles/MultiParticleContainer.cpp
index 6d618c096..9d39ec2f9 100644
--- a/Source/Particles/MultiParticleContainer.cpp
+++ b/Source/Particles/MultiParticleContainer.cpp
@@ -8,8 +8,6 @@
 
 using namespace amrex;
 
-constexpr int MultiParticleContainer::nstencilz_fdtd_nci_corr;
-
 MultiParticleContainer::MultiParticleContainer (AmrCore* amr_core)
 {
     ReadParameters();
diff --git a/Source/Particles/PhysicalParticleContainer.H b/Source/Particles/PhysicalParticleContainer.H
index df1e0296a..bd8cfb47e 100644
--- a/Source/Particles/PhysicalParticleContainer.H
+++ b/Source/Particles/PhysicalParticleContainer.H
@@ -94,7 +94,10 @@ public:
 
     void AddGaussianBeam(amrex::Real x_m, amrex::Real y_m, amrex::Real z_m,
                          amrex::Real x_rms, amrex::Real y_rms, amrex::Real z_rms,
-                         amrex::Real q_tot, long npart);
+                         amrex::Real q_tot, long npart, int do_symmetrize);
+
+    void CheckAndAddParticle(amrex::Real x, amrex::Real y, amrex::Real z,
+                             std::array<amrex::Real, 3> u, amrex::Real weight);
 
     virtual void GetParticleSlice(const int direction, const amrex::Real z_old,
                                   const amrex::Real z_new, const amrex::Real t_boost, 
diff --git a/Source/Particles/PhysicalParticleContainer.cpp b/Source/Particles/PhysicalParticleContainer.cpp
index 212084e64..7e7c9534e 100644
--- a/Source/Particles/PhysicalParticleContainer.cpp
+++ b/Source/Particles/PhysicalParticleContainer.cpp
@@ -181,7 +181,8 @@ void PhysicalParticleContainer::MapParticletoBoostedFrame(Real& x, Real& y, Real
 void
 PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
                                            Real x_rms, Real y_rms, Real z_rms,
-                                           Real q_tot, long npart) {
+                                           Real q_tot, long npart, 
+                                           int do_symmetrize) {
 
     const Geometry& geom     = m_gdb->Geom(0);
     RealBox containing_bx = geom.ProbDomain();
@@ -191,13 +192,15 @@ PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
     std::normal_distribution<double> disty(y_m, y_rms);
     std::normal_distribution<double> distz(z_m, z_rms);
 
-    std::array<Real,PIdx::nattribs> attribs;
-    attribs.fill(0.0);
-
     if (ParallelDescriptor::IOProcessor()) {
-       std::array<Real, 3> u;
-       Real weight;
-       for (long i = 0; i < npart; ++i) {
+        std::array<Real, 3> u;
+        Real weight;
+        // If do_symmetrize, create 4x fewer particles, and 
+        // Replicate each particle 4 times (x,y) (-x,y) (x,-y) (-x,-y)
+        if (do_symmetrize){
+            npart /= 4;
+        }
+        for (long i = 0; i < npart; ++i) {
 #if ( AMREX_SPACEDIM == 3 | WARPX_RZ)
             weight = q_tot/npart/charge;
             Real x = distx(mt);
@@ -209,29 +212,27 @@ PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
             Real y = 0.;
             Real z = distz(mt);
 #endif
-        if (plasma_injector->insideBounds(x, y, z)) {
-	    plasma_injector->getMomentum(u, x, y, z);
-            if (WarpX::gamma_boost > 1.) {
-                MapParticletoBoostedFrame(x, y, z, u);
-            }
-            attribs[PIdx::ux] = u[0];
-            attribs[PIdx::uy] = u[1];
-            attribs[PIdx::uz] = u[2];
-            attribs[PIdx::w ] = weight;
-
-            if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)
-            {
-                auto& particle_tile = DefineAndReturnParticleTile(0, 0, 0);
-                particle_tile.push_back_real(particle_comps["xold"], x);
-                particle_tile.push_back_real(particle_comps["yold"], y);
-                particle_tile.push_back_real(particle_comps["zold"], z);
-                
-                particle_tile.push_back_real(particle_comps["uxold"], u[0]);
-                particle_tile.push_back_real(particle_comps["uyold"], u[1]);
-                particle_tile.push_back_real(particle_comps["uzold"], u[2]);
-            }
-            
-            AddOneParticle(0, 0, 0, x, y, z, attribs);
+            if (plasma_injector->insideBounds(x, y, z)) {
+                plasma_injector->getMomentum(u, x, y, z);
+                if (do_symmetrize){
+                    std::array<Real, 3> u_tmp;
+                    Real x_tmp, y_tmp;
+                    // Add four particles to the beam:
+                    // (x,ux,y,uy) (-x,-ux,y,uy) (x,ux,-y,-uy) (-x,-ux,-y,-uy)
+                    for (int ix=0; ix<2; ix++){
+                        for (int iy=0; iy<2; iy++){
+                            u_tmp = u;
+                            x_tmp     = x*std::pow(-1,ix);
+                            u_tmp[0] *= std::pow(-1,ix);
+                            y_tmp     = y*std::pow(-1,iy);
+                            u_tmp[1] *= std::pow(-1,iy);
+                            CheckAndAddParticle(x_tmp, y_tmp, z, 
+                                                u_tmp, weight/4);
+                        }
+                    }
+                } else {
+                    CheckAndAddParticle(x, y, z, u, weight);
+                }
             }
         }
     }
@@ -239,6 +240,39 @@ PhysicalParticleContainer::AddGaussianBeam(Real x_m, Real y_m, Real z_m,
 }
 
 void
+PhysicalParticleContainer::CheckAndAddParticle(Real x, Real y, Real z,
+                                               std::array<Real, 3> u,
+                                               Real weight)
+{
+    std::array<Real,PIdx::nattribs> attribs;
+    attribs.fill(0.0);
+
+    // update attribs with input arguments
+    if (WarpX::gamma_boost > 1.) {
+        MapParticletoBoostedFrame(x, y, z, u);
+    }
+    attribs[PIdx::ux] = u[0];
+    attribs[PIdx::uy] = u[1];
+    attribs[PIdx::uz] = u[2];
+    attribs[PIdx::w ] = weight;
+
+    if (WarpX::do_boosted_frame_diagnostic && do_boosted_frame_diags)
+    {
+        // need to create old values
+        auto& particle_tile = DefineAndReturnParticleTile(0, 0, 0);
+        particle_tile.push_back_real(particle_comps["xold"], x);
+        particle_tile.push_back_real(particle_comps["yold"], y);
+        particle_tile.push_back_real(particle_comps["zold"], z);
+                
+        particle_tile.push_back_real(particle_comps["uxold"], u[0]);
+        particle_tile.push_back_real(particle_comps["uyold"], u[1]);
+        particle_tile.push_back_real(particle_comps["uzold"], u[2]);
+    }
+    // add particle
+    AddOneParticle(0, 0, 0, x, y, z, attribs);
+}
+
+void
 PhysicalParticleContainer::AddParticles (int lev)
 {
     BL_PROFILE("PhysicalParticleContainer::AddParticles()");
@@ -263,7 +297,8 @@ PhysicalParticleContainer::AddParticles (int lev)
                         plasma_injector->y_rms,
                         plasma_injector->z_rms,
                         plasma_injector->q_tot,
-                        plasma_injector->npart);
+                        plasma_injector->npart,
+                        plasma_injector->do_symmetrize);
 
 
         return;
@@ -521,10 +556,11 @@ PhysicalParticleContainer::AddPlasmaCPU (int lev, RealBox part_realbox)
 
             if (cost) {
 	        wt = (amrex::second() - wt) / tile_box.d_numPts();
-                FArrayBox* costfab = cost->fabPtr(mfi);
-                AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, work_box,
+                Array4<Real> const& costarr = cost->array(mfi);
+                amrex::ParallelFor(tile_box,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                 {
-                    costfab->plus(wt, work_box);
+                    costarr(i,j,k) += wt;
                 });
             }
         }
@@ -795,10 +831,11 @@ PhysicalParticleContainer::AddPlasmaGPU (int lev, RealBox part_realbox)
 	    			 
             if (cost) {
 	        wt = (amrex::second() - wt) / tile_box.d_numPts();
-                FArrayBox* costfab = cost->fabPtr(mfi);
-                AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, work_box,
+                Array4<Real> const& costarr = cost->array(mfi);
+                amrex::ParallelFor(tile_box,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                 {
-                    costfab->plus(wt, work_box);
+                    costarr(i,j,k) += wt;
                 });
             }
         }		
@@ -1102,10 +1139,11 @@ PhysicalParticleContainer::FieldGather (int lev,
             if (cost) {
                 const Box& tbx = pti.tilebox();
                 wt = (amrex::second() - wt) / tbx.d_numPts();
-                FArrayBox* costfab = cost->fabPtr(pti);
-                AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box,
+                Array4<Real> const& costarr = cost->array(pti);
+                amrex::ParallelFor(tbx,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                 {
-                    costfab->plus(wt, work_box);
+                    costarr(i,j,k) += wt;
                 });
             }
         }
@@ -1132,8 +1170,9 @@ PhysicalParticleContainer::Evolve (int lev,
     const std::array<Real,3>& dx = WarpX::CellSize(lev);
     const std::array<Real,3>& cdx = WarpX::CellSize(std::max(lev-1,0));
 
-    const auto& mypc = WarpX::GetInstance().GetPartContainer();
-    const int nstencilz_fdtd_nci_corr = mypc.nstencilz_fdtd_nci_corr;
+    // Get instances of NCI Godfrey filters 
+    const auto& nci_godfrey_filter_exeybz = WarpX::GetInstance().nci_godfrey_filter_exeybz;
+    const auto& nci_godfrey_filter_bxbyez = WarpX::GetInstance().nci_godfrey_filter_bxbyez;
 
     BL_ASSERT(OnSameGrids(lev,jx));
 
@@ -1165,7 +1204,7 @@ PhysicalParticleContainer::Evolve (int lev,
 	{
             Real wt = amrex::second();
 
-	    const Box& box = pti.validbox();
+            const Box& box = pti.validbox();
 
             auto& attribs = pti.GetAttribs();
 
@@ -1182,7 +1221,7 @@ PhysicalParticleContainer::Evolve (int lev,
 
             const long np = pti.numParticles();
 
-	    // Data on the grid
+            // Data on the grid
             FArrayBox const* exfab = &(Ex[pti]);
             FArrayBox const* eyfab = &(Ey[pti]);
             FArrayBox const* ezfab = &(Ez[pti]);
@@ -1190,6 +1229,7 @@ PhysicalParticleContainer::Evolve (int lev,
             FArrayBox const* byfab = &(By[pti]);
             FArrayBox const* bzfab = &(Bz[pti]);
 
+            Elixir exeli, eyeli, ezeli, bxeli, byeli, bzeli;
             if (WarpX::use_fdtd_nci_corr)
             {
 #if (AMREX_SPACEDIM == 2)
@@ -1201,54 +1241,43 @@ PhysicalParticleContainer::Evolve (int lev,
                             static_cast<int>(WarpX::noz)});
 #endif
 
-                // both 2d and 3d
+                // Filter Ex (Both 2D and 3D)
                 filtered_Ex.resize(amrex::convert(tbox,WarpX::Ex_nodal_flag));
-                WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Ex),
-                                        BL_TO_FORTRAN_ANYD(filtered_Ex),
-                                        BL_TO_FORTRAN_ANYD(Ex[pti]),
-                                        mypc.fdtd_nci_stencilz_ex[lev].data(),
-                                        &nstencilz_fdtd_nci_corr);
+                // Safeguard for GPU
+                exeli = filtered_Ex.elixir();
+                // Apply filter on Ex, result stored in filtered_Ex
+                nci_godfrey_filter_exeybz[lev]->ApplyStencil(filtered_Ex, Ex[pti], filtered_Ex.box());
+                // Update exfab reference
                 exfab = &filtered_Ex;
 
+                // Filter Ez
                 filtered_Ez.resize(amrex::convert(tbox,WarpX::Ez_nodal_flag));
-                WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Ez),
-                                        BL_TO_FORTRAN_ANYD(filtered_Ez),
-                                        BL_TO_FORTRAN_ANYD(Ez[pti]),
-                                        mypc.fdtd_nci_stencilz_by[lev].data(),
-                                        &nstencilz_fdtd_nci_corr);
+                ezeli = filtered_Ez.elixir();
+                nci_godfrey_filter_bxbyez[lev]->ApplyStencil(filtered_Ez, Ez[pti], filtered_Ez.box());
                 ezfab = &filtered_Ez;
 
+                // Filter By
                 filtered_By.resize(amrex::convert(tbox,WarpX::By_nodal_flag));
-                WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_By),
-                                        BL_TO_FORTRAN_ANYD(filtered_By),
-                                        BL_TO_FORTRAN_ANYD(By[pti]),
-                                        mypc.fdtd_nci_stencilz_by[lev].data(),
-                                        &nstencilz_fdtd_nci_corr);
+                byeli = filtered_By.elixir();
+                nci_godfrey_filter_bxbyez[lev]->ApplyStencil(filtered_By, By[pti], filtered_By.box());
                 byfab = &filtered_By;
-
 #if (AMREX_SPACEDIM == 3)
+                // Filter Ey
                 filtered_Ey.resize(amrex::convert(tbox,WarpX::Ey_nodal_flag));
-                WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Ey),
-                                        BL_TO_FORTRAN_ANYD(filtered_Ey),
-                                        BL_TO_FORTRAN_ANYD(Ey[pti]),
-                                        mypc.fdtd_nci_stencilz_ex[lev].data(),
-                                        &nstencilz_fdtd_nci_corr);
+                eyeli = filtered_Ey.elixir();
+                nci_godfrey_filter_exeybz[lev]->ApplyStencil(filtered_Ey, Ey[pti], filtered_Ey.box());
                 eyfab = &filtered_Ey;
 
+                // Filter Bx
                 filtered_Bx.resize(amrex::convert(tbox,WarpX::Bx_nodal_flag));
-                WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Bx),
-                                        BL_TO_FORTRAN_ANYD(filtered_Bx),
-                                        BL_TO_FORTRAN_ANYD(Bx[pti]),
-                                        mypc.fdtd_nci_stencilz_by[lev].data(),
-                                        &nstencilz_fdtd_nci_corr);
+                bxeli = filtered_Bx.elixir();
+                nci_godfrey_filter_bxbyez[lev]->ApplyStencil(filtered_Bx, Bx[pti], filtered_Bx.box());
                 bxfab = &filtered_Bx;
 
+                // Filter Bz
                 filtered_Bz.resize(amrex::convert(tbox,WarpX::Bz_nodal_flag));
-                WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Bz),
-                                        BL_TO_FORTRAN_ANYD(filtered_Bz),
-                                        BL_TO_FORTRAN_ANYD(Bz[pti]),
-                                        mypc.fdtd_nci_stencilz_ex[lev].data(),
-                                        &nstencilz_fdtd_nci_corr);
+                bzeli = filtered_Bz.elixir();
+                nci_godfrey_filter_exeybz[lev]->ApplyStencil(filtered_Bz, Bz[pti], filtered_Bz.box());
                 bzfab = &filtered_Bz;
 #endif
             }
@@ -1424,53 +1453,43 @@ PhysicalParticleContainer::Evolve (int lev,
                                     static_cast<int>(WarpX::noz)});
 #endif
 
-                        // both 2d and 3d
+                        // Filter Ex (both 2D and 3D)
                         filtered_Ex.resize(amrex::convert(tbox,WarpX::Ex_nodal_flag));
-                        WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Ex),
-                                                BL_TO_FORTRAN_ANYD(filtered_Ex),
-                                                BL_TO_FORTRAN_ANYD((*cEx)[pti]),
-                                                mypc.fdtd_nci_stencilz_ex[lev-1].data(),
-                                                &nstencilz_fdtd_nci_corr);
+                        // Safeguard for GPU
+                        exeli = filtered_Ex.elixir();
+                        // Apply filter on Ex, result stored in filtered_Ex
+                        nci_godfrey_filter_exeybz[lev-1]->ApplyStencil(filtered_Ex, (*cEx)[pti], filtered_Ex.box());
+                        // Update exfab reference
                         cexfab = &filtered_Ex;
 
+                        // Filter Ez
                         filtered_Ez.resize(amrex::convert(tbox,WarpX::Ez_nodal_flag));
-                        WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Ez),
-                                                BL_TO_FORTRAN_ANYD(filtered_Ez),
-                                                BL_TO_FORTRAN_ANYD((*cEz)[pti]),
-                                                mypc.fdtd_nci_stencilz_by[lev-1].data(),
-                                                &nstencilz_fdtd_nci_corr);
+                        ezeli = filtered_Ez.elixir();
+                        nci_godfrey_filter_bxbyez[lev-1]->ApplyStencil(filtered_Ez, (*cEz)[pti], filtered_Ez.box());
                         cezfab = &filtered_Ez;
+
+                        // Filter By
                         filtered_By.resize(amrex::convert(tbox,WarpX::By_nodal_flag));
-                        WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_By),
-                                                BL_TO_FORTRAN_ANYD(filtered_By),
-                                                BL_TO_FORTRAN_ANYD((*cBy)[pti]),
-                                                mypc.fdtd_nci_stencilz_by[lev-1].data(),
-                                                &nstencilz_fdtd_nci_corr);
+                        byeli = filtered_By.elixir();
+                        nci_godfrey_filter_bxbyez[lev-1]->ApplyStencil(filtered_By, (*cBy)[pti], filtered_By.box());
                         cbyfab = &filtered_By;
-
 #if (AMREX_SPACEDIM == 3)
+                        // Filter Ey
                         filtered_Ey.resize(amrex::convert(tbox,WarpX::Ey_nodal_flag));
-                        WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Ey),
-                                                BL_TO_FORTRAN_ANYD(filtered_Ey),
-                                                BL_TO_FORTRAN_ANYD((*cEy)[pti]),
-                                                mypc.fdtd_nci_stencilz_ex[lev-1].data(),
-                                                &nstencilz_fdtd_nci_corr);
+                        eyeli = filtered_Ey.elixir();
+                        nci_godfrey_filter_exeybz[lev-1]->ApplyStencil(filtered_Ey, (*cEy)[pti], filtered_Ey.box());
                         ceyfab = &filtered_Ey;
                         
+                        // Filter Bx
                         filtered_Bx.resize(amrex::convert(tbox,WarpX::Bx_nodal_flag));
-                        WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Bx),
-                                                BL_TO_FORTRAN_ANYD(filtered_Bx),
-                                                BL_TO_FORTRAN_ANYD((*cBx)[pti]),
-                                                mypc.fdtd_nci_stencilz_by[lev-1].data(),
-                                                &nstencilz_fdtd_nci_corr);
+                        bxeli = filtered_Bx.elixir();
+                        nci_godfrey_filter_bxbyez[lev-1]->ApplyStencil(filtered_Bx, (*cBx)[pti], filtered_Bx.box());
                         cbxfab = &filtered_Bx;
                         
+                        // Filter Bz
                         filtered_Bz.resize(amrex::convert(tbox,WarpX::Bz_nodal_flag));
-                        WRPX_PXR_GODFREY_FILTER(BL_TO_FORTRAN_BOX(filtered_Bz),
-                                                BL_TO_FORTRAN_ANYD(filtered_Bz),
-                                                BL_TO_FORTRAN_ANYD((*cBz)[pti]),
-                                                mypc.fdtd_nci_stencilz_ex[lev-1].data(),
-                                                &nstencilz_fdtd_nci_corr);
+                        bzeli = filtered_Bz.elixir();
+                        nci_godfrey_filter_exeybz[lev-1]->ApplyStencil(filtered_Bz, (*cBz)[pti], filtered_Bz.box());
                         cbzfab = &filtered_Bz;
 #endif
                     }
@@ -1526,10 +1545,11 @@ PhysicalParticleContainer::Evolve (int lev,
             if (cost) {
                 const Box& tbx = pti.tilebox();
                 wt = (amrex::second() - wt) / tbx.d_numPts();
-                FArrayBox* costfab = cost->fabPtr(pti);
-                AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box,
+                Array4<Real> const& costarr = cost->array(pti);
+                amrex::ParallelFor(tbx,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                 {
-                    costfab->plus(wt, work_box);
+                    costarr(i,j,k) += wt;
                 });
             }
         }
diff --git a/Source/Particles/RigidInjectedParticleContainer.H b/Source/Particles/RigidInjectedParticleContainer.H
index d3a69f5b0..0b27a2f2f 100644
--- a/Source/Particles/RigidInjectedParticleContainer.H
+++ b/Source/Particles/RigidInjectedParticleContainer.H
@@ -57,6 +57,10 @@ public:
                         const amrex::MultiFab& By,
                         const amrex::MultiFab& Bz) override;
 
+    virtual void ReadHeader (std::istream& is) override;
+
+    virtual void WriteHeader (std::ostream& os) const override;
+    
 private:
 
     // User input quantities
diff --git a/Source/Particles/RigidInjectedParticleContainer.cpp b/Source/Particles/RigidInjectedParticleContainer.cpp
index fd1b2dfb5..2a3e8dd0d 100644
--- a/Source/Particles/RigidInjectedParticleContainer.cpp
+++ b/Source/Particles/RigidInjectedParticleContainer.cpp
@@ -77,7 +77,6 @@ RigidInjectedParticleContainer::RemapParticles()
 
                 for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti)
                 {
-
                     auto& attribs = pti.GetAttribs();
                     auto& uxp = attribs[PIdx::ux];
                     auto& uyp = attribs[PIdx::uy];
diff --git a/Source/Particles/WarpXParticleContainer.H b/Source/Particles/WarpXParticleContainer.H
index c24a6a3dc..1e3dfb2ae 100644
--- a/Source/Particles/WarpXParticleContainer.H
+++ b/Source/Particles/WarpXParticleContainer.H
@@ -225,9 +225,9 @@ public:
                          amrex::Real x, amrex::Real y, amrex::Real z,
                          std::array<amrex::Real,PIdx::nattribs>& attribs);
 
-    void ReadHeader (std::istream& is);
+    virtual void ReadHeader (std::istream& is);
 
-    void WriteHeader (std::ostream& os) const;
+    virtual void WriteHeader (std::ostream& os) const;
 
     virtual void ConvertUnits (ConvertDirection convert_dir){};
 
@@ -255,6 +255,8 @@ public:
         AddIntComp(comm);
     }
 
+    int DoBoostedFrameDiags () const { return do_boosted_frame_diags; }
+    
 protected:
 
     std::map<std::string, int> particle_comps;
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index 9791eee80..ac532912d 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -306,7 +306,7 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
   // WarpX assumes the same number of guard cells for Jx, Jy, Jz
   long ngJ = jx.nGrow();
 
-  bool j_is_nodal = jx.is_nodal() and jy.is_nodal() and jz.is_nodal();
+  int j_is_nodal = jx.is_nodal() and jy.is_nodal() and jz.is_nodal();
 
   // Deposit charge for particles that are not in the current buffers
   if (np_current > 0)
@@ -342,92 +342,29 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
 #endif
 
       BL_PROFILE_VAR_START(blp_pxr_cd);
-      if (j_is_nodal) {
-          const Real* p_wp = wp.dataPtr();
-          const Real* p_gaminv = m_giv[thread_num].dataPtr();
-          const Real* p_uxp = uxp.dataPtr();
-          const Real* p_uyp = uyp.dataPtr();
-          const Real* p_uzp = uzp.dataPtr();
-          AsyncArray<Real> wptmp_aa(np_current);
-          Real* const wptmp = wptmp_aa.data();
-          const Box& tile_box = pti.tilebox();
-#if (AMREX_SPACEDIM == 3)
-          const long nx = tile_box.length(0);
-          const long ny = tile_box.length(1);
-          const long nz = tile_box.length(2);
-#else
-          const long nx = tile_box.length(0);
-          const long ny = 0;
-          const long nz = tile_box.length(1);
-#endif
-          amrex::ParallelFor (np_current, [=] AMREX_GPU_DEVICE (long ip) {
-                  wptmp[ip] = p_wp[ip] * p_gaminv[ip] * p_uxp[ip];
-          });
-          warpx_charge_deposition(jx_ptr, &np_current,
-                                  m_xp[thread_num].dataPtr(),
-                                  m_yp[thread_num].dataPtr(),
-                                  m_zp[thread_num].dataPtr(),
-                                  wptmp,
-                                  &this->charge,
-                                  &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                                  &dx[0], &dx[1], &dx[2], &nx, &ny, &nz,
-                                  &ngJ, &ngJ, &ngJ,
-                                  &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                  &lvect, &WarpX::current_deposition_algo);
-          amrex::ParallelFor (np_current, [=] AMREX_GPU_DEVICE (long ip) {
-                  wptmp[ip] = p_wp[ip] * p_gaminv[ip] * p_uyp[ip];
-          });
-          warpx_charge_deposition(jy_ptr, &np_current,
-                                  m_xp[thread_num].dataPtr(),
-                                  m_yp[thread_num].dataPtr(),
-                                  m_zp[thread_num].dataPtr(),
-                                  wptmp,
-                                  &this->charge,
-                                  &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                                  &dx[0], &dx[1], &dx[2], &nx, &ny, &nz,
-                                  &ngJ, &ngJ, &ngJ,
-                                  &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                  &lvect, &WarpX::current_deposition_algo);
-          amrex::ParallelFor (np_current, [=] AMREX_GPU_DEVICE (long ip) {
-                  wptmp[ip] = p_wp[ip] * p_gaminv[ip] * p_uzp[ip];
-          });
-          warpx_charge_deposition(jz_ptr, &np_current,
-                                  m_xp[thread_num].dataPtr(),
-                                  m_yp[thread_num].dataPtr(),
-                                  m_zp[thread_num].dataPtr(),
-                                  wptmp,
-                                  &this->charge,
-                                  &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                                  &dx[0], &dx[1], &dx[2], &nx, &ny, &nz,
-                                  &ngJ, &ngJ, &ngJ,
-                                  &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                  &lvect, &WarpX::current_deposition_algo);
-      } else {
-          warpx_current_deposition(
-                               jx_ptr, &ngJ, jxntot.getVect(),
-                               jy_ptr, &ngJ, jyntot.getVect(),
-                               jz_ptr, &ngJ, jzntot.getVect(),
-                               &np_current,
-                               m_xp[thread_num].dataPtr(),
-                               m_yp[thread_num].dataPtr(),
-                               m_zp[thread_num].dataPtr(),
-                               uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
-                               m_giv[thread_num].dataPtr(),
-                               wp.dataPtr(), &this->charge,
-                               &xyzmin[0], &xyzmin[1], &xyzmin[2],
-                               &dt, &dx[0], &dx[1], &dx[2],
-                               &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                               &lvect,&WarpX::current_deposition_algo);
+      warpx_current_deposition(
+           jx_ptr, &ngJ, jxntot.getVect(),
+           jy_ptr, &ngJ, jyntot.getVect(),
+           jz_ptr, &ngJ, jzntot.getVect(),
+           &np_current,
+           m_xp[thread_num].dataPtr(),
+           m_yp[thread_num].dataPtr(),
+           m_zp[thread_num].dataPtr(),
+           uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(),
+           m_giv[thread_num].dataPtr(),
+           wp.dataPtr(), &this->charge,
+           &xyzmin[0], &xyzmin[1], &xyzmin[2],
+           &dt, &dx[0], &dx[1], &dx[2],
+           &WarpX::nox,&WarpX::noy,&WarpX::noz, &j_is_nodal,
+           &lvect,&WarpX::current_deposition_algo);
 
 #ifdef WARPX_RZ
-         warpx_current_deposition_rz_volume_scaling(
+       warpx_current_deposition_rz_volume_scaling(
                                   jx_ptr, &ngJ, jxntot.getVect(),
                                   jy_ptr, &ngJ, jyntot.getVect(),
                                   jz_ptr, &ngJ, jzntot.getVect(),
                                   &xyzmin[0], &dx[0]);
 #endif
-      }
-
       BL_PROFILE_VAR_STOP(blp_pxr_cd);
 
 #ifndef AMREX_USE_GPU
@@ -484,92 +421,30 @@ WarpXParticleContainer::DepositCurrent(WarpXParIter& pti,
 
       long ncrse = np - np_current;
       BL_PROFILE_VAR_START(blp_pxr_cd);
-      if (j_is_nodal) {
-          const Real* p_wp = wp.dataPtr() + np_current;
-          const Real* p_gaminv = m_giv[thread_num].dataPtr() + np_current;
-          const Real* p_uxp = uxp.dataPtr() + np_current;
-          const Real* p_uyp = uyp.dataPtr() + np_current;
-          const Real* p_uzp = uzp.dataPtr() + np_current;
-          AsyncArray<Real> wptmp_aa(ncrse);
-          Real* const wptmp = wptmp_aa.data();
-          const Box& tile_box = pti.tilebox();
-#if (AMREX_SPACEDIM == 3)
-          const long nx = tile_box.length(0);
-          const long ny = tile_box.length(1);
-          const long nz = tile_box.length(2);
-#else
-          const long nx = tile_box.length(0);
-          const long ny = 0;
-          const long nz = tile_box.length(1);
-#endif
-          amrex::ParallelFor (ncrse, [=] AMREX_GPU_DEVICE (long ip) {
-                  wptmp[ip] = p_wp[ip] * p_gaminv[ip] * p_uxp[ip];
-          });
-          warpx_charge_deposition(jx_ptr, &ncrse,
-                                  m_xp[thread_num].dataPtr() +np_current,
-                                  m_yp[thread_num].dataPtr() +np_current,
-                                  m_zp[thread_num].dataPtr() +np_current,
-                                  wptmp,
-                                  &this->charge,
-                                  &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
-                                  &cdx[0], &cdx[1], &cdx[2], &nx, &ny, &nz,
-                                  &ngJ, &ngJ, &ngJ,
-                                  &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                  &lvect, &WarpX::current_deposition_algo);
-          amrex::ParallelFor (ncrse, [=] AMREX_GPU_DEVICE (long ip) {
-                  wptmp[ip] = p_wp[ip] * p_gaminv[ip] * p_uyp[ip];
-          });
-          warpx_charge_deposition(jy_ptr, &ncrse,
-                                  m_xp[thread_num].dataPtr() +np_current,
-                                  m_yp[thread_num].dataPtr() +np_current,
-                                  m_zp[thread_num].dataPtr() +np_current,
-                                  wptmp,
-                                  &this->charge,
-                                  &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
-                                  &cdx[0], &cdx[1], &cdx[2], &nx, &ny, &nz,
-                                  &ngJ, &ngJ, &ngJ,
-                                  &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                  &lvect, &WarpX::current_deposition_algo);
-          amrex::ParallelFor (ncrse, [=] AMREX_GPU_DEVICE (long ip) {
-                  wptmp[ip] = p_wp[ip] * p_gaminv[ip] * p_uzp[ip];
-          });
-          warpx_charge_deposition(jz_ptr, &ncrse,
-                                  m_xp[thread_num].dataPtr() +np_current,
-                                  m_yp[thread_num].dataPtr() +np_current,
-                                  m_zp[thread_num].dataPtr() +np_current,
-                                  wptmp,
-                                  &this->charge,
-                                  &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
-                                  &cdx[0], &cdx[1], &cdx[2], &nx, &ny, &nz,
-                                  &ngJ, &ngJ, &ngJ,
-                                  &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                                  &lvect, &WarpX::current_deposition_algo);
-      } else {
-          warpx_current_deposition(
-                               jx_ptr, &ngJ, jxntot.getVect(),
-                               jy_ptr, &ngJ, jyntot.getVect(),
-                               jz_ptr, &ngJ, jzntot.getVect(),
-                               &ncrse,
-                               m_xp[thread_num].dataPtr() +np_current,
-                               m_yp[thread_num].dataPtr() +np_current,
-                               m_zp[thread_num].dataPtr() +np_current,
-                               uxp.dataPtr()+np_current,
-                               uyp.dataPtr()+np_current,
-                               uzp.dataPtr()+np_current,
-                               m_giv[thread_num].dataPtr()+np_current,
-                               wp.dataPtr()+np_current, &this->charge,
-                               &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
-                               &dt, &cdx[0], &cdx[1], &cdx[2],
-                               &WarpX::nox,&WarpX::noy,&WarpX::noz,
-                               &lvect,&WarpX::current_deposition_algo);
+      warpx_current_deposition(
+                           jx_ptr, &ngJ, jxntot.getVect(),
+                           jy_ptr, &ngJ, jyntot.getVect(),
+                           jz_ptr, &ngJ, jzntot.getVect(),
+                           &ncrse,
+                           m_xp[thread_num].dataPtr() +np_current,
+                           m_yp[thread_num].dataPtr() +np_current,
+                           m_zp[thread_num].dataPtr() +np_current,
+                           uxp.dataPtr()+np_current,
+                           uyp.dataPtr()+np_current,
+                           uzp.dataPtr()+np_current,
+                           m_giv[thread_num].dataPtr()+np_current,
+                           wp.dataPtr()+np_current, &this->charge,
+                           &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2],
+                           &dt, &cdx[0], &cdx[1], &cdx[2],
+                           &WarpX::nox,&WarpX::noy,&WarpX::noz, &j_is_nodal,
+                           &lvect,&WarpX::current_deposition_algo);
 #ifdef WARPX_RZ
-         warpx_current_deposition_rz_volume_scaling(
+       warpx_current_deposition_rz_volume_scaling(
                                   jx_ptr, &ngJ, jxntot.getVect(),
                                   jy_ptr, &ngJ, jyntot.getVect(),
                                   jz_ptr, &ngJ, jzntot.getVect(),
                                   &xyzmin[0], &dx[0]);
 #endif
-      }
 
       BL_PROFILE_VAR_STOP(blp_pxr_cd);
 
@@ -669,7 +544,7 @@ WarpXParticleContainer::DepositCharge ( WarpXParIter& pti, RealVector& wp,
       const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1);
 
 #ifdef AMREX_USE_GPU
-      data_ptr = (*crhomf)[pti].dataPtr();
+      data_ptr = (*crhomf)[pti].dataPtr(icomp);
       auto rholen = (*crhomf)[pti].length();
 #else
       tile_box = amrex::convert(ctilebox, IntVect::TheUnitVector());
@@ -1074,10 +949,11 @@ WarpXParticleContainer::PushX (int lev, Real dt)
             if (cost) {
                 const Box& tbx = pti.tilebox();
                 wt = (amrex::second() - wt) / tbx.d_numPts();
-                FArrayBox* costfab = cost->fabPtr(pti);
-                AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box,
+                Array4<Real> const& costarr = cost->array(pti);
+                amrex::ParallelFor(tbx,
+                [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
                 {
-                    costfab->plus(wt, work_box);
+                    costarr(i,j,k) += wt;
                 });
             }
         }
diff --git a/Source/Utils/Make.package b/Source/Utils/Make.package
index d8e2d2dab..cd335dbcf 100644
--- a/Source/Utils/Make.package
+++ b/Source/Utils/Make.package
@@ -4,6 +4,9 @@ CEXE_sources += WarpXTagging.cpp
 CEXE_sources += WarpXUtil.cpp
 CEXE_headers += WarpXConst.H
 CEXE_headers += WarpXUtil.H
+CEXE_headers += WarpXAlgorithmSelection.H
+CEXE_sources += WarpXAlgorithmSelection.cpp
+CEXE_headers += NCIGodfreyTables.H
 
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Utils
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Utils
diff --git a/Source/Utils/NCIGodfreyTables.H b/Source/Utils/NCIGodfreyTables.H
new file mode 100644
index 000000000..8cb105aa0
--- /dev/null
+++ b/Source/Utils/NCIGodfreyTables.H
@@ -0,0 +1,224 @@
+#include <AMReX_AmrCore.H>
+
+#ifndef WARPX_GODFREY_COEFF_TABLE_H_
+#define WARPX_GODFREY_COEFF_TABLE_H_
+
+// Table width. This is related to the stencil length
+const int tab_width = 4;
+// table length. Each line correspond to 1 value of cdtodz
+// (here 101 values).
+const int tab_length = 101;
+
+// Table of coefficient for Ex, Ey abd Bz
+// We typically interpolate between two lines
+const amrex::Real table_nci_godfrey_Ex_Ey_Bz[tab_length][tab_width]{
+        -2.47536,2.04288,-0.598163,0.0314711,  
+        -2.47536,2.04288,-0.598163,0.0314711,  
+        -2.47545,2.04309,-0.598307,0.0315029,  
+        -2.4756,2.04342,-0.598549,0.0315558,  
+        -2.47581,2.0439,-0.598886,0.0316298,  
+        -2.47608,2.0445,-0.59932,0.031725,  
+        -2.47641,2.04525,-0.59985,0.0318412,  
+        -2.4768,2.04612,-0.600477,0.0319785,  
+        -2.47725,2.04714,-0.6012,0.0321367,  
+        -2.47776,2.04829,-0.602019,0.0323158,  
+        -2.47833,2.04957,-0.602934,0.0325158,  
+        -2.47896,2.05099,-0.603944,0.0327364,  
+        -2.47965,2.05254,-0.605051,0.0329777,  
+        -2.4804,2.05423,-0.606253,0.0332396,  
+        -2.48121,2.05606,-0.60755,0.0335218,  
+        -2.48208,2.05802,-0.608942,0.0338243,  
+        -2.48301,2.06012,-0.610429,0.0341469,  
+        -2.48401,2.06235,-0.61201,0.0344895,  
+        -2.48506,2.06471,-0.613685,0.0348519,  
+        -2.48618,2.06721,-0.615453,0.0352339,  
+        -2.48735,2.06984,-0.617314,0.0356353,  
+        -2.48859,2.07261,-0.619268,0.0360559,  
+        -2.48988,2.0755,-0.621312,0.0364954,  
+        -2.49123,2.07853,-0.623447,0.0369536,  
+        -2.49265,2.08169,-0.625672,0.0374302,  
+        -2.49412,2.08498,-0.627986,0.0379248,  
+        -2.49565,2.0884,-0.630386,0.0384372,  
+        -2.49724,2.09194,-0.632873,0.0389669,  
+        -2.49888,2.09561,-0.635443,0.0395135,  
+        -2.50058,2.09939,-0.638096,0.0400766,  
+        -2.50234,2.1033,-0.640829,0.0406557,  
+        -2.50415,2.10732,-0.64364,0.0412502,  
+        -2.50601,2.11145,-0.646526,0.0418594,  
+        -2.50791,2.1157,-0.649485,0.0424828,  
+        -2.50987,2.12004,-0.652512,0.0431196,  
+        -2.51187,2.12448,-0.655604,0.0437688,  
+        -2.51392,2.12901,-0.658756,0.0444297,  
+        -2.516,2.13363,-0.661964,0.0451011,  
+        -2.51812,2.13832,-0.665221,0.0457818,  
+        -2.52027,2.14308,-0.668521,0.0464705,  
+        -2.52244,2.14789,-0.671856,0.0471658,  
+        -2.52464,2.15274,-0.675218,0.0478658,  
+        -2.52684,2.15762,-0.678596,0.0485687,  
+        -2.52906,2.16251,-0.68198,0.0492723,  
+        -2.53126,2.16738,-0.685355,0.049974,  
+        -2.53345,2.17222,-0.688706,0.0506708,  
+        -2.53561,2.177,-0.692015,0.0513594,  
+        -2.53773,2.18168,-0.69526,0.0520359,  
+        -2.53978,2.18623,-0.698416,0.0526955,  
+        -2.54175,2.19059,-0.701452,0.053333,  
+        -2.5436,2.19471,-0.704331,0.0539417,  
+        -2.54531,2.19852,-0.70701,0.0545141,  
+        -2.54683,2.20193,-0.709433,0.0550409,  
+        -2.5481,2.20483,-0.711533,0.0555106,  
+        -2.54906,2.20709,-0.713224,0.0559094,  
+        -2.54963,2.20852,-0.714397,0.0562198,  
+        -2.54968,2.20888,-0.714907,0.0564196,  
+        -2.54905,2.20785,-0.714562,0.0564797,  
+        -2.54751,2.20496,-0.713094,0.0563618,  
+        -2.54472,2.19955,-0.710118,0.0560124,  
+        -2.54014,2.19058,-0.705048,0.0553544,  
+        -2.53286,2.1763,-0.69693,0.0542684,  
+        -2.52115,2.15344,-0.684027,0.05255,  
+        -2.50098,2.11466,-0.66255,0.0497817,  
+        -2.45797,2.03459,-0.620099,0.0446889,  
+        -2.28371,1.72254,-0.465905,0.0283268,  
+        -2.4885,2.04899,-0.599292,0.0390466,  
+        -2.1433,1.36735,-0.220924,-0.00215633,  
+        -2.4943,2.07019,-0.610552,0.035166,  
+        -2.84529,2.77303,-1.00018,0.0724884,  
+        -2.72242,2.51888,-0.847226,0.0509964,  
+        -2.65633,2.3744,-0.750392,0.0326366,  
+        -2.59601,2.23412,-0.646421,0.00868027,  
+        -2.51477,2.0369,-0.491066,-0.0306397,  
+        -2.35935,1.65155,-0.178971,-0.112713,  
+        -1.84315,0.361693,0.876104,-0.393844,  
+        -2.65422,2.39262,-0.789663,0.0516265,  
+        -3.46529,4.42354,-2.45543,0.497097,  
+        -3.15747,3.65311,-1.824,0.328432,  
+        -3.04694,3.37613,-1.59668,0.267631,  
+        -2.99205,3.23814,-1.48302,0.237103,  
+        -2.96075,3.15894,-1.41733,0.219317,  
+        -2.94172,3.11028,-1.37649,0.20811,  
+        -2.92994,3.07962,-1.35025,0.200755,  
+        -2.92283,3.06054,-1.33338,0.195859,  
+        -2.91894,3.04938,-1.3229,0.192637,  
+        -2.91736,3.04394,-1.31702,0.190612,  
+        -2.91753,3.04278,-1.31456,0.189477,  
+        -2.91905,3.04494,-1.31475,0.189026,  
+        -2.92165,3.04973,-1.31705,0.189117,  
+        -2.92512,3.05667,-1.32105,0.189646,  
+        -2.92933,3.06539,-1.32646,0.190538,  
+        -2.93416,3.07562,-1.33308,0.191735,  
+        -2.93952,3.08715,-1.34072,0.193194,  
+        -2.94535,3.09982,-1.34925,0.194881,  
+        -2.95159,3.11349,-1.35858,0.196769,  
+        -2.9582,3.12805,-1.36861,0.198838,  
+        -2.96514,3.14342,-1.37929,0.201068,  
+        -2.97239,3.15953,-1.39055,0.203448,  
+        -2.97991,3.17632,-1.40234,0.205964,  
+        -2.98769,3.19374,-1.41463,0.208607
+            };
+
+// Table of coefficient for Bx, By and Ez
+// We typically interpolate between two lines
+const amrex::Real table_nci_godfrey_Bx_By_Ez[tab_length][tab_width]{
+        -2.80862,2.80104,-1.14615,0.154077,  
+        -2.80862,2.80104,-1.14615,0.154077,  
+        -2.80851,2.80078,-1.14595,0.154027,  
+        -2.80832,2.80034,-1.14561,0.153945,  
+        -2.80807,2.79973,-1.14514,0.153829,  
+        -2.80774,2.79894,-1.14454,0.15368,  
+        -2.80733,2.79798,-1.1438,0.153498,  
+        -2.80685,2.79685,-1.14292,0.153284,  
+        -2.8063,2.79554,-1.14192,0.153036,  
+        -2.80568,2.79405,-1.14077,0.152756,  
+        -2.80498,2.79239,-1.1395,0.152443,  
+        -2.80421,2.79056,-1.13809,0.152098,  
+        -2.80337,2.78856,-1.13656,0.151721,  
+        -2.80246,2.78638,-1.13488,0.151312,  
+        -2.80147,2.78404,-1.13308,0.150871,  
+        -2.80041,2.78152,-1.13115,0.150397,  
+        -2.79927,2.77882,-1.12908,0.149893,  
+        -2.79807,2.77596,-1.12689,0.149356,  
+        -2.79679,2.77292,-1.12456,0.148789,  
+        -2.79543,2.76972,-1.12211,0.14819,  
+        -2.79401,2.76634,-1.11953,0.14756,  
+        -2.79251,2.76279,-1.11681,0.1469,  
+        -2.79094,2.75907,-1.11397,0.146208,  
+        -2.78929,2.75517,-1.111,0.145486,  
+        -2.78757,2.7511,-1.10789,0.144733,  
+        -2.78578,2.74686,-1.10466,0.14395,  
+        -2.78391,2.74245,-1.1013,0.143137,  
+        -2.78196,2.73786,-1.09781,0.142293,  
+        -2.77994,2.73309,-1.09419,0.141419,  
+        -2.77784,2.72814,-1.09043,0.140514,  
+        -2.77566,2.72301,-1.08654,0.139578,  
+        -2.7734,2.7177,-1.08252,0.138612,  
+        -2.77106,2.7122,-1.07836,0.137614,  
+        -2.76864,2.70651,-1.07406,0.136586,  
+        -2.76613,2.70062,-1.06962,0.135525,  
+        -2.76353,2.69453,-1.06503,0.134432,  
+        -2.76084,2.68824,-1.0603,0.133307,  
+        -2.75806,2.68173,-1.05541,0.132148,  
+        -2.75518,2.675,-1.05037,0.130954,  
+        -2.75219,2.66804,-1.04516,0.129725,  
+        -2.7491,2.66084,-1.03978,0.12846,  
+        -2.7459,2.65339,-1.03423,0.127156,  
+        -2.74257,2.64566,-1.02848,0.125813,  
+        -2.73912,2.63765,-1.02254,0.124428,  
+        -2.73552,2.62934,-1.01638,0.122999,  
+        -2.73178,2.62069,-1.01,0.121523,  
+        -2.72787,2.61169,-1.00337,0.119996,  
+        -2.72379,2.6023,-0.996479,0.118417,  
+        -2.71951,2.59248,-0.989294,0.116778,  
+        -2.71501,2.58218,-0.981786,0.115076,  
+        -2.71026,2.57135,-0.97392,0.113303,  
+        -2.70524,2.55991,-0.965651,0.111453,  
+        -2.69989,2.54778,-0.956922,0.109514,  
+        -2.69416,2.53484,-0.947666,0.107476,  
+        -2.68799,2.52096,-0.937795,0.105324,  
+        -2.68129,2.50596,-0.927197,0.103039,  
+        -2.67394,2.48959,-0.915724,0.100597,  
+        -2.66578,2.47153,-0.903179,0.097968,  
+        -2.65657,2.4513,-0.889283,0.0951084,  
+        -2.64598,2.42824,-0.873638,0.0919592,  
+        -2.63347,2.40127,-0.855632,0.0884325,  
+        -2.61813,2.36864,-0.834261,0.0843898,  
+        -2.59821,2.32701,-0.807691,0.0795876,  
+        -2.56971,2.26887,-0.77188,0.0735132,  
+        -2.51823,2.16823,-0.713448,0.0645399,  
+        -2.33537,1.8294,-0.533852,0.0409941,  
+        -2.53143,2.14818,-0.670502,0.053982,  
+        -2.17737,1.43641,-0.259095,0.00101255,  
+        -2.51929,2.12931,-0.654743,0.0452381,  
+        -2.86122,2.82221,-1.05039,0.0894636,  
+        -2.72908,2.54506,-0.87834,0.0626188,  
+        -2.6536,2.37954,-0.7665,0.0409117,  
+        -2.58374,2.21923,-0.649738,0.0146791,  
+        -2.49284,2.00346,-0.48457,-0.0255348,  
+        -2.32762,1.60337,-0.1698,-0.105287,  
+        -1.80149,0.316787,0.855414,-0.369652,  
+        -2.60242,2.28418,-0.721378,0.040091,  
+        -3.40335,4.25157,-2.29817,0.449834,  
+        -3.0852,3.47341,-1.67791,0.28982,  
+        -2.9642,3.17856,-1.44399,0.229852,  
+        -2.89872,3.01966,-1.31861,0.197945,  
+        -2.85668,2.91811,-1.23894,0.17783,  
+        -2.82679,2.84621,-1.18287,0.163785,  
+        -2.80401,2.79167,-1.14058,0.153278,  
+        -2.78577,2.74819,-1.10706,0.145015,  
+        -2.77061,2.7122,-1.07946,0.138267,  
+        -2.75764,2.68152,-1.05606,0.132589,  
+        -2.74627,2.65475,-1.03575,0.127695,  
+        -2.73612,2.63093,-1.01777,0.123395,  
+        -2.72692,2.6094,-1.00159,0.119553,  
+        -2.71846,2.58968,-0.986841,0.116074,  
+        -2.71061,2.57142,-0.973239,0.112887,  
+        -2.70323,2.55434,-0.960573,0.109937,  
+        -2.69626,2.53824,-0.948678,0.107185,  
+        -2.68962,2.52294,-0.937429,0.104598,  
+        -2.68327,2.50833,-0.926722,0.102151,  
+        -2.67714,2.4943,-0.916477,0.0998223,  
+        -2.67122,2.48076,-0.906627,0.0975966,  
+        -2.66546,2.46764,-0.897118,0.0954599,  
+        -2.65985,2.45489,-0.887903,0.0934011,  
+        -2.65437,2.44244,-0.878945,0.0914107
+            };
+
+#endif // #ifndef WARPX_GODFREY_COEFF_TABLE_H_
diff --git a/Source/Utils/WarpXAlgorithmSelection.H b/Source/Utils/WarpXAlgorithmSelection.H
new file mode 100644
index 000000000..3fb23698a
--- /dev/null
+++ b/Source/Utils/WarpXAlgorithmSelection.H
@@ -0,0 +1,57 @@
+#ifndef UTILS_WARPXALGORITHMSELECTION_H_
+#define UTILS_WARPXALGORITHMSELECTION_H_
+
+#include <AMReX_ParmParse.H>
+#include <string>
+
+struct MaxwellSolverAlgo {
+    // These numbers corresponds to the algorithm code in WarpX's
+    // `warpx_push_bvec` and `warpx_push_evec_f`
+    enum {
+        Yee = 0,
+        CKC = 1
+    };
+};
+
+struct ParticlePusherAlgo {
+    // These numbers correspond to the algorithm code in WarpX's
+    // `warpx_particle_pusher`
+    enum {
+        Boris = 0,
+        Vay = 1
+    };
+};
+
+struct CurrentDepositionAlgo {
+    enum {
+        // These numbers corresponds to the algorithm code in PICSAR's
+        // `depose_jxjyjz_generic` and `depose_jxjyjz_generic_2d`
+         Direct = 3,
+         DirectVectorized = 2,
+         EsirkepovNonOptimized = 1,
+         Esirkepov = 0
+    };
+};
+
+struct ChargeDepositionAlgo {
+    // These numbers corresponds to the algorithm code in WarpX's
+    // `warpx_charge_deposition` function
+    enum {
+         Vectorized = 0,
+         Standard = 1
+    };
+};
+
+struct GatheringAlgo {
+    // These numbers corresponds to the algorithm code in PICSAR's
+    // `geteb3d_energy_conserving_generic` function
+    enum {
+         Vectorized = 0,
+         Standard = 1
+    };
+};
+
+int
+GetAlgorithmInteger( amrex::ParmParse& pp, const char* pp_search_key );
+
+#endif // UTILS_WARPXALGORITHMSELECTION_H_
diff --git a/Source/Utils/WarpXAlgorithmSelection.cpp b/Source/Utils/WarpXAlgorithmSelection.cpp
new file mode 100644
index 000000000..2c8038ccd
--- /dev/null
+++ b/Source/Utils/WarpXAlgorithmSelection.cpp
@@ -0,0 +1,95 @@
+#include <WarpXAlgorithmSelection.H>
+#include <map>
+#include <algorithm>
+#include <cstring>
+
+// Define dictionary with correspondance between user-input strings,
+// and corresponding integer for use inside the code (e.g. in PICSAR).
+
+const std::map<std::string, int> maxwell_solver_algo_to_int = {
+    {"yee",     MaxwellSolverAlgo::Yee },
+#ifndef WARPX_RZ // Not available in RZ
+    {"ckc",     MaxwellSolverAlgo::CKC },
+#endif
+    {"default", MaxwellSolverAlgo::Yee }
+};
+
+const std::map<std::string, int> particle_pusher_algo_to_int = {
+    {"boris",   ParticlePusherAlgo::Boris },
+    {"vay",     ParticlePusherAlgo::Vay },
+    {"default", ParticlePusherAlgo::Boris }
+};
+
+const std::map<std::string, int> current_deposition_algo_to_int = {
+    {"esirkepov",            CurrentDepositionAlgo::Esirkepov },
+    {"direct",               CurrentDepositionAlgo::Direct },
+#if (!defined AMREX_USE_GPU)&&(AMREX_SPACEDIM == 3) // Only available on CPU and 3D
+    {"direct-vectorized",    CurrentDepositionAlgo::DirectVectorized },
+#endif
+    {"default",              CurrentDepositionAlgo::Esirkepov }
+};
+
+const std::map<std::string, int> charge_deposition_algo_to_int = {
+    {"standard",   ChargeDepositionAlgo::Standard },
+#if (!defined AMREX_USE_GPU)&&(AMREX_SPACEDIM == 3) // Only available on CPU and 3D
+    {"vectorized", ChargeDepositionAlgo::Vectorized },
+    {"default",    ChargeDepositionAlgo::Vectorized }
+#else
+    {"default",    ChargeDepositionAlgo::Standard }
+#endif
+};
+
+const std::map<std::string, int> gathering_algo_to_int = {
+    {"standard",   GatheringAlgo::Standard },
+#ifndef AMREX_USE_GPU // Only available on CPU
+    {"vectorized", GatheringAlgo::Vectorized },
+    {"default",    GatheringAlgo::Vectorized }
+#else
+    {"default",    GatheringAlgo::Standard }
+#endif
+};
+
+
+int
+GetAlgorithmInteger( amrex::ParmParse& pp, const char* pp_search_key ){
+
+    // Read user input ; use "default" if it is not found
+    std::string algo = "default";
+    pp.query( pp_search_key, algo );
+    // Convert to lower case
+    std::transform(algo.begin(), algo.end(), algo.begin(), ::tolower);
+
+    // Pick the right dictionary
+    std::map<std::string, int> algo_to_int;
+    if (0 == std::strcmp(pp_search_key, "maxwell_fdtd_solver")) {
+        algo_to_int = maxwell_solver_algo_to_int;
+    } else if (0 == std::strcmp(pp_search_key, "particle_pusher")) {
+        algo_to_int = particle_pusher_algo_to_int;
+    } else if (0 == std::strcmp(pp_search_key, "current_deposition")) {
+        algo_to_int = current_deposition_algo_to_int;
+    } else if (0 == std::strcmp(pp_search_key, "charge_deposition")) {
+        algo_to_int = charge_deposition_algo_to_int;
+    } else if (0 == std::strcmp(pp_search_key, "field_gathering")) {
+        algo_to_int = gathering_algo_to_int;
+    } else {
+        std::string pp_search_string = pp_search_key;
+        amrex::Abort("Unknown algorithm type: " + pp_search_string);
+    }
+
+    // Check if the user-input is a valid key for the dictionary
+    if (algo_to_int.count(algo) == 0){
+        // Not a valid key ; print error message
+        std::string pp_search_string = pp_search_key;
+        std::string error_message = "Invalid string for algo." + pp_search_string
+            + ": " + algo + ".\nThe valid values are:\n";
+        for ( const auto &valid_pair : algo_to_int ) {
+            if (valid_pair.first != "default"){
+                error_message += " - " + valid_pair.first + "\n";
+            }
+        }
+        amrex::Abort(error_message);
+    }
+
+    // If the input is a valid key, return the value
+    return algo_to_int[algo];
+}
diff --git a/Source/Utils/WarpXUtil.cpp b/Source/Utils/WarpXUtil.cpp
index a5ea6d75a..19e898208 100644
--- a/Source/Utils/WarpXUtil.cpp
+++ b/Source/Utils/WarpXUtil.cpp
@@ -51,16 +51,25 @@ void ConvertLabParamsToBoost()
     Vector<Real> prob_hi(AMREX_SPACEDIM);
     Vector<Real> fine_tag_lo(AMREX_SPACEDIM);
     Vector<Real> fine_tag_hi(AMREX_SPACEDIM);
+    Vector<Real> slice_lo(AMREX_SPACEDIM);
+    Vector<Real> slice_hi(AMREX_SPACEDIM);
 
     ParmParse pp_geom("geometry");
     ParmParse pp_wpx("warpx");
     ParmParse pp_amr("amr");
+    ParmParse pp_slice("slice");
 
     pp_geom.getarr("prob_lo",prob_lo,0,AMREX_SPACEDIM);
     BL_ASSERT(prob_lo.size() == AMREX_SPACEDIM);
     pp_geom.getarr("prob_hi",prob_hi,0,AMREX_SPACEDIM);
     BL_ASSERT(prob_hi.size() == AMREX_SPACEDIM);
 
+    pp_slice.queryarr("dom_lo",slice_lo,0,AMREX_SPACEDIM);
+    BL_ASSERT(slice_lo.size() == AMREX_SPACEDIM);
+    pp_slice.queryarr("dom_hi",slice_hi,0,AMREX_SPACEDIM);
+    BL_ASSERT(slice_hi.size() == AMREX_SPACEDIM);
+    
+
     pp_amr.query("max_level", max_level);
     if (max_level > 0){
       pp_wpx.getarr("fine_tag_lo", fine_tag_lo);
@@ -86,15 +95,22 @@ void ConvertLabParamsToBoost()
               fine_tag_lo[idim] *= convert_factor;
               fine_tag_hi[idim] *= convert_factor;
             }
+            slice_lo[idim] *= convert_factor;
+            slice_hi[idim] *= convert_factor;
             break;
         }
     }
+
     pp_geom.addarr("prob_lo", prob_lo);
     pp_geom.addarr("prob_hi", prob_hi);
     if (max_level > 0){
       pp_wpx.addarr("fine_tag_lo", fine_tag_lo);
       pp_wpx.addarr("fine_tag_hi", fine_tag_hi);
     }
+
+    pp_slice.addarr("dom_lo",slice_lo);
+    pp_slice.addarr("dom_hi",slice_hi);
+
 }
 
 /* \brief Function that sets the value of MultiFab MF to zero for z between 
diff --git a/Source/WarpX.H b/Source/WarpX.H
index a2cd02648..67f51784e 100644
--- a/Source/WarpX.H
+++ b/Source/WarpX.H
@@ -23,6 +23,7 @@
 #include <PML.H>
 #include <BoostedFrameDiagnostic.H>
 #include <BilinearFilter.H>
+#include <NCIGodfreyFilter.H>
 
 #ifdef WARPX_USE_PSATD
 #include <fftw3.h>
@@ -146,7 +147,9 @@ public:
 
     static amrex::IntVect filter_npass_each_dir;
     BilinearFilter bilinear_filter;
-
+    amrex::Vector< std::unique_ptr<NCIGodfreyFilter> > nci_godfrey_filter_exeybz;
+    amrex::Vector< std::unique_ptr<NCIGodfreyFilter> > nci_godfrey_filter_bxbyez;
+    
     static int num_mirrors;
     amrex::Vector<amrex::Real> mirror_z;
     amrex::Vector<amrex::Real> mirror_z_width;
diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp
index 6a6add9db..c2cf97f30 100644
--- a/Source/WarpX.cpp
+++ b/Source/WarpX.cpp
@@ -17,6 +17,7 @@
 #include <WarpXConst.H>
 #include <WarpXWrappers.h>
 #include <WarpXUtil.H>
+#include <WarpXAlgorithmSelection.H>
 
 #ifdef BL_USE_SENSEI_INSITU
 #include <AMReX_AmrMeshInSituBridge.H>
@@ -35,11 +36,11 @@ Vector<int> WarpX::boost_direction = {0,0,0};
 int WarpX::do_compute_max_step_from_zmax = 0;
 Real WarpX::zmax_plasma_to_compute_max_step = 0.;
 
-long WarpX::current_deposition_algo = 3;
-long WarpX::charge_deposition_algo = 0;
-long WarpX::field_gathering_algo = 1;
-long WarpX::particle_pusher_algo = 0;
-int WarpX::maxwell_fdtd_solver_id = 0;
+long WarpX::current_deposition_algo;
+long WarpX::charge_deposition_algo;
+long WarpX::field_gathering_algo;
+long WarpX::particle_pusher_algo;
+int WarpX::maxwell_fdtd_solver_id;
 
 long WarpX::nox = 1;
 long WarpX::noy = 1;
@@ -118,7 +119,7 @@ WarpX::ResetInstance ()
 {
     delete m_instance;
     m_instance = nullptr;
-}	
+}
 
 WarpX::WarpX ()
 {
@@ -228,6 +229,11 @@ WarpX::WarpX ()
 #ifdef BL_USE_SENSEI_INSITU
     insitu_bridge = nullptr;
 #endif
+
+    // NCI Godfrey filters can have different stencils
+    // at different levels (the stencil depends on c*dt/dz)
+    nci_godfrey_filter_exeybz.resize(nlevs_max);
+    nci_godfrey_filter_bxbyez.resize(nlevs_max);
 }
 
 WarpX::~WarpX ()
@@ -276,10 +282,10 @@ WarpX::ReadParameters ()
 
     ReadBoostedFrameParameters(gamma_boost, beta_boost, boost_direction);
 
-    // pp.query returns 1 if argument zmax_plasma_to_compute_max_step is 
+    // pp.query returns 1 if argument zmax_plasma_to_compute_max_step is
     // specified by the user, 0 otherwise.
-    do_compute_max_step_from_zmax = 
-        pp.query("zmax_plasma_to_compute_max_step", 
+    do_compute_max_step_from_zmax =
+        pp.query("zmax_plasma_to_compute_max_step",
                   zmax_plasma_to_compute_max_step);
 
     pp.queryarr("B_external", B_external);
@@ -318,7 +324,7 @@ WarpX::ReadParameters ()
                "gamma_boost must be > 1 to use the boosted frame diagnostic.");
 
         pp.query("lab_data_directory", lab_data_directory);
-        
+
         std::string s;
         pp.get("boost_direction", s);
         AMREX_ALWAYS_ASSERT_WITH_MESSAGE( (s == "z" || s == "Z"),
@@ -476,29 +482,12 @@ WarpX::ReadParameters ()
     }
 
     {
-	ParmParse pp("algo");
-	pp.query("current_deposition", current_deposition_algo);
-	pp.query("charge_deposition", charge_deposition_algo);
-	pp.query("field_gathering", field_gathering_algo);
-	pp.query("particle_pusher", particle_pusher_algo);
-	std::string s_solver = "";
-	pp.query("maxwell_fdtd_solver", s_solver);
-        std::transform(s_solver.begin(),
-                       s_solver.end(),
-                       s_solver.begin(),
-                       ::tolower);
-	// if maxwell_fdtd_solver is specified, set the value
-	// of maxwell_fdtd_solver_id accordingly.
-        // Otherwise keep the default value maxwell_fdtd_solver_id=0
-        if (s_solver != "") {
-            if (s_solver == "yee") {
-                maxwell_fdtd_solver_id = 0;
-            } else if (s_solver == "ckc") {
-                maxwell_fdtd_solver_id = 1;
-            } else {
-                amrex::Abort("Unknown FDTD Solver type " + s_solver);
-            }
-        }
+        ParmParse pp("algo");
+        current_deposition_algo = GetAlgorithmInteger(pp, "current_deposition");
+        charge_deposition_algo = GetAlgorithmInteger(pp, "charge_deposition");
+        field_gathering_algo = GetAlgorithmInteger(pp, "field_gathering");
+        particle_pusher_algo = GetAlgorithmInteger(pp, "particle_pusher");
+        maxwell_fdtd_solver_id = GetAlgorithmInteger(pp, "maxwell_fdtd_solver");
     }
 
 #ifdef WARPX_USE_PSATD
@@ -535,7 +524,7 @@ WarpX::ReadParameters ()
        amrex::Vector<Real> slice_hi(AMREX_SPACEDIM);
        Vector<int> slice_crse_ratio(AMREX_SPACEDIM);
        // set default slice_crse_ratio //
-       for (int idim=0; idim < AMREX_SPACEDIM; ++idim ) 
+       for (int idim=0; idim < AMREX_SPACEDIM; ++idim )
        {
           slice_crse_ratio[idim] = 1;
        }
@@ -554,7 +543,7 @@ WarpX::ReadParameters ()
        }
 
     }
- 
+
 }
 
 // This is a virtual function.
@@ -656,7 +645,7 @@ WarpX::AllocLevelData (int lev, const BoxArray& ba, const DistributionMapping& d
     int ngz_nonci = (ngz_tmp % 2) ? ngz_tmp+1 : ngz_tmp;  // Always even number
     int ngz;
     if (WarpX::use_fdtd_nci_corr) {
-        int ng = ngz_tmp + (mypc->nstencilz_fdtd_nci_corr-1);
+        int ng = ngz_tmp + NCIGodfreyFilter::stencil_width;
         ngz = (ng % 2) ? ng+1 : ng;
     } else {
         ngz = ngz_nonci;
@@ -861,8 +850,6 @@ WarpX::AllocLevelMFs (int lev, const BoxArray& ba, const DistributionMapping& dm
     if (load_balance_int > 0) {
         costs[lev].reset(new MultiFab(ba, dm, 1, 0));
     }
-
-
 }
 
 std::array<Real,3>
diff --git a/Tools/performance_tests/GNUmakefile_perftest b/Tools/performance_tests/GNUmakefile_perftest
index 7740b0a64..38275332d 100644
--- a/Tools/performance_tests/GNUmakefile_perftest
+++ b/Tools/performance_tests/GNUmakefile_perftest
@@ -7,6 +7,9 @@ DIM	= 3
 COMP=intel
 TINY_PROFILE = TRUE
 USE_OMP   = TRUE
+USE_CUDA  = FALSE
+USE_ACC   = FALSE
+USE_SENSEI_INSITU = FALSE
 EBASE     = perf_tests
 USE_PYTHON_MAIN = FALSE
 WarpxBinDir = Bin
diff --git a/Tools/performance_tests/automated_test_1_uniform_rest_32ppc b/Tools/performance_tests/automated_test_1_uniform_rest_32ppc
index cce91317c..55c1a6061 100644
--- a/Tools/performance_tests/automated_test_1_uniform_rest_32ppc
+++ b/Tools/performance_tests/automated_test_1_uniform_rest_32ppc
@@ -19,11 +19,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 # Verbosity
 warpx.verbose = 1
 
-# Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Tools/performance_tests/automated_test_2_uniform_rest_1ppc b/Tools/performance_tests/automated_test_2_uniform_rest_1ppc
index 6dfdee422..8e17042c9 100644
--- a/Tools/performance_tests/automated_test_2_uniform_rest_1ppc
+++ b/Tools/performance_tests/automated_test_2_uniform_rest_1ppc
@@ -19,11 +19,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 # Verbosity
 warpx.verbose = 1
 
-# Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Tools/performance_tests/automated_test_3_uniform_drift_4ppc b/Tools/performance_tests/automated_test_3_uniform_drift_4ppc
index 21ac8409f..13af8aaff 100644
--- a/Tools/performance_tests/automated_test_3_uniform_drift_4ppc
+++ b/Tools/performance_tests/automated_test_3_uniform_drift_4ppc
@@ -20,10 +20,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 warpx.verbose = 1
 
 # Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Tools/performance_tests/automated_test_4_labdiags_2ppc b/Tools/performance_tests/automated_test_4_labdiags_2ppc
index f762c9b8c..282c93331 100644
--- a/Tools/performance_tests/automated_test_4_labdiags_2ppc
+++ b/Tools/performance_tests/automated_test_4_labdiags_2ppc
@@ -17,12 +17,6 @@ geometry.prob_hi     =  150.e-6    150.e-6    0.
 # Verbosity
 warpx.verbose = 1
 
-# Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
-
 # Numerics
 interpolation.nox = 3
 interpolation.noy = 3
@@ -78,7 +72,8 @@ ions.momentum_distribution_type = "constant"
 ions.do_continuous_injection = 1
 
 # Laser
-warpx.use_laser = 1
+lasers.nlasers = 1
+lasers.names = laser
 laser.profile      = Gaussian
 laser.position     = 0. 0. -1.e-6 # This point is on the laser plane
 laser.direction    = 0. 0. 1.    # The plane normal direction
diff --git a/Tools/performance_tests/automated_test_5_loadimbalance b/Tools/performance_tests/automated_test_5_loadimbalance
index 32707bff2..22c9ec4b6 100644
--- a/Tools/performance_tests/automated_test_5_loadimbalance
+++ b/Tools/performance_tests/automated_test_5_loadimbalance
@@ -18,11 +18,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 
 warpx.verbose = 1
 warpx.load_balance_int = 5
-# Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Tools/performance_tests/automated_test_6_output_2ppc b/Tools/performance_tests/automated_test_6_output_2ppc
index 01de623ca..f4498c410 100644
--- a/Tools/performance_tests/automated_test_6_output_2ppc
+++ b/Tools/performance_tests/automated_test_6_output_2ppc
@@ -19,11 +19,6 @@ geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
 # Verbosity
 warpx.verbose = 1
 
-# Algorithms
-algo.current_deposition = 0
-algo.charge_deposition = 0
-algo.field_gathering = 0
-algo.particle_pusher = 0
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
diff --git a/Tools/performance_tests/run_automated.py b/Tools/performance_tests/run_automated.py
index f154c1308..dca038c6c 100644
--- a/Tools/performance_tests/run_automated.py
+++ b/Tools/performance_tests/run_automated.py
@@ -1,9 +1,10 @@
 #!/usr/common/software/python/2.7-anaconda-4.4/bin/python
 
-import os, sys, shutil, datetime
+import os, sys, shutil, datetime, git
 import argparse, re, time, copy
 import pandas as pd
-from functions_perftest import *
+from functions_perftest import store_git_hash, get_file_content, \
+                               run_batch_nnode, extract_dataframe
 
 # typical use: python run_automated.py --n_node_list='1,8,16,32' --automated
 # Assume warpx, picsar, amrex and perf_logs repos ar in the same directory and
@@ -225,6 +226,7 @@ def process_analysis():
     batch_string += '#SBATCH -o read_output.txt\n'
     batch_string += '#SBATCH --mail-type=end\n'
     batch_string += '#SBATCH --account=m2852\n'
+    batch_string += 'module load h5py-parallel\n'
     batch_string += 'python ' + __file__ + ' --compiler=' + \
                     args.compiler + ' --architecture=' + args.architecture + \
                     ' --mode=read' + \
@@ -303,7 +305,8 @@ for n_node in n_node_list:
             # Load file perf_database_file if exists, and
             # append with results from this scan
             if os.path.exists(perf_database_file):
-                df_base = pd.read_hdf(perf_database_file, 'all_data')
+                df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
+                # df_base = pd.read_hdf(perf_database_file, 'all_data')
                 updated_df = df_base.append(df_newline, ignore_index=True)
             else:
                 updated_df = df_newline