48 files changed, 2053 insertions, 757 deletions
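The main user-facing change on the Python side of this patch is that simulation-frame diagnostics are now registered as objects (``picmi.FieldDiagnostic``, ``picmi.ParticleDiagnostic``) via ``Simulation.add_diagnostic``, instead of passing ``warpx_plot_int`` to ``Simulation``. Below is a minimal usage sketch, not part of the patch itself; the grid and plasma parameters are borrowed from the Langmuir example added later in this diff:

    # Hedged sketch of the diagnostics interface added in this patch;
    # parameters follow Examples/Tests/Langmuir/langmuir_PICMI_rt.py.
    from pywarpx import picmi

    uniform_plasma = picmi.UniformDistribution(density=1.e25,
                                               upper_bound=[0., None, None],
                                               directed_velocity=[0.1, 0., 0.])
    electrons = picmi.Species(particle_type='electron', name='electrons',
                              initial_distribution=uniform_plasma)

    grid = picmi.Cartesian3DGrid(number_of_cells=[64, 64, 64],
                                 lower_bound=[-20.e-6, -20.e-6, -20.e-6],
                                 upper_bound=[20.e-6, 20.e-6, 20.e-6],
                                 lower_boundary_conditions=['periodic']*3,
                                 upper_boundary_conditions=['periodic']*3,
                                 warpx_max_grid_size=32, warpx_coord_sys=0)
    solver = picmi.ElectromagneticSolver(grid=grid, cfl=1.)

    sim = picmi.Simulation(solver=solver, max_steps=40, verbose=1)
    sim.add_species(electrons,
                    layout=picmi.GriddedLayout(n_macroparticle_per_cell=[2, 2, 2],
                                               grid=grid))

    # New in this patch: diagnostics objects replace warpx_plot_int.
    field_diag = picmi.FieldDiagnostic(grid=grid, period=40)
    part_diag = picmi.ParticleDiagnostic(period=40, species=[electrons])
    sim.add_diagnostic(field_diag)
    sim.add_diagnostic(part_diag)

    sim.step()

Because every simulation-frame diagnostic maps onto the single ``amr.plot_int`` input parameter, all periods must currently agree; see the closing note after the diff.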
diff --git a/Docs/source/building/building.rst b/Docs/source/building/building.rst index 5913f283d..33357a4b6 100644 --- a/Docs/source/building/building.rst +++ b/Docs/source/building/building.rst @@ -107,3 +107,4 @@ Advanced building instructions spectral cori summitdev + spack diff --git a/Docs/source/building/spack.rst b/Docs/source/building/spack.rst new file mode 100644 index 000000000..d54c6cc10 --- /dev/null +++ b/Docs/source/building/spack.rst @@ -0,0 +1,47 @@ +Building WarpX with Spack +=============================== + +WarpX can be installed using Spack. From the Spack web page: "Spack is a package management tool designed to support multiple +versions and configurations of software on a wide variety of platforms and environments." + +Spack is available from `GitHub <https://github.com/spack/spack>`__. Spack only needs to be cloned and can be used right away - there are no installation +steps. The spack command, "spack/bin/spack", can be used directly, or "spack/bin" can be added to your ``PATH``. + +WarpX is built with the single command + +:: + + spack install warpx + +This will build the 3-D version of WarpX using the master branch. +At the very end of the output from the build sequence, Spack tells you where the WarpX executable has been placed. +Alternatively, the "spack load" command can be configured so that "spack load warpx" will put the executable in your ``PATH``. + +To build using the dev branch, the command is + +:: + + spack install warpx@dev + + +Other variants of WarpX can be installed, for example + +:: + + spack install warpx dims=2 + +will build the 2-D version. + +:: + + spack install warpx debug=True + +will build with debugging turned on. + +:: + + spack install warpx %intel + +will build using the Intel compiler (instead of gcc). + +The Python version of WarpX is not yet available with Spack. diff --git a/Docs/source/building/summitdev.rst b/Docs/source/building/summitdev.rst index e194db73c..c5a3a7034 100644 --- a/Docs/source/building/summitdev.rst +++ b/Docs/source/building/summitdev.rst @@ -13,7 +13,7 @@ correct branch: git clone https://github.com/ECP-WarpX/WarpX.git cd WarpX - git checkout gpu + git checkout dev cd .. git clone https://bitbucket.org/berkeleylab/picsar.git @@ -33,4 +33,4 @@ Then, use the following set of commands to compile: module load pgi module load cuda - make + make -j 4 USE_GPU=TRUE COMP=pgi diff --git a/Docs/source/conf.py b/Docs/source/conf.py index 1d4b97c49..ce6f436d3 100644 --- a/Docs/source/conf.py +++ b/Docs/source/conf.py @@ -88,6 +88,8 @@ todo_include_todos = False # html_theme = 'sphinx_rtd_theme' +numfig = True + # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. diff --git a/Docs/source/running_cpp/parameters.rst b/Docs/source/running_cpp/parameters.rst index 3c579d6fd..e0a0ac2b2 100644 --- a/Docs/source/running_cpp/parameters.rst +++ b/Docs/source/running_cpp/parameters.rst @@ -376,11 +376,10 @@ Numerics and algorithms - ``2``: Direct deposition, vectorized - ``3``: Direct deposition, non-optimized - .. warning :: + .. warning:: - The vectorized Esirkepov deposition - (``algo.current_deposition=0``) is currently not functional in WarpX. - All the other methods (``1``, ``2`` and ``3``) are functional. + On GPU, use ``algo.current_deposition=0`` for Esirkepov + or ``3`` for direct deposition.
* ``algo.charge_deposition`` (`integer`) The algorithm for the charge density deposition: @@ -393,6 +392,10 @@ Numerics and algorithms - ``0``: Vectorized version - ``1``: Non-optimized version + .. warning:: + + The vectorized version does not run on GPU. Use + ``algo.field_gather=1`` when running on GPU. * ``algo.particle_pusher`` (`integer`) The algorithm for the particle pusher: @@ -470,6 +473,9 @@ Diagnostics and output Only used when mesh refinement is activated and ``warpx.plot_raw_fields`` is ``1``. Whether to output the data of the coarse patch, in the plot files. +* ``amr.plot_file`` (`string`) + Root for output file names. Supports sub-directories. Default: ``plotfiles/plt``. + Checkpoints and restart ----------------------- WarpX supports checkpoints/restart via AMReX. diff --git a/Docs/source/visualization/sensei.rst b/Docs/source/visualization/sensei.rst index 4881f9d78..59f879e47 100644 --- a/Docs/source/visualization/sensei.rst +++ b/Docs/source/visualization/sensei.rst @@ -233,19 +233,6 @@ To access the SENSEI modulefiles on cori first add the SENSEI install to the sea module use /usr/common/software/sensei/modulefiles -Examples -=================== - -2D LPA Example --------------- - -* :download:`input file<./inputs.2d>` -* :download:`xml file<./ez2d.xml>` -* :download:`session file<./ez2d.session>` -* :E field screen shot at time step 40 -.. figure:: ez2d_00040.png - :alt: picture - 3D LPA Example -------------- This section shows an example of using SENSEI and three different back ends on diff --git a/Examples/Physics_applications/laser_acceleration/laser_acceleration_PICMI.py b/Examples/Physics_applications/laser_acceleration/laser_acceleration_PICMI.py index 77cd2e176..867e7a951 100644 --- a/Examples/Physics_applications/laser_acceleration/laser_acceleration_PICMI.py +++ b/Examples/Physics_applications/laser_acceleration/laser_acceleration_PICMI.py @@ -95,6 +95,20 @@ solver = picmi.ElectromagneticSolver(grid=grid, method='CKC', cfl=1.)
########################## +# diagnostics +########################## + +field_diag1 = picmi.FieldDiagnostic(grid = grid, + period = 100, + warpx_plot_raw_fields = 1, + warpx_plot_raw_fields_guards = 1, + warpx_plot_finepatch = 1, + warpx_plot_crsepatch = 1) + +part_diag1 = picmi.ParticleDiagnostic(period = 100, + species = [electrons]) + +########################## # simulation setup ########################## @@ -102,7 +116,6 @@ sim = picmi.Simulation(solver = solver, max_steps = max_steps, verbose = 1, cfl = 1.0, - warpx_plot_int = 100, warpx_current_deposition_algo = 3, warpx_charge_deposition_algo = 0, warpx_field_gathering_algo = 0, @@ -112,6 +125,8 @@ sim.add_species(electrons, layout=picmi.GriddedLayout(grid=grid, n_macroparticle sim.add_laser(laser, injection_method=laser_antenna) +sim.add_diagnostic(field_diag1) +sim.add_diagnostic(part_diag1) ########################## # simulation run diff --git a/Examples/Tests/Langmuir/inputs.multi.2d.rt b/Examples/Tests/Langmuir/inputs.multi.2d.rt index 6f301a7c2..ce82fd1c7 100644 --- a/Examples/Tests/Langmuir/inputs.multi.2d.rt +++ b/Examples/Tests/Langmuir/inputs.multi.2d.rt @@ -25,7 +25,7 @@ warpx.serialize_ics = 1 warpx.verbose = 1 # Algorithms -algo.current_deposition = 3 +algo.current_deposition = 0 algo.charge_deposition = 0 algo.field_gathering = 1 algo.particle_pusher = 0 diff --git a/Examples/Tests/Langmuir/inputs.rt b/Examples/Tests/Langmuir/inputs.rt index ec265bca7..3ec8fc47d 100644 --- a/Examples/Tests/Langmuir/inputs.rt +++ b/Examples/Tests/Langmuir/inputs.rt @@ -25,7 +25,7 @@ warpx.serialize_ics = 1 warpx.verbose = 1 # Algorithms -algo.current_deposition = 3 +algo.current_deposition = 0 algo.charge_deposition = 0 algo.field_gathering = 1 algo.particle_pusher = 0 diff --git a/Examples/Tests/Langmuir/langmuir_PICMI_rt.py b/Examples/Tests/Langmuir/langmuir_PICMI_rt.py new file mode 100644 index 000000000..dab414b3b --- /dev/null +++ b/Examples/Tests/Langmuir/langmuir_PICMI_rt.py @@ -0,0 +1,48 @@ +# --- Simple example of Langmuir oscillations in a uniform plasma + +import numpy as np +from pywarpx import picmi + +nx = 64 +ny = 64 +nz = 64 + +xmin = -20.e-6 +ymin = -20.e-6 +zmin = -20.e-6 +xmax = +20.e-6 +ymax = +20.e-6 +zmax = +20.e-6 + +uniform_plasma = picmi.UniformDistribution(density=1.e25, upper_bound=[0., None, None], directed_velocity=[0.1, 0., 0.]) + +electrons = picmi.Species(particle_type='electron', name='electrons', initial_distribution=uniform_plasma) + +grid = picmi.Cartesian3DGrid(number_of_cells = [nx, ny, nz], + lower_bound = [xmin, ymin, zmin], + upper_bound = [xmax, ymax, zmax], + lower_boundary_conditions = ['periodic', 'periodic', 'periodic'], + upper_boundary_conditions = ['periodic', 'periodic', 'periodic'], + moving_window_velocity = [0., 0., 0.], + warpx_max_grid_size=32, warpx_coord_sys=0) + +solver = picmi.ElectromagneticSolver(grid=grid, cfl=1.) + +sim = picmi.Simulation(solver = solver, + max_steps = 40, + verbose = 1, + warpx_plot_int = 40, + warpx_current_deposition_algo = 3, + warpx_charge_deposition_algo = 0, + warpx_field_gathering_algo = 0, + warpx_particle_pusher_algo = 0) + +sim.add_species(electrons, layout=picmi.GriddedLayout(n_macroparticle_per_cell=[2,2,2], grid=grid)) + +# write_inputs will create an inputs file that can be used to run +# with the compiled version. 
+sim.write_input_file(file_name='inputs_from_PICMI') + +# Alternatively, sim.step will run WarpX, controlling it from Python +sim.step() + diff --git a/Examples/Tests/PML/analysis_pml_ckc.py b/Examples/Tests/PML/analysis_pml_ckc.py new file mode 100755 index 000000000..90d13cecc --- /dev/null +++ b/Examples/Tests/PML/analysis_pml_ckc.py @@ -0,0 +1,34 @@ +#! /usr/bin/env python + +import sys +import yt ; yt.funcs.mylog.setLevel(0) +import numpy as np +import scipy.constants as scc + +filename = sys.argv[1] + +############################ +### INITIAL LASER ENERGY ### +############################ +energy_start = 9.1301289517e-08 + +########################## +### FINAL LASER ENERGY ### +########################## +ds = yt.load( filename ) +all_data_level_0 = ds.covering_grid(level=0,left_edge=ds.domain_left_edge, dims=ds.domain_dimensions) +Bx = all_data_level_0['boxlib', 'Bx'].v.squeeze() +By = all_data_level_0['boxlib', 'By'].v.squeeze() +Bz = all_data_level_0['boxlib', 'Bz'].v.squeeze() +Ex = all_data_level_0['boxlib', 'Ex'].v.squeeze() +Ey = all_data_level_0['boxlib', 'Ey'].v.squeeze() +Ez = all_data_level_0['boxlib', 'Ez'].v.squeeze() +energyE = np.sum(scc.epsilon_0/2*(Ex**2+Ey**2+Ez**2)) +energyB = np.sum(1./scc.mu_0/2*(Bx**2+By**2+Bz**2)) +energy_end = energyE + energyB + +Reflectivity = energy_end/energy_start +Reflectivity_theory = 1.8015e-06 + +assert( abs(Reflectivity-Reflectivity_theory) < 5./100 * Reflectivity_theory ) + diff --git a/Examples/Tests/PML/analysis_pml_yee.py b/Examples/Tests/PML/analysis_pml_yee.py new file mode 100755 index 000000000..6234cd5d2 --- /dev/null +++ b/Examples/Tests/PML/analysis_pml_yee.py @@ -0,0 +1,34 @@ +#! /usr/bin/env python + +import sys +import yt ; yt.funcs.mylog.setLevel(0) +import numpy as np +import scipy.constants as scc + +filename = sys.argv[1] + +############################ +### INITIAL LASER ENERGY ### +############################ +energy_start = 9.1301289517e-08 + +########################## +### FINAL LASER ENERGY ### +########################## +ds = yt.load( filename ) +all_data_level_0 = ds.covering_grid(level=0,left_edge=ds.domain_left_edge, dims=ds.domain_dimensions) +Bx = all_data_level_0['boxlib', 'Bx'].v.squeeze() +By = all_data_level_0['boxlib', 'By'].v.squeeze() +Bz = all_data_level_0['boxlib', 'Bz'].v.squeeze() +Ex = all_data_level_0['boxlib', 'Ex'].v.squeeze() +Ey = all_data_level_0['boxlib', 'Ey'].v.squeeze() +Ez = all_data_level_0['boxlib', 'Ez'].v.squeeze() +energyE = np.sum(scc.epsilon_0/2*(Ex**2+Ey**2+Ez**2)) +energyB = np.sum(1./scc.mu_0/2*(Bx**2+By**2+Bz**2)) +energy_end = energyE + energyB + +Reflectivity = energy_end/energy_start +Reflectivity_theory = 5.683000058954201e-07 + +assert( abs(Reflectivity-Reflectivity_theory) < 5./100 * Reflectivity_theory ) + diff --git a/Examples/Tests/gpu_test/inputs b/Examples/Tests/gpu_test/inputs new file mode 100644 index 000000000..18bb80d26 --- /dev/null +++ b/Examples/Tests/gpu_test/inputs @@ -0,0 +1,69 @@ +# Maximum number of time steps +max_step = 10 + +# number of grid points +amr.n_cell = 64 64 64 + +# Maximum allowable size of each subdomain in the problem domain; +# this is used to decompose the domain for parallel calculations. +amr.max_grid_size = 64 + +# Maximum level in hierarchy (for now must be 0, i.e., one level in total) +amr.max_level = 0 + +# Geometry +geometry.coord_sys = 0 # 0: Cartesian +geometry.is_periodic = 1 1 1 # Is periodic? 
+geometry.prob_lo = -20.e-6 -20.e-6 -20.e-6 # physical domain +geometry.prob_hi = 20.e-6 20.e-6 20.e-6 + +warpx.serialize_ics = 1 +warpx.do_pml = 0 + +# Verbosity +warpx.verbose = 1 + +# Algorithms +algo.current_deposition = 3 +algo.charge_deposition = 0 +algo.field_gathering = 1 +algo.particle_pusher = 0 + +interpolation.nox = 1 +interpolation.noy = 1 +interpolation.noz = 1 + +particles.do_tiling = 0 + +# CFL +warpx.cfl = 1.0 + +amr.plot_int = -10 + +particles.nspecies = 1 +particles.species_names = electrons + +electrons.charge = -q_e +electrons.mass = m_e +electrons.injection_style = "NUniformPerCell" +electrons.num_particles_per_cell_each_dim = 2 2 2 +electrons.profile = constant +electrons.density = 1.e25 # number of electrons per m^3 +electrons.momentum_distribution_type = "gaussian" +electrons.ux_th = 0.01 # uth the std of the (unitless) momentum +electrons.uy_th = 0.01 # uth the std of the (unitless) momentum +electrons.uz_th = 0.01 # uth the std of the (unitless) momentum +electrons.uz_m = 10. # Mean momentum along z (unitless) + +# Laser +warpx.use_laser = 1 +laser.profile = Gaussian +laser.position = 0. 0. 0.e-6 # This point is on the laser plane +laser.direction = 0. 0. 1. # The plane normal direction +laser.polarization = 1. 0. 0. # The main polarization vector +laser.e_max = 16.e12 # Maximum amplitude of the laser field (in V/m) +laser.profile_waist = 3.e-6 # The waist of the laser (in meters) +laser.profile_duration = 15.e-15 # The duration of the laser (in seconds) +laser.profile_t_peak = 30.e-15 # The time at which the laser reaches its peak (in seconds) +laser.profile_focal_distance = 100.e-6 # Focal distance from the antenna (in meters) +laser.wavelength = 0.8e-6 # The wavelength of the laser (in meters) diff --git a/Examples/Tests/gpu_test/script.sh b/Examples/Tests/gpu_test/script.sh new file mode 100755 index 000000000..cd6b0eadd --- /dev/null +++ b/Examples/Tests/gpu_test/script.sh @@ -0,0 +1,48 @@ +#!/bin/bash +#BSUB -P GEN109 +#BSUB -W 0:10 +#BSUB -nnodes 1 +#BSUB -J WarpX +#BSUB -o WarpXo.%J +#BSUB -e WarpXe.%J + +module load pgi +module load cuda/9.1.85 +module list +set -x + +omp=1 +export OMP_NUM_THREADS=${omp} +#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex" +EXE="../main3d.pgi.TPROF.MPI.ACC.CUDA.ex" +#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} " +#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} " +JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} " + +rundir="${LSB_JOBNAME}-${LSB_JOBID}" +mkdir $rundir +cp $0 $rundir +cp inputs $rundir +cd $rundir + +# 1. Run normally +${JSRUN} --smpiargs="-gpu" ${EXE} inputs + +# 2. Run under cuda-memcheck +# ${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt + +# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt +#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt + +# 4. Run under nvprof and store performance data in a nvvp file +# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp +#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs + +# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set nsteps=2 in the inputs files +# 5. Run under nvprof and collect metrics for a subset of kernels +#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs + +# 6. Run under nvprof and collect metrics for all kernels -- much slower! 
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs + +cp ../WarpX*.${LSB_JOBID} . diff --git a/GNUmakefile b/GNUmakefile index 01b9f03ef..06f4c6229 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -10,6 +10,7 @@ DIM = 3 COMP = gcc #COMP = intel +#COMP = pgi TINY_PROFILE = TRUE #PROFILE = TRUE @@ -19,6 +20,7 @@ TINY_PROFILE = TRUE STORE_OLD_PARTICLE_ATTRIBS = FALSE USE_OMP = TRUE +USE_GPU = FALSE EBASE = main diff --git a/Python/pywarpx/Bucket.py b/Python/pywarpx/Bucket.py index c73b3dac9..66494a700 100644 --- a/Python/pywarpx/Bucket.py +++ b/Python/pywarpx/Bucket.py @@ -20,6 +20,10 @@ class Bucket(object): except KeyError: return object.__getattr__(self, name) + def check_consistency(self, vname, value, errmsg): + if vname in self.argvattrs: + assert (self.argvattrs[vname] is None) or (self.argvattrs[vname] == value), Exception(errmsg) + def attrlist(self): "Concatenate the attributes into a string" result = [] diff --git a/Python/pywarpx/WarpX.py b/Python/pywarpx/WarpX.py index 9c4a4b380..f58d4f111 100644 --- a/Python/pywarpx/WarpX.py +++ b/Python/pywarpx/WarpX.py @@ -70,7 +70,7 @@ class WarpX(Bucket): argv = self.create_argv_list() with open(filename, 'w') as ff: - for k, v in kw.iteritems(): + for k, v in kw.items(): ff.write('{0} = {1}\n'.format(k, v)) for arg in argv: diff --git a/Python/pywarpx/picmi.py b/Python/pywarpx/picmi.py index 1d02f8b29..81b1543bb 100644 --- a/Python/pywarpx/picmi.py +++ b/Python/pywarpx/picmi.py @@ -462,6 +462,9 @@ class Simulation(picmistandard.PICMI_Simulation): self.lasers[i].initialize_inputs() self.laser_injection_methods[i].initialize_inputs(self.lasers[i]) + for diagnostic in self.diagnostics: + diagnostic.initialize_inputs() + def initialize_warpx(self): if self.warpx_initialized: return @@ -492,3 +495,90 @@ class Simulation(picmistandard.PICMI_Simulation): if self.warpx_initialized: self.warpx_initialized = False pywarpx.warpx.finalize() + + +# ---------------------------- +# Simulation frame diagnostics +# ---------------------------- + + +class FieldDiagnostic(picmistandard.PICMI_FieldDiagnostic): + def init(self, kw): + + self.plot_raw_fields = kw.pop('warpx_plot_raw_fields', None) + self.plot_raw_fields_guards = kw.pop('warpx_plot_raw_fields_guards', None) + self.plot_finepatch = kw.pop('warpx_plot_finepatch', None) + self.plot_crsepatch = kw.pop('warpx_plot_crsepatch', None) + + def initialize_inputs(self): + # --- For now, the period must be the same as plot_int if set + pywarpx.amr.check_consistency('plot_int', self.period, 'The period must be the same for all simulation frame diagnostics') + pywarpx.amr.plot_int = self.period + + if 'rho' in self.data_list: + pywarpx.warpx.plot_rho = 1 + if 'dive' in self.data_list: + pywarpx.warpx.plot_dive = 1 + if 'divb' in self.data_list: + pywarpx.warpx.plot_divb = 1 + if 'F' in self.data_list: + pywarpx.warpx.plot_F = 1 + if 'proc_number' in self.data_list: + pywarpx.warpx.plot_proc_number = 1 + + pywarpx.warpx.plot_raw_fields = self.plot_raw_fields + pywarpx.warpx.plot_raw_fields_guards = self.plot_raw_fields_guards + + pywarpx.amr.check_consistency('plot_finepatch', self.plot_finepatch, 'The fine patch flag must be the same for all simulation frame field diagnostics') + pywarpx.amr.check_consistency('plot_crsepatch', self.plot_crsepatch, 'The coarse patch flag must be the same for all simulation frame field diagnostics') + pywarpx.warpx.plot_finepatch = self.plot_finepatch + pywarpx.warpx.plot_crsepatch = self.plot_crsepatch + +class 
ElectrostaticFieldDiagnostic(picmistandard.PICMI_ElectrostaticFieldDiagnostic): + def initialize_inputs(self): + # --- For now, the period must be the same as plot_int if set + pywarpx.amr.check_consistency('plot_int', self.period, 'The period must be the same for all simulation frame diagnostics') + pywarpx.amr.plot_int = self.period + + +class ParticleDiagnostic(picmistandard.PICMI_ParticleDiagnostic): + def initialize_inputs(self): + # --- For now, the period must be the same as plot_int if set + pywarpx.amr.check_consistency('plot_int', self.period, 'The period must be the same for all simulation frame diagnostics') + pywarpx.amr.plot_int = self.period + + if 'part_per_cell' in self.data_list: + pywarpx.warpx.plot_part_per_cell = 1 + if 'part_per_grid' in self.data_list: + pywarpx.warpx.plot_part_per_grid = 1 + if 'part_per_proc' in self.data_list: + pywarpx.warpx.plot_part_per_proc = 1 + + +# ---------------------------- +# Lab frame diagnostics +# ---------------------------- + + +class LabFrameFieldDiagnostic(picmistandard.PICMI_LabFrameFieldDiagnostic): + def initialize_inputs(self): + + pywarpx.warpx.check_consistency('num_snapshots_lab', self.num_snapshots, 'The number of snapshots must be the same in all lab frame diagnostics') + pywarpx.warpx.check_consistency('dt_snapshots_lab', self.dt_snapshots, 'The time between snapshots must be the same in all lab frame diagnostics') + + pywarpx.warpx.do_boosted_frame_diagnostic = 1 + pywarpx.warpx.num_snapshots_lab = self.num_snapshots + pywarpx.warpx.dt_snapshots_lab = self.dt_snapshots + pywarpx.warpx.do_boosted_frame_fields = 1 + + +class LabFrameParticleDiagnostic(picmistandard.PICMI_LabFrameParticleDiagnostic): + def initialize_inputs(self): + + pywarpx.warpx.check_consistency('num_snapshots_lab', self.num_snapshots, 'The number of snapshots must be the same in all lab frame diagnostics') + pywarpx.warpx.check_consistency('dt_snapshots_lab', self.dt_snapshots, 'The time between snapshots must be the same in all lab frame diagnostics') + + pywarpx.warpx.do_boosted_frame_diagnostic = 1 + pywarpx.warpx.num_snapshots_lab = self.num_snapshots + pywarpx.warpx.dt_snapshots_lab = self.dt_snapshots + pywarpx.warpx.do_boosted_frame_particles = 1 @@ -6,3 +6,4 @@ WarpX is an advanced electromagnetic Particle-In-Cell code. It supports many features including Perfectly-Matched Layers (PML), mesh refinement, and the boosted-frame technique. In order to learn how to install and run the code, please see the online documentation: https://ecp-warpx.github.io/index.html + diff --git a/Regression/WarpX-tests.ini b/Regression/WarpX-tests.ini index 14af09277..60647e89b 100644 --- a/Regression/WarpX-tests.ini +++ b/Regression/WarpX-tests.ini @@ -60,8 +60,9 @@ branch = master [pml_x_yee] buildDir = . inputFile = Examples/Tests/PML/inputs2d -runtime_params = warpx.do_dynamic_scheduling=0 warpx.maxwell_fdtd_solver=yee +runtime_params = warpx.do_dynamic_scheduling=0 algo.maxwell_fdtd_solver=yee dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -69,13 +70,14 @@ useOMP = 1 numthreads = 2 compileTest = 0 doVis = 0 -analysisRoutine = Examples/Tests/PML/analysis_pml.py +analysisRoutine = Examples/Tests/PML/analysis_pml_yee.py [pml_x_ckc] buildDir = . 
inputFile = Examples/Tests/PML/inputs2d -runtime_params = warpx.do_dynamic_scheduling=0 warpx.maxwell_fdtd_solver=ckc +runtime_params = warpx.do_dynamic_scheduling=0 algo.maxwell_fdtd_solver=ckc dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -83,13 +85,14 @@ useOMP = 1 numthreads = 2 compileTest = 0 doVis = 0 -analysisRoutine = Examples/Tests/PML/analysis_pml.py +analysisRoutine = Examples/Tests/PML/analysis_pml_ckc.py [nci_corrector] buildDir = . inputFile = Examples/Modules/nci_corrector/inputs2d runtime_params = warpx.do_dynamic_scheduling=0 dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -103,6 +106,7 @@ analysisRoutine = Examples/Modules/nci_corrector/ncicorr_analysis.py buildDir = . inputFile = Examples/Tests/Langmuir/inputs.rt dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -120,6 +124,7 @@ analysisOutputImage = langmuir2d_analysis.png buildDir = . inputFile = Examples/Tests/Langmuir/inputs.rt dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 4 @@ -137,6 +142,7 @@ analysisOutputImage = langmuir_x_analysis.png buildDir = . inputFile = Examples/Tests/Langmuir/inputs.rt dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 4 @@ -154,6 +160,7 @@ analysisOutputImage = langmuir_y_analysis.png buildDir = . inputFile = Examples/Tests/Langmuir/inputs.rt dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 4 @@ -171,6 +178,7 @@ analysisOutputImage = langmuir_z_analysis.png buildDir = . inputFile = Examples/Tests/Langmuir/inputs.multi.rt dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 4 @@ -224,6 +232,7 @@ analysisOutputImage = langmuir_multi_2d_analysis.png buildDir = . inputFile = Examples/Modules/laser_injection/inputs.rt dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 4 @@ -239,6 +248,7 @@ analysisOutputImage = laser_analysis.png buildDir = . inputFile = Examples/Modules/laser_injection/inputs.2d.rt dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -246,6 +256,7 @@ useOMP = 1 numthreads = 2 compileTest = 0 doVis = 0 +runtime_params = warpx.do_dynamic_scheduling=0 warpx.serialize_ics=1 compareParticles = 0 [LaserAcceleration] @@ -253,6 +264,7 @@ buildDir = . inputFile = Examples/Physics_applications/laser_acceleration/inputs.3d runtime_params = warpx.do_dynamic_scheduling=0 amr.n_cell=32 32 256 max_step=100 electrons.zmin=0.e-6 warpx.serialize_ics=1 dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -268,6 +280,7 @@ buildDir = . inputFile = Examples/Tests/subcycling/inputs.2d runtime_params = warpx.serialize_ics=1 warpx.do_dynamic_scheduling=0 dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -282,6 +295,7 @@ buildDir = . inputFile = Examples/Physics_applications/laser_acceleration/inputs.2d runtime_params = amr.max_level=1 max_step=100 warpx.serialize_ics=1 dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -297,6 +311,7 @@ buildDir = . inputFile = Examples/Physics_applications/plasma_acceleration/inputs.2d runtime_params = amr.max_level=1 amr.n_cell=32 512 max_step=100 plasma_e.zmin=-200.e-6 warpx.serialize_ics=1 warpx.do_dynamic_scheduling=0 dim = 2 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -309,9 +324,10 @@ particleTypes = beam driver plasma_e [Python_Langmuir] buildDir = . 
-inputFile = Examples/Tests/Langmuir/langmuir_PICMI.py -customRunCmd = python langmuir_PICMI.py +inputFile = Examples/Tests/Langmuir/langmuir_PICMI_rt.py +customRunCmd = python langmuir_PICMI_rt.py dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 2 @@ -327,6 +343,7 @@ outputFile = plotfiles/plt00040 buildDir = . inputFile = Examples/Physics_applications/uniform_plasma/inputs.3d dim = 3 +addToCompileString = restartTest = 1 restartFileNum = 6 useMPI = 1 @@ -342,6 +359,7 @@ particleTypes = electrons buildDir = tests/CurrentDeposition inputFile = inputs dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 1 @@ -355,6 +373,7 @@ outputFile = plotfiles/plt00000 buildDir = tests/FieldGather inputFile = inputs dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 1 @@ -368,6 +387,7 @@ outputFile = plotfiles/plt00000 buildDir = tests/FieldSolver inputFile = inputs dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 1 @@ -381,6 +401,7 @@ outputFile = plotfiles/plt00000 buildDir = tests/ParticlePusher inputFile = inputs dim = 3 +addToCompileString = restartTest = 0 useMPI = 1 numprocs = 1 diff --git a/Regression/prepare_file_travis.py b/Regression/prepare_file_travis.py index 6d0e0dfc8..82b3c8e0e 100644 --- a/Regression/prepare_file_travis.py +++ b/Regression/prepare_file_travis.py @@ -1,22 +1,35 @@ # This script modifies `WarpX-test.ini` (which is used for nightly builds) # and creates the file `travis-test.ini` (which is used for continous # integration on TravisCI (https://travis-ci.org/) -# The subtests that are selected are controlled by the environement WARPX_DIM +# The subtests that are selected are controlled by WARPX_TEST_DIM +# The architecture (CPU/GPU) is selected by WARPX_TEST_ARCH import re import os # Get relevant environment variables -dim = os.environ.get('WARPX_DIM', None) +dim = os.environ.get('WARPX_TEST_DIM', None) +arch = os.environ.get('WARPX_TEST_ARCH', 'CPU') + +# Find the directory in which the tests should be run +current_dir = os.getcwd() +test_dir = re.sub('warpx/Regression', '', current_dir ) with open('WarpX-tests.ini') as f: text = f.read() # Replace default folder name -text = re.sub('/home/regtester/AMReX_RegTesting/', '/home/travis/', text) +text = re.sub('/home/regtester/AMReX_RegTesting', test_dir, text) # Add doComparison = 0 for each test text = re.sub( '\[(?P<name>.*)\]\nbuildDir = ', '[\g<name>]\ndoComparison = 0\nbuildDir = ', text ) +# Change compile options when running on GPU +if arch == 'GPU': + text = re.sub( 'addToCompileString =', + 'addToCompileString = USE_GPU=TRUE USE_OMP=FALSE USE_ACC=TRUE', text) + text = re.sub( 'COMP\s*=.*', 'COMP = pgi', text ) +print('Compiling for %s' %arch) + # Use only 2 cores for compiling text = re.sub( 'numMakeJobs = \d+', 'numMakeJobs = 2', text ) diff --git a/Source/LaserParticleContainer.cpp b/Source/LaserParticleContainer.cpp index 8c5d799e3..cf669d32d 100644 --- a/Source/LaserParticleContainer.cpp +++ b/Source/LaserParticleContainer.cpp @@ -91,7 +91,7 @@ LaserParticleContainer::LaserParticleContainer (AmrCore* amr_core, int ispecies) if (WarpX::gamma_boost > 1.) { // Check that the laser direction is equal to the boost direction - AMREX_ALWAYS_ASSERT_WITH_MESSAGE( + AMREX_ALWAYS_ASSERT_WITH_MESSAGE( nvec[0]*WarpX::boost_direction[0] + nvec[1]*WarpX::boost_direction[1] + nvec[2]*WarpX::boost_direction[2] - 1. 
< 1.e-12, @@ -110,22 +110,22 @@ LaserParticleContainer::LaserParticleContainer (AmrCore* amr_core, int ispecies) p_X = { p_X[0]*s, p_X[1]*s, p_X[2]*s }; Real dp = std::inner_product(nvec.begin(), nvec.end(), p_X.begin(), 0.0); - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(std::abs(dp) < 1.0e-14, + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(std::abs(dp) < 1.0e-14, "Laser plane vector is not perpendicular to the main polarization vector"); p_Y = CrossProduct(nvec, p_X); // The second polarization vector s = 1.0/std::sqrt(stc_direction[0]*stc_direction[0] + stc_direction[1]*stc_direction[1] + stc_direction[2]*stc_direction[2]); - stc_direction = { stc_direction[0]*s, stc_direction[1]*s, stc_direction[2]*s }; + stc_direction = { stc_direction[0]*s, stc_direction[1]*s, stc_direction[2]*s }; dp = std::inner_product(nvec.begin(), nvec.end(), stc_direction.begin(), 0.0); - AMREX_ALWAYS_ASSERT_WITH_MESSAGE(std::abs(dp) < 1.0e-14, + AMREX_ALWAYS_ASSERT_WITH_MESSAGE(std::abs(dp) < 1.0e-14, "stc_direction is not perpendicular to the laser plane vector"); - + // Get angle between p_X and stc_direction // in 2d, stcs are in the simulation plane #if AMREX_SPACEDIM == 3 - theta_stc = acos(stc_direction[0]*p_X[0] + - stc_direction[1]*p_X[1] + + theta_stc = acos(stc_direction[0]*p_X[0] + + stc_direction[1]*p_X[1] + stc_direction[2]*p_X[2]); #else theta_stc = 0.; @@ -250,7 +250,7 @@ LaserParticleContainer::InitData (int lev) BoxArray plane_ba { Box {IntVect(plane_lo[0],0), IntVect(plane_hi[0],0)} }; #endif - Vector<Real> particle_x, particle_y, particle_z, particle_w; + RealVector particle_x, particle_y, particle_z, particle_w; const DistributionMapping plane_dm {plane_ba, nprocs}; const Vector<int>& procmap = plane_dm.ProcessorMap(); @@ -263,7 +263,7 @@ LaserParticleContainer::InitData (int lev) { const Vector<Real>& pos = Transform(cell[0], cell[1]); #if (AMREX_SPACEDIM == 3) - const Real* x = pos.data(); + const Real* x = pos.dataPtr(); #else const Real x[2] = {pos[0], pos[2]}; #endif @@ -281,15 +281,15 @@ LaserParticleContainer::InitData (int lev) } } const int np = particle_z.size(); - Vector<Real> particle_ux(np, 0.0); - Vector<Real> particle_uy(np, 0.0); - Vector<Real> particle_uz(np, 0.0); + RealVector particle_ux(np, 0.0); + RealVector particle_uy(np, 0.0); + RealVector particle_uz(np, 0.0); if (Verbose()) amrex::Print() << "Adding laser particles\n"; AddNParticles(lev, - np, particle_x.data(), particle_y.data(), particle_z.data(), - particle_ux.data(), particle_uy.data(), particle_uz.data(), - 1, particle_w.data(), 1); + np, particle_x.dataPtr(), particle_y.dataPtr(), particle_z.dataPtr(), + particle_ux.dataPtr(), particle_uy.dataPtr(), particle_uz.dataPtr(), + 1, particle_w.dataPtr(), 1); } void @@ -297,8 +297,8 @@ LaserParticleContainer::Evolve (int lev, const MultiFab&, const MultiFab&, const MultiFab&, const MultiFab&, const MultiFab&, const MultiFab&, MultiFab& jx, MultiFab& jy, MultiFab& jz, - MultiFab*, MultiFab*, MultiFab*, - MultiFab* rho, MultiFab*, + MultiFab* cjx, MultiFab* cjy, MultiFab* cjz, + MultiFab* rho, MultiFab* crho, const MultiFab*, const MultiFab*, const MultiFab*, const MultiFab*, const MultiFab*, const MultiFab*, Real t, Real dt) @@ -307,11 +307,7 @@ LaserParticleContainer::Evolve (int lev, BL_PROFILE_VAR_NS("Laser::Evolve::Copy", blp_copy); BL_PROFILE_VAR_NS("PICSAR::LaserParticlePush", blp_pxr_pp); BL_PROFILE_VAR_NS("PICSAR::LaserCurrentDepo", blp_pxr_cd); - - const std::array<Real,3>& dx = WarpX::CellSize(lev); - - // WarpX assumes the same number of guard cells for Jx, Jy, Jz - long ngJ = 
jx.nGrow(); + BL_PROFILE_VAR_NS("Laser::Evolve::Accumulate", blp_accumulate); Real t_lab = t; if (WarpX::gamma_boost > 1) { @@ -329,8 +325,18 @@ LaserParticleContainer::Evolve (int lev, #pragma omp parallel #endif { - Vector<Real> xp, yp, zp, giv, plane_Xp, plane_Yp, amplitude_E; - FArrayBox local_rho, local_jx, local_jy, local_jz; +#ifdef _OPENMP + int thread_num = omp_get_thread_num(); +#else + int thread_num = 0; +#endif + + if (local_rho[thread_num] == nullptr) local_rho[thread_num].reset( new amrex::FArrayBox()); + if (local_jx[thread_num] == nullptr) local_jx[thread_num].reset( new amrex::FArrayBox()); + if (local_jy[thread_num] == nullptr) local_jy[thread_num].reset( new amrex::FArrayBox()); + if (local_jz[thread_num] == nullptr) local_jz[thread_num].reset( new amrex::FArrayBox()); + + Cuda::DeviceVector<Real> plane_Xp, plane_Yp, amplitude_E; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -346,13 +352,11 @@ LaserParticleContainer::Evolve (int lev, auto& uzp = attribs[PIdx::uz]; const long np = pti.numParticles(); + // For now, laser particles do not take the current buffers into account + const long np_current = np; - // Data on the grid - FArrayBox& jxfab = jx[pti]; - FArrayBox& jyfab = jy[pti]; - FArrayBox& jzfab = jz[pti]; + m_giv[thread_num].resize(np); - giv.resize(np); plane_Xp.resize(np); plane_Yp.resize(np); amplitude_E.resize(np); @@ -361,198 +365,78 @@ LaserParticleContainer::Evolve (int lev, // copy data from particle container to temp arrays // BL_PROFILE_VAR_START(blp_copy); - pti.GetPosition(xp, yp, zp); + pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]); BL_PROFILE_VAR_STOP(blp_copy); - for (int i = 0; i < np; ++i) - { - // Find the coordinates of the particles in the emission plane -#if (AMREX_SPACEDIM == 3) - plane_Xp[i] = u_X[0]*(xp[i] - position[0]) - + u_X[1]*(yp[i] - position[1]) - + u_X[2]*(zp[i] - position[2]); - plane_Yp[i] = u_Y[0]*(xp[i] - position[0]) - + u_Y[1]*(yp[i] - position[1]) - + u_Y[2]*(zp[i] - position[2]); -#elif (AMREX_SPACEDIM == 2) - plane_Xp[i] = u_X[0]*(xp[i] - position[0]) - + u_X[2]*(zp[i] - position[2]); - plane_Yp[i] = 0; -#endif - } - - const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev); - const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev); - - long lvect = 8; - - auto depositCharge = [&] (MultiFab* rhomf, int icomp) - { - long ngRho = rhomf->nGrow(); - - Real* data_ptr; - const int *rholen; - FArrayBox& rhofab = (*rhomf)[pti]; - Box tile_box = convert(pti.tilebox(), IntVect::TheUnitVector()); - Box grown_box; - const std::array<Real, 3>& xyzmin = xyzmin_tile; - tile_box.grow(ngRho); - local_rho.resize(tile_box); - local_rho = 0.0; - data_ptr = local_rho.dataPtr(); - rholen = local_rho.length(); - -#if (AMREX_SPACEDIM == 3) - const long nx = rholen[0]-1-2*ngRho; - const long ny = rholen[1]-1-2*ngRho; - const long nz = rholen[2]-1-2*ngRho; -#else - const long nx = rholen[0]-1-2*ngRho; - const long ny = 0; - const long nz = rholen[1]-1-2*ngRho; -#endif - warpx_charge_deposition(data_ptr, &np, - xp.data(), yp.data(), zp.data(), wp.data(), - &this->charge, - &xyzmin[0], &xyzmin[1], &xyzmin[2], - &dx[0], &dx[1], &dx[2], &nx, &ny, &nz, - &ngRho, &ngRho, &ngRho, - &WarpX::nox,&WarpX::noy,&WarpX::noz, - &lvect, &WarpX::charge_deposition_algo); - - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_rho), - BL_TO_FORTRAN_N_3D(rhofab,icomp), ncomp); - }; - - if (rho) depositCharge(rho,0); + if (rho) DepositCharge(pti, wp, rho, crho, 0, 
np_current, np, thread_num, lev); // // Particle Push // BL_PROFILE_VAR_START(blp_pxr_pp); + // Find the coordinates of the particles in the emission plane + calculate_laser_plane_coordinates( &np, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + plane_Xp.dataPtr(), plane_Yp.dataPtr(), + &u_X[0], &u_X[1], &u_X[2], &u_Y[0], &u_Y[1], &u_Y[2], + &position[0], &position[1], &position[2] ); // Calculate the laser amplitude to be emitted, // at the position of the emission plane if (profile == laser_t::Gaussian) { - warpx_gaussian_laser( &np, plane_Xp.data(), plane_Yp.data(), + warpx_gaussian_laser( &np, plane_Xp.dataPtr(), plane_Yp.dataPtr(), &t_lab, &wavelength, &e_max, &profile_waist, &profile_duration, - &profile_t_peak, &profile_focal_distance, amplitude_E.data(), + &profile_t_peak, &profile_focal_distance, amplitude_E.dataPtr(), &zeta, &beta, &phi2, &theta_stc ); } if (profile == laser_t::Harris) { - warpx_harris_laser( &np, plane_Xp.data(), plane_Yp.data(), + warpx_harris_laser( &np, plane_Xp.dataPtr(), plane_Yp.dataPtr(), &t, &wavelength, &e_max, &profile_waist, &profile_duration, - &profile_focal_distance, amplitude_E.data() ); + &profile_focal_distance, amplitude_E.dataPtr() ); } if (profile == laser_t::parse_field_function) { - parse_function_laser( &np, plane_Xp.data(), plane_Yp.data(), &t, - amplitude_E.data(), parser_instance_number ); + parse_function_laser( &np, plane_Xp.dataPtr(), plane_Yp.dataPtr(), &t, + amplitude_E.dataPtr(), parser_instance_number ); } - // Calculate the corresponding momentum and position for the particles - for (int i = 0; i < np; ++i) - { - // Calculate the velocity according to the amplitude of E - Real sign_charge = std::copysign( 1.0, wp[i] ); - Real v_over_c = sign_charge * mobility * amplitude_E[i]; - AMREX_ALWAYS_ASSERT_WITH_MESSAGE( v_over_c < 1, - "The laser particles have to move unphysically in order to emit the laser."); - // The velocity is along the laser polarization p_X - Real vx = PhysConst::c * v_over_c * p_X[0]; - Real vy = PhysConst::c * v_over_c * p_X[1]; - Real vz = PhysConst::c * v_over_c * p_X[2]; - // When running in the boosted-frame, their is additional - // velocity along nvec - if (WarpX::gamma_boost > 1.) 
{ - vx -= PhysConst::c * WarpX::beta_boost * nvec[0]; - vy -= PhysConst::c * WarpX::beta_boost * nvec[1]; - vz -= PhysConst::c * WarpX::beta_boost * nvec[2]; - } - // Get the corresponding momenta - giv[i] = std::sqrt(1 - std::pow(v_over_c,2))/WarpX::gamma_boost; - Real gamma = 1./giv[i]; - uxp[i] = gamma * vx; - uyp[i] = gamma * vy; - uzp[i] = gamma * vz; - // Push the the particle positions - xp[i] += vx * dt; -#if (AMREX_SPACEDIM == 3) - yp[i] += vy * dt; -#endif - zp[i] += vz * dt; - } - + update_laser_particle( + &np, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + m_giv[thread_num].dataPtr(), + wp.dataPtr(), amplitude_E.dataPtr(), &p_X[0], &p_X[1], &p_X[2], + &nvec[0], &nvec[1], &nvec[2], &mobility, &dt, + &PhysConst::c, &WarpX::beta_boost, &WarpX::gamma_boost ); BL_PROFILE_VAR_STOP(blp_pxr_pp); // // Current Deposition // - BL_PROFILE_VAR_START(blp_pxr_cd); - Real *jx_ptr, *jy_ptr, *jz_ptr; - const int *jxntot, *jyntot, *jzntot; - Box tbx = convert(pti.tilebox(), WarpX::jx_nodal_flag); - Box tby = convert(pti.tilebox(), WarpX::jy_nodal_flag); - Box tbz = convert(pti.tilebox(), WarpX::jz_nodal_flag); - - const std::array<Real, 3>& xyzmin = xyzmin_tile; - - tbx.grow(ngJ); - tby.grow(ngJ); - tbz.grow(ngJ); - - local_jx.resize(tbx); - local_jy.resize(tby); - local_jz.resize(tbz); - - local_jx = 0.0; - local_jy = 0.0; - local_jz = 0.0; - - jx_ptr = local_jx.dataPtr(); - jy_ptr = local_jy.dataPtr(); - jz_ptr = local_jz.dataPtr(); - - jxntot = local_jx.length(); - jyntot = local_jy.length(); - jzntot = local_jz.length(); - - warpx_current_deposition( - jx_ptr, &ngJ, jxntot, - jy_ptr, &ngJ, jyntot, - jz_ptr, &ngJ, jzntot, - &np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), - giv.data(), wp.data(), &this->charge, - &xyzmin[0], &xyzmin[1], &xyzmin[2], - &dt, &dx[0], &dx[1], &dx[2], - &WarpX::nox,&WarpX::noy,&WarpX::noz, - &lvect,&WarpX::current_deposition_algo); - - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jx), - BL_TO_FORTRAN_3D(jxfab), ncomp); - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jy), - BL_TO_FORTRAN_3D(jyfab), ncomp); - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jz), - BL_TO_FORTRAN_3D(jzfab), ncomp); - - BL_PROFILE_VAR_STOP(blp_pxr_cd); - - if (rho) depositCharge(rho,1); + DepositCurrent(pti, wp, uxp, uyp, uzp, jx, jy, jz, + cjx, cjy, cjz, np_current, np, thread_num, lev, dt); // // copy particle data back // BL_PROFILE_VAR_START(blp_copy); - pti.SetPosition(xp, yp, zp); + pti.SetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]); BL_PROFILE_VAR_STOP(blp_copy); + if (rho) DepositCharge(pti, wp, rho, crho, 1, np_current, np, thread_num, lev); + if (cost) { const Box& tbx = pti.tilebox(); wt = (amrex::second() - wt) / tbx.d_numPts(); - (*cost)[pti].plus(wt, tbx); + FArrayBox* costfab = cost->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box, + { + costfab->plus(wt, work_box); + }); } } } diff --git a/Source/Make.WarpX b/Source/Make.WarpX index 9dfe667e0..e5da17e93 100644 --- a/Source/Make.WarpX +++ b/Source/Make.WarpX @@ -5,6 +5,13 @@ OPENBC_HOME ?= ../../../openbc_poisson USE_MPI = TRUE USE_PARTICLES = TRUE +ifeq ($(USE_GPU),TRUE) + USE_OMP = FALSE + USE_CUDA = TRUE + USE_ACC = TRUE + NVCC_HOST_COMP = gcc +endif + include $(AMREX_HOME)/Tools/GNUMake/Make.defs ifndef USE_PYTHON_MAIN @@ -55,8 +62,13 @@ ifeq ($(USE_PSATD),TRUE) endif USERSuffix += .PSATD DEFINES += -DWARPX_USE_PSATD - DEFINES += 
-DFFTW # PICSAR uses it - LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_omp + ifeq ($(USE_CUDA),TRUE) + DEFINES += -DFFTW -DCUDA_FFT=1 # PICSAR uses it + LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads -lcufft + else + DEFINES += -DFFTW # PICSAR uses it + LIBRARIES += -lfftw3_mpi -lfftw3 -lfftw3_threads + endif endif ifeq ($(STORE_OLD_PARTICLE_ATTRIBS),TRUE) diff --git a/Source/Make.package b/Source/Make.package index f2adfc07c..f55e12297 100644 --- a/Source/Make.package +++ b/Source/Make.package @@ -1,8 +1,11 @@ ifneq ($(USE_PYTHON_MAIN),TRUE) CEXE_sources += main.cpp +else + CEXE_sources += WarpXWrappers.cpp + CEXE_headers += WarpXWrappers.h endif -CEXE_sources += WarpXWrappers.cpp WarpX_py.cpp +CEXE_sources += WarpX_py.cpp CEXE_sources += WarpX.cpp WarpXInitData.cpp WarpXEvolve.cpp WarpXIO.cpp WarpXProb.cpp WarpXRegrid.cpp CEXE_sources += WarpXTagging.cpp WarpXComm.cpp WarpXMove.cpp WarpXBoostedFrameDiagnostic.cpp @@ -10,7 +13,7 @@ CEXE_sources += WarpXTagging.cpp WarpXComm.cpp WarpXMove.cpp WarpXBoostedFrameDi CEXE_sources += ParticleIO.cpp CEXE_sources += ParticleContainer.cpp WarpXParticleContainer.cpp PhysicalParticleContainer.cpp LaserParticleContainer.cpp RigidInjectedParticleContainer.cpp -CEXE_headers += WarpXWrappers.h WarpX_py.H +CEXE_headers += WarpX_py.H CEXE_headers += WarpX.H WarpX_f.H WarpXConst.H WarpXBoostedFrameDiagnostic.H CEXE_sources += WarpXConst.cpp diff --git a/Source/ParticleContainer.H b/Source/ParticleContainer.H index 026ba90a5..9f732f355 100644 --- a/Source/ParticleContainer.H +++ b/Source/ParticleContainer.H @@ -139,6 +139,8 @@ public: void WriteHeader (std::ostream& os) const; + void SortParticlesByCell (); + void Redistribute (); void RedistributeLocal (const int num_ghost); @@ -160,10 +162,6 @@ public: return r; } - int L_lower_order_in_v() {return l_lower_order_in_v;} - - bool Use_fdtd_nci_corr() {return use_fdtd_nci_corr;} - void GetLabFrameData(const std::string& snapshot_name, const int i_lab, const int direction, const amrex::Real z_old, const amrex::Real z_new, @@ -202,9 +200,5 @@ private: // runtime parameters int nspecies = 1; // physical particles only. If WarpX::use_laser, nspecies+1 == allcontainers.size(). 
- - int l_lower_order_in_v = true; - - bool use_fdtd_nci_corr = false; }; #endif /*WARPX_ParticleContainer_H_*/ diff --git a/Source/ParticleContainer.cpp b/Source/ParticleContainer.cpp index 75c13d793..8aac692d6 100644 --- a/Source/ParticleContainer.cpp +++ b/Source/ParticleContainer.cpp @@ -69,8 +69,8 @@ MultiParticleContainer::ReadParameters () } } } - pp.query("use_fdtd_nci_corr", use_fdtd_nci_corr); - pp.query("l_lower_order_in_v", l_lower_order_in_v); + pp.query("use_fdtd_nci_corr", WarpX::use_fdtd_nci_corr); + pp.query("l_lower_order_in_v", WarpX::l_lower_order_in_v); initialized = true; } } @@ -261,6 +261,14 @@ MultiParticleContainer::GetChargeDensity (int lev, bool local) } void +MultiParticleContainer::SortParticlesByCell () +{ + for (auto& pc : allcontainers) { + pc->SortParticlesByCell(); + } +} + +void MultiParticleContainer::Redistribute () { for (auto& pc : allcontainers) { diff --git a/Source/PhysicalParticleContainer.H b/Source/PhysicalParticleContainer.H index 52e59713b..bd6015437 100644 --- a/Source/PhysicalParticleContainer.H +++ b/Source/PhysicalParticleContainer.H @@ -61,10 +61,10 @@ public: amrex::Real dt) override; virtual void PushPX(WarpXParIter& pti, - amrex::Vector<amrex::Real>& xp, - amrex::Vector<amrex::Real>& yp, - amrex::Vector<amrex::Real>& zp, - amrex::Vector<amrex::Real>& giv, + amrex::Cuda::DeviceVector<amrex::Real>& xp, + amrex::Cuda::DeviceVector<amrex::Real>& yp, + amrex::Cuda::DeviceVector<amrex::Real>& zp, + amrex::Cuda::DeviceVector<amrex::Real>& giv, amrex::Real dt); virtual void PushP (int lev, amrex::Real dt, @@ -80,6 +80,10 @@ public: // Inject particles in Box 'part_box' virtual void AddParticles (int lev); void AddPlasma(int lev, amrex::RealBox part_realbox = amrex::RealBox()); + void AddPlasmaCPU (int lev, amrex::RealBox part_realbox); +#ifdef AMREX_USE_GPU + void AddPlasmaGPU (int lev, amrex::RealBox part_realbox); +#endif void MapParticletoBoostedFrame(amrex::Real& x, amrex::Real& y, amrex::Real& z, std::array<amrex::Real, 3>& u); @@ -105,9 +109,14 @@ protected: bool boost_adjust_transverse_positions = false; bool do_backward_propagation = false; + long NumParticlesToAdd (const amrex::Box& overlap_box, + const amrex::RealBox& overlap_realbox, + const amrex::RealBox& tile_real_box, + const amrex::RealBox& particle_real_box); + int GetRefineFac(const amrex::Real x, const amrex::Real y, const amrex::Real z); std::unique_ptr<amrex::IArrayBox> m_refined_injection_mask = nullptr; - + }; #endif diff --git a/Source/PhysicalParticleContainer.cpp b/Source/PhysicalParticleContainer.cpp index df0ee3b3c..c4fdd4464 100644 --- a/Source/PhysicalParticleContainer.cpp +++ b/Source/PhysicalParticleContainer.cpp @@ -10,6 +10,62 @@ using namespace amrex; +long PhysicalParticleContainer:: +NumParticlesToAdd(const Box& overlap_box, const RealBox& overlap_realbox, + const RealBox& tile_realbox, const RealBox& particle_real_box) +{ + const int lev = 0; + const Geometry& geom = Geom(lev); + int num_ppc = plasma_injector->num_particles_per_cell; + const Real* dx = geom.CellSize(); + + long np = 0; + const auto& overlap_corner = overlap_realbox.lo(); + for (IntVect iv = overlap_box.smallEnd(); iv <= overlap_box.bigEnd(); overlap_box.next(iv)) + { + int fac; + if (injected) { +#if ( AMREX_SPACEDIM == 3 ) + Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0]; + Real y = overlap_corner[1] + (iv[1] + 0.5)*dx[1]; + Real z = overlap_corner[2] + (iv[2] + 0.5)*dx[2]; +#elif ( AMREX_SPACEDIM == 2 ) + Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0]; + Real y = 0; + Real z = 
overlap_corner[1] + (iv[1] + 0.5)*dx[1]; +#endif + fac = GetRefineFac(x, y, z); + } else { + fac = 1.0; + } + + int ref_num_ppc = num_ppc * AMREX_D_TERM(fac, *fac, *fac); + for (int i_part=0; i_part<ref_num_ppc;i_part++) { + std::array<Real, 3> r; + plasma_injector->getPositionUnitBox(r, i_part, fac); +#if ( AMREX_SPACEDIM == 3 ) + Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0]; + Real y = overlap_corner[1] + (iv[1] + r[1])*dx[1]; + Real z = overlap_corner[2] + (iv[2] + r[2])*dx[2]; +#elif ( AMREX_SPACEDIM == 2 ) + Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0]; + Real y = 0; + Real z = overlap_corner[1] + (iv[1] + r[1])*dx[1]; +#endif + // If the new particle is not inside the tile box, + // go to the next generated particle. +#if ( AMREX_SPACEDIM == 3 ) + if(!tile_realbox.contains( RealVect{x, y, z} )) continue; +#elif ( AMREX_SPACEDIM == 2 ) + if(!tile_realbox.contains( RealVect{x, z} )) continue; +#endif + ++np; + } + } + + return np; +} + PhysicalParticleContainer::PhysicalParticleContainer (AmrCore* amr_core, int ispecies, const std::string& name) : WarpXParticleContainer(amr_core, ispecies), @@ -22,7 +78,7 @@ PhysicalParticleContainer::PhysicalParticleContainer (AmrCore* amr_core, int isp ParmParse pp(species_name); pp.query("boost_adjust_transverse_positions", boost_adjust_transverse_positions); - pp.query("do_backward_propagation", do_backward_propagation); + pp.query("do_backward_propagation", do_backward_propagation); } void PhysicalParticleContainer::InitData() @@ -174,9 +230,19 @@ PhysicalParticleContainer::AddParticles (int lev) * but its boundaries need not be aligned with the actual cells of the simulation) */ void -PhysicalParticleContainer::AddPlasma(int lev, RealBox part_realbox) +PhysicalParticleContainer::AddPlasma (int lev, RealBox part_realbox) { - BL_PROFILE("PhysicalParticleContainer::AddPlasma"); +#ifdef AMREX_USE_GPU + AddPlasmaGPU(lev, part_realbox); +#else + AddPlasmaCPU(lev, part_realbox); +#endif +} + +void +PhysicalParticleContainer::AddPlasmaCPU (int lev, RealBox part_realbox) +{ + BL_PROFILE("PhysicalParticleContainer::AddPlasmaCPU"); // If no part_realbox is provided, initialize particles in the whole domain const Geometry& geom = Geom(lev); @@ -365,18 +431,264 @@ PhysicalParticleContainer::AddPlasma(int lev, RealBox part_realbox) attribs[PIdx::uzold] = u[2]; #endif - AddOneParticle(lev, grid_id, tile_id, x, y, z, attribs); + AddOneParticle(lev, grid_id, tile_id, x, y, z, attribs); } } if (cost) { - wt = (amrex::second() - wt) / tile_box.d_numPts(); - (*cost)[mfi].plus(wt, tile_box); + wt = (amrex::second() - wt) / tile_box.d_numPts(); + FArrayBox* costfab = cost->fabPtr(mfi); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, work_box, + { + costfab->plus(wt, work_box); + }); } } } } +#ifdef AMREX_USE_GPU +void +PhysicalParticleContainer::AddPlasmaGPU (int lev, RealBox part_realbox) +{ + BL_PROFILE("PhysicalParticleContainer::AddPlasmaGPU"); + + // If no part_realbox is provided, initialize particles in the whole domain + const Geometry& geom = Geom(lev); + if (!part_realbox.ok()) part_realbox = geom.ProbDomain(); + + int num_ppc = plasma_injector->num_particles_per_cell; + + const Real* dx = geom.CellSize(); + + Real scale_fac; +#if AMREX_SPACEDIM==3 + scale_fac = dx[0]*dx[1]*dx[2]/num_ppc; +#elif AMREX_SPACEDIM==2 + scale_fac = dx[0]*dx[1]/num_ppc; +#endif + +#ifdef _OPENMP + // First touch all tiles in the map in serial + for (MFIter mfi = MakeMFIter(lev); mfi.isValid(); ++mfi) { + const int grid_id = mfi.index(); + const int tile_id = 
mfi.LocalTileIndex(); + GetParticles(lev)[std::make_pair(grid_id, tile_id)]; + } +#endif + + MultiFab* cost = WarpX::getCosts(lev); + + if ( (not m_refined_injection_mask) and WarpX::do_moving_window) + { + Box mask_box = geom.Domain(); + mask_box.setSmall(WarpX::moving_window_dir, 0); + mask_box.setBig(WarpX::moving_window_dir, 0); + m_refined_injection_mask.reset( new IArrayBox(mask_box)); + m_refined_injection_mask->setVal(-1); + } + + MFItInfo info; + if (do_tiling) { + info.EnableTiling(tile_size); + } + info.SetDynamic(true); + +#ifdef _OPENMP +#pragma omp parallel if (not WarpX::serialize_ics) +#endif + { + std::array<Real,PIdx::nattribs> attribs; + attribs.fill(0.0); + + // Loop through the tiles + for (MFIter mfi = MakeMFIter(lev, info); mfi.isValid(); ++mfi) { + + Real wt = amrex::second(); + + const Box& tile_box = mfi.tilebox(); + const RealBox tile_realbox = WarpX::getRealBox(tile_box, lev); + + // Find the cells of part_box that overlap with tile_realbox + // If there is no overlap, just go to the next tile in the loop + RealBox overlap_realbox; + Box overlap_box; + Real ncells_adjust; + bool no_overlap = 0; + + for (int dir=0; dir<AMREX_SPACEDIM; dir++) { + if ( tile_realbox.lo(dir) <= part_realbox.hi(dir) ) { + ncells_adjust = std::floor( (tile_realbox.lo(dir) - part_realbox.lo(dir))/dx[dir] ); + overlap_realbox.setLo( dir, part_realbox.lo(dir) + std::max(ncells_adjust, 0.) * dx[dir]); + } else { + no_overlap = 1; break; + } + if ( tile_realbox.hi(dir) >= part_realbox.lo(dir) ) { + ncells_adjust = std::floor( (part_realbox.hi(dir) - tile_realbox.hi(dir))/dx[dir] ); + overlap_realbox.setHi( dir, part_realbox.hi(dir) - std::max(ncells_adjust, 0.) * dx[dir]); + } else { + no_overlap = 1; break; + } + // Count the number of cells in this direction in overlap_realbox + overlap_box.setSmall( dir, 0 ); + overlap_box.setBig( dir, + int( round((overlap_realbox.hi(dir)-overlap_realbox.lo(dir))/dx[dir] )) - 1); + } + if (no_overlap == 1) { + continue; // Go to the next tile + } + + const int grid_id = mfi.index(); + const int tile_id = mfi.LocalTileIndex(); + + Cuda::HostVector<ParticleType> host_particles; + std::array<Cuda::HostVector<Real>, PIdx::nattribs> host_attribs; + + // Loop through the cells of overlap_box and inject + // the corresponding particles + const auto& overlap_corner = overlap_realbox.lo(); + for (IntVect iv = overlap_box.smallEnd(); iv <= overlap_box.bigEnd(); overlap_box.next(iv)) + { + int fac; + if (injected) { +#if ( AMREX_SPACEDIM == 3 ) + Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0]; + Real y = overlap_corner[1] + (iv[1] + 0.5)*dx[1]; + Real z = overlap_corner[2] + (iv[2] + 0.5)*dx[2]; +#elif ( AMREX_SPACEDIM == 2 ) + Real x = overlap_corner[0] + (iv[0] + 0.5)*dx[0]; + Real y = 0; + Real z = overlap_corner[1] + (iv[1] + 0.5)*dx[1]; +#endif + fac = GetRefineFac(x, y, z); + } else { + fac = 1.0; + } + + int ref_num_ppc = num_ppc * AMREX_D_TERM(fac, *fac, *fac); + for (int i_part=0; i_part<ref_num_ppc;i_part++) { + std::array<Real, 3> r; + plasma_injector->getPositionUnitBox(r, i_part, fac); +#if ( AMREX_SPACEDIM == 3 ) + Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0]; + Real y = overlap_corner[1] + (iv[1] + r[1])*dx[1]; + Real z = overlap_corner[2] + (iv[2] + r[2])*dx[2]; +#elif ( AMREX_SPACEDIM == 2 ) + Real x = overlap_corner[0] + (iv[0] + r[0])*dx[0]; + Real y = 0; + Real z = overlap_corner[1] + (iv[1] + r[1])*dx[1]; +#endif + // If the new particle is not inside the tile box, + // go to the next generated particle. 
+#if ( AMREX_SPACEDIM == 3 ) + if(!tile_realbox.contains( RealVect{x, y, z} )) continue; +#elif ( AMREX_SPACEDIM == 2 ) + if(!tile_realbox.contains( RealVect{x, z} )) continue; +#endif + + Real dens; + std::array<Real, 3> u; + if (WarpX::gamma_boost == 1.){ + // Lab-frame simulation + // If the particle is not within the species's + // xmin, xmax, ymin, ymax, zmin, zmax, go to + // the next generated particle. + if (!plasma_injector->insideBounds(x, y, z)) continue; + plasma_injector->getMomentum(u, x, y, z); + dens = plasma_injector->getDensity(x, y, z); + } else { + // Boosted-frame simulation + Real c = PhysConst::c; + Real gamma_boost = WarpX::gamma_boost; + Real beta_boost = WarpX::beta_boost; + // Since the user provides the density distribution + // at t_lab=0 and in the lab-frame coordinates, + // we need to find the lab-frame position of this + // particle at t_lab=0, from its boosted-frame coordinates + // Assuming ballistic motion, this is given by: + // z0_lab = gamma*( z_boost*(1-beta*betaz_lab) - ct_boost*(betaz_lab-beta) ) + // where betaz_lab is the speed of the particle in the lab frame + // + // In order for this equation to be solvable, betaz_lab + // is explicitly assumed to have no dependency on z0_lab + plasma_injector->getMomentum(u, x, y, 0.); // No z0_lab dependency + // At this point u is the lab-frame momentum + // => Apply the above formula for z0_lab + Real gamma_lab = std::sqrt( 1 + (u[0]*u[0] + u[1]*u[1] + u[2]*u[2])/(c*c) ); + Real betaz_lab = u[2]/gamma_lab/c; + Real t = WarpX::GetInstance().gett_new(lev); + Real z0_lab = gamma_boost * ( z*(1-beta_boost*betaz_lab) - c*t*(betaz_lab-beta_boost) ); + // If the particle is not within the lab-frame zmin, zmax, etc. + // go to the next generated particle. + if (!plasma_injector->insideBounds(x, y, z0_lab)) continue; + // call `getDensity` with lab-frame parameters + dens = plasma_injector->getDensity(x, y, z0_lab); + // At this point u and dens are the lab-frame quantities + // => Perform Lorentz transform + dens = gamma_boost * dens * ( 1 - beta_boost*betaz_lab ); + u[2] = gamma_boost * ( u[2] -beta_boost*c*gamma_lab ); + } + attribs[PIdx::w ] = dens * scale_fac / (AMREX_D_TERM(fac, *fac, *fac)); + attribs[PIdx::ux] = u[0]; + attribs[PIdx::uy] = u[1]; + attribs[PIdx::uz] = u[2]; + +#ifdef WARPX_STORE_OLD_PARTICLE_ATTRIBS + attribs[PIdx::xold] = x; + attribs[PIdx::yold] = y; + attribs[PIdx::zold] = z; + + attribs[PIdx::uxold] = u[0]; + attribs[PIdx::uyold] = u[1]; + attribs[PIdx::uzold] = u[2]; +#endif + + ParticleType p; + p.id() = ParticleType::NextID(); + p.cpu() = ParallelDescriptor::MyProc(); +#if (AMREX_SPACEDIM == 3) + p.pos(0) = x; + p.pos(1) = y; + p.pos(2) = z; +#elif (AMREX_SPACEDIM == 2) + p.pos(0) = x; + p.pos(1) = z; +#endif + + host_particles.push_back(p); + for (int kk = 0; kk < PIdx::nattribs; ++kk) + host_attribs[kk].push_back(attribs[kk]); + } + } + + auto& particle_tile = GetParticles(lev)[std::make_pair(grid_id,tile_id)]; + auto old_size = particle_tile.GetArrayOfStructs().size(); + auto new_size = old_size + host_particles.size(); + particle_tile.resize(new_size); + + thrust::copy(host_particles.begin(), + host_particles.end(), + particle_tile.GetArrayOfStructs().begin() + old_size); + + for (int kk = 0; kk < PIdx::nattribs; ++kk) { + thrust::copy(host_attribs[kk].begin(), + host_attribs[kk].end(), + particle_tile.GetStructOfArrays().GetRealData(kk).begin() + old_size); + } + + if (cost) { + wt = (amrex::second() - wt) / tile_box.d_numPts(); + FArrayBox* costfab = cost->fabPtr(mfi); + 
AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tile_box, work_box, + { + costfab->plus(wt, work_box); + }); + } + } + } +} +#endif + #ifdef WARPX_DO_ELECTROSTATIC void PhysicalParticleContainer:: @@ -425,10 +737,10 @@ FieldGatherES (const amrex::Vector<std::array<std::unique_ptr<amrex::MultiFab>, const FArrayBox& ezfab = (*E[lev][2])[pti]; #endif - WRPX_INTERPOLATE_CIC(particles.data(), nstride, np, - Exp.data(), Eyp.data(), + WRPX_INTERPOLATE_CIC(particles.dataPtr(), nstride, np, + Exp.dataPtr(), Eyp.dataPtr(), #if AMREX_SPACEDIM == 3 - Ezp.data(), + Ezp.dataPtr(), #endif exfab.dataPtr(), eyfab.dataPtr(), #if AMREX_SPACEDIM == 3 @@ -495,10 +807,10 @@ FieldGatherES (const amrex::Vector<std::array<std::unique_ptr<amrex::MultiFab>, #endif if (lev == 0) { - WRPX_INTERPOLATE_CIC(particles.data(), nstride, np, - Exp.data(), Eyp.data(), + WRPX_INTERPOLATE_CIC(particles.dataPtr(), nstride, np, + Exp.dataPtr(), Eyp.dataPtr(), #if AMREX_SPACEDIM == 3 - Ezp.data(), + Ezp.dataPtr(), #endif exfab.dataPtr(), eyfab.dataPtr(), #if AMREX_SPACEDIM == 3 @@ -515,10 +827,10 @@ FieldGatherES (const amrex::Vector<std::array<std::unique_ptr<amrex::MultiFab>, const Box& coarse_box = coarsened_fine_BA[pti]; const Real* coarse_dx = Geom(0).CellSize(); - WRPX_INTERPOLATE_CIC_TWO_LEVELS(particles.data(), nstride, np, - Exp.data(), Eyp.data(), + WRPX_INTERPOLATE_CIC_TWO_LEVELS(particles.dataPtr(), nstride, np, + Exp.dataPtr(), Eyp.dataPtr(), #if AMREX_SPACEDIM == 3 - Ezp.data(), + Ezp.dataPtr(), #endif exfab.dataPtr(), eyfab.dataPtr(), #if AMREX_SPACEDIM == 3 @@ -573,14 +885,14 @@ PhysicalParticleContainer::EvolveES (const Vector<std::array<std::unique_ptr<Mul // // Particle Push // - WRPX_PUSH_LEAPFROG(particles.data(), nstride, np, - uxp.data(), uyp.data(), + WRPX_PUSH_LEAPFROG(particles.dataPtr(), nstride, np, + uxp.dataPtr(), uyp.dataPtr(), #if AMREX_SPACEDIM == 3 - uzp.data(), + uzp.dataPtr(), #endif - Exp.data(), Eyp.data(), + Exp.dataPtr(), Eyp.dataPtr(), #if AMREX_SPACEDIM == 3 - Ezp.data(), + Ezp.dataPtr(), #endif &this->charge, &this->mass, &dt, prob_domain.lo(), prob_domain.hi()); @@ -607,7 +919,7 @@ PhysicalParticleContainer::FieldGather (int lev, #pragma omp parallel #endif { - Vector<Real> xp, yp, zp; + Cuda::DeviceVector<Real> xp, yp, zp; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -653,12 +965,14 @@ PhysicalParticleContainer::FieldGather (int lev, // Field Gather // const int ll4symtry = false; - const int l_lower_order_in_v = warpx_l_lower_order_in_v(); long lvect_fieldgathe = 64; warpx_geteb_energy_conserving( - &np, xp.data(), yp.data(), zp.data(), - Exp.data(),Eyp.data(),Ezp.data(), - Bxp.data(),Byp.data(),Bzp.data(), + &np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), + Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(), + Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(), ixyzmin, &xyzmin[0], &xyzmin[1], &xyzmin[2], &dx[0], &dx[1], &dx[2], @@ -669,13 +983,17 @@ PhysicalParticleContainer::FieldGather (int lev, BL_TO_FORTRAN_ANYD(bxfab), BL_TO_FORTRAN_ANYD(byfab), BL_TO_FORTRAN_ANYD(bzfab), - &ll4symtry, &l_lower_order_in_v, + &ll4symtry, &WarpX::l_lower_order_in_v, &lvect_fieldgathe, &WarpX::field_gathering_algo); if (cost) { const Box& tbx = pti.tilebox(); wt = (amrex::second() - wt) / tbx.d_numPts(); - (*cost)[pti].plus(wt, tbx); + FArrayBox* costfab = cost->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box, + { + costfab->plus(wt, work_box); + }); } } } @@ -696,20 +1014,15 @@ PhysicalParticleContainer::Evolve (int lev, BL_PROFILE_VAR_NS("PPC::Evolve::Copy", blp_copy); 
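One note on the boosted-frame branch of ``AddPlasma`` above: each generated particle is mapped back to its lab-frame position at ``t_lab = 0`` before the density is sampled. Written out for reference (a restatement of the derivation in the code comment, not a new result), the formula follows from the Lorentz transformation of ``(t, z)`` combined with ballistic motion in the lab frame:

::

    % Lorentz transform from the boosted frame to the lab frame:
    z_{lab} = \gamma \left( z_{boost} + \beta c\, t_{boost} \right), \qquad
    t_{lab} = \gamma \left( t_{boost} + \beta z_{boost} / c \right)

    % Ballistic motion in the lab frame: z_{lab} = z_{0,lab} + \beta_z^{lab} c\, t_{lab}
    z_{0,lab} = z_{lab} - \beta_z^{lab} c\, t_{lab}
              = \gamma \left[ z_{boost} \left( 1 - \beta \beta_z^{lab} \right)
                              - c\, t_{boost} \left( \beta_z^{lab} - \beta \right) \right]

As the comment stresses, this is only solvable because ``betaz_lab`` is assumed independent of ``z0_lab``. The density and longitudinal momentum are then transformed back to the boosted frame with ``dens = gamma_boost * dens * (1 - beta_boost*betaz_lab)`` and ``u[2] = gamma_boost * (u[2] - beta_boost*c*gamma_lab)``, exactly as in the hunk above.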
BL_PROFILE_VAR_NS("PICSAR::FieldGather", blp_pxr_fg); BL_PROFILE_VAR_NS("PICSAR::ParticlePush", blp_pxr_pp); - BL_PROFILE_VAR_NS("PICSAR::CurrentDeposition", blp_pxr_cd); - BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate); BL_PROFILE_VAR_NS("PPC::Evolve::partition", blp_partition); - + const std::array<Real,3>& dx = WarpX::CellSize(lev); const std::array<Real,3>& cdx = WarpX::CellSize(std::max(lev-1,0)); const auto& mypc = WarpX::GetInstance().GetPartContainer(); const int nstencilz_fdtd_nci_corr = mypc.nstencilz_fdtd_nci_corr; - // WarpX assumes the same number of guard cells for Jx, Jy, Jz - long ngJ = jx.nGrow(); - - BL_ASSERT(OnSameGrids(lev,Ex)); + BL_ASSERT(OnSameGrids(lev,jx)); MultiFab* cost = WarpX::getCosts(lev); @@ -719,17 +1032,26 @@ PhysicalParticleContainer::Evolve (int lev, bool has_buffer = cEx || cjx; #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel #endif { - Vector<Real> xp, yp, zp, giv; - FArrayBox local_rho, local_jx, local_jy, local_jz; +#ifdef _OPENMP + int thread_num = omp_get_thread_num(); +#else + int thread_num = 0; +#endif + + if (local_rho[thread_num] == nullptr) local_rho[thread_num].reset( new amrex::FArrayBox()); + if (local_jx[thread_num] == nullptr) local_jx[thread_num].reset( new amrex::FArrayBox()); + if (local_jy[thread_num] == nullptr) local_jy[thread_num].reset( new amrex::FArrayBox()); + if (local_jz[thread_num] == nullptr) local_jz[thread_num].reset( new amrex::FArrayBox()); + FArrayBox filtered_Ex, filtered_Ey, filtered_Ez; FArrayBox filtered_Bx, filtered_By, filtered_Bz; std::vector<bool> inexflag; Vector<long> pid; - Vector<Real> tmp; - Vector<ParticleType> particle_tmp; + RealVector tmp; + ParticleVector particle_tmp; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -760,7 +1082,7 @@ PhysicalParticleContainer::Evolve (int lev, FArrayBox const* byfab = &(By[pti]); FArrayBox const* bzfab = &(Bz[pti]); - if (warpx_use_fdtd_nci_corr()) + if (WarpX::use_fdtd_nci_corr) { #if (AMREX_SPACEDIM == 2) const Box& tbox = amrex::grow(pti.tilebox(),{static_cast<int>(WarpX::nox), @@ -823,10 +1145,6 @@ PhysicalParticleContainer::Evolve (int lev, #endif } - FArrayBox& jxfab = jx[pti]; - FArrayBox& jyfab = jy[pti]; - FArrayBox& jzfab = jz[pti]; - Exp.assign(np,0.0); Eyp.assign(np,0.0); Ezp.assign(np,0.0); @@ -834,7 +1152,7 @@ PhysicalParticleContainer::Evolve (int lev, Byp.assign(np,WarpX::B_external[1]); Bzp.assign(np,WarpX::B_external[2]); - giv.resize(np); + m_giv[thread_num].resize(np); long nfine_current = np; long nfine_gather = np; @@ -933,121 +1251,33 @@ PhysicalParticleContainer::Evolve (int lev, // copy data from particle container to temp arrays // BL_PROFILE_VAR_START(blp_copy); - pti.GetPosition(xp, yp, zp); + pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]); BL_PROFILE_VAR_STOP(blp_copy); - const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev); - const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev); - const int* ixyzmin_grid = box.loVect(); - - long lvect = 8; - - auto depositCharge = [&] (MultiFab* rhomf, MultiFab* crhomf, int icomp) - { - long ngRho = rhomf->nGrow(); - Real* data_ptr; - Box tile_box = convert(pti.tilebox(), IntVect::TheUnitVector()); - const int *rholen; - - if (np_current > 0) - { - FArrayBox& rhofab = (*rhomf)[pti]; - const std::array<Real, 3>& xyzmin = xyzmin_tile; - tile_box.grow(ngRho); - local_rho.resize(tile_box); - local_rho = 0.0; - data_ptr = local_rho.dataPtr(); - rholen = local_rho.length(); - -#if (AMREX_SPACEDIM == 3) - 
const long nx = rholen[0]-1-2*ngRho; - const long ny = rholen[1]-1-2*ngRho; - const long nz = rholen[2]-1-2*ngRho; -#else - const long nx = rholen[0]-1-2*ngRho; - const long ny = 0; - const long nz = rholen[1]-1-2*ngRho; -#endif - warpx_charge_deposition(data_ptr, &np_current, - xp.data(), yp.data(), zp.data(), wp.data(), - &this->charge, - &xyzmin[0], &xyzmin[1], &xyzmin[2], - &dx[0], &dx[1], &dx[2], &nx, &ny, &nz, - &ngRho, &ngRho, &ngRho, - &WarpX::nox,&WarpX::noy,&WarpX::noz, - &lvect, &WarpX::charge_deposition_algo); - - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_rho), - BL_TO_FORTRAN_N_3D(rhofab,icomp), ncomp); - } - - if (np_current < np) - { - const IntVect& ref_ratio = WarpX::RefRatio(lev-1); - const Box& ctilebox = amrex::coarsen(pti.tilebox(), ref_ratio); - const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1); - - tile_box = amrex::convert(ctilebox, IntVect::TheUnitVector()); - tile_box.grow(ngRho); - - local_rho.resize(tile_box); - - local_rho = 0.0; - - data_ptr = local_rho.dataPtr(); - rholen = local_rho.length(); - -#if (AMREX_SPACEDIM == 3) - const long nx = rholen[0]-1-2*ngRho; - const long ny = rholen[1]-1-2*ngRho; - const long nz = rholen[2]-1-2*ngRho; -#else - const long nx = rholen[0]-1-2*ngRho; - const long ny = 0; - const long nz = rholen[1]-1-2*ngRho; -#endif - - long ncrse = np - nfine_current; - warpx_charge_deposition(data_ptr, &ncrse, - xp.data() + nfine_current, - yp.data() + nfine_current, - zp.data() + nfine_current, - wp.data() + nfine_current, - &this->charge, - &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2], - &cdx[0], &cdx[1], &cdx[2], &nx, &ny, &nz, - &ngRho, &ngRho, &ngRho, - &WarpX::nox,&WarpX::noy,&WarpX::noz, - &lvect, &WarpX::charge_deposition_algo); - - FArrayBox& crhofab = (*crhomf)[pti]; - - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_rho), - BL_TO_FORTRAN_N_3D(crhofab,icomp), ncomp); - } - }; - - if (rho) depositCharge(rho, crho, 0); - + if (rho) DepositCharge(pti, wp, rho, crho, 0, np_current, np, thread_num, lev); + if (! do_not_push) { // // Field Gather of Aux Data (i.e., the full solution) // const int ll4symtry = false; - const int l_lower_order_in_v = warpx_l_lower_order_in_v(); long lvect_fieldgathe = 64; + const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev); + const int* ixyzmin_grid = box.loVect(); + const long np_gather = (cEx) ? 
nfine_gather : np; BL_PROFILE_VAR_START(blp_pxr_fg); warpx_geteb_energy_conserving( - &np_gather, xp.data(), yp.data(), zp.data(), - Exp.data(),Eyp.data(),Ezp.data(), - Bxp.data(),Byp.data(),Bzp.data(), + &np_gather, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(), + Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(), ixyzmin_grid, &xyzmin_grid[0], &xyzmin_grid[1], &xyzmin_grid[2], &dx[0], &dx[1], &dx[2], @@ -1058,7 +1288,7 @@ PhysicalParticleContainer::Evolve (int lev, BL_TO_FORTRAN_ANYD(*bxfab), BL_TO_FORTRAN_ANYD(*byfab), BL_TO_FORTRAN_ANYD(*bzfab), - &ll4symtry, &l_lower_order_in_v, + &ll4symtry, &WarpX::l_lower_order_in_v, &lvect_fieldgathe, &WarpX::field_gathering_algo); if (np_gather < np) @@ -1075,7 +1305,7 @@ PhysicalParticleContainer::Evolve (int lev, const FArrayBox* cbyfab = &(*cBy)[pti]; const FArrayBox* cbzfab = &(*cBz)[pti]; - if (warpx_use_fdtd_nci_corr()) + if (WarpX::use_fdtd_nci_corr) { #if (AMREX_SPACEDIM == 2) const Box& tbox = amrex::grow(cbox,{static_cast<int>(WarpX::nox), @@ -1139,9 +1369,12 @@ PhysicalParticleContainer::Evolve (int lev, long ncrse = np - nfine_gather; warpx_geteb_energy_conserving( - &ncrse, xp.data()+nfine_gather, yp.data()+nfine_gather, zp.data()+nfine_gather, - Exp.data()+nfine_gather, Eyp.data()+nfine_gather, Ezp.data()+nfine_gather, - Bxp.data()+nfine_gather, Byp.data()+nfine_gather, Bzp.data()+nfine_gather, + &ncrse, + m_xp[thread_num].dataPtr()+nfine_gather, + m_yp[thread_num].dataPtr()+nfine_gather, + m_zp[thread_num].dataPtr()+nfine_gather, + Exp.dataPtr()+nfine_gather, Eyp.dataPtr()+nfine_gather, Ezp.dataPtr()+nfine_gather, + Bxp.dataPtr()+nfine_gather, Byp.dataPtr()+nfine_gather, Bzp.dataPtr()+nfine_gather, cixyzmin_grid, &cxyzmin_grid[0], &cxyzmin_grid[1], &cxyzmin_grid[2], &cdx[0], &cdx[1], &cdx[2], @@ -1152,7 +1385,7 @@ PhysicalParticleContainer::Evolve (int lev, BL_TO_FORTRAN_ANYD(*cbxfab), BL_TO_FORTRAN_ANYD(*cbyfab), BL_TO_FORTRAN_ANYD(*cbzfab), - &ll4symtry, &l_lower_order_in_v, + &ll4symtry, &WarpX::l_lower_order_in_v, &lvect_fieldgathe, &WarpX::field_gathering_algo); } @@ -1162,141 +1395,34 @@ PhysicalParticleContainer::Evolve (int lev, // Particle Push // BL_PROFILE_VAR_START(blp_pxr_pp); - PushPX(pti, xp, yp, zp, giv, dt); + PushPX(pti, m_xp[thread_num], m_yp[thread_num], m_zp[thread_num], + m_giv[thread_num], dt); BL_PROFILE_VAR_STOP(blp_pxr_pp); // - // Current Deposition onto fine patch + // Current Deposition // - - BL_PROFILE_VAR_START(blp_pxr_cd); - Real *jx_ptr, *jy_ptr, *jz_ptr; - const int *jxntot, *jyntot, *jzntot; - Box tbx = convert(pti.tilebox(), WarpX::jx_nodal_flag); - Box tby = convert(pti.tilebox(), WarpX::jy_nodal_flag); - Box tbz = convert(pti.tilebox(), WarpX::jz_nodal_flag); - Box gtbx, gtby, gtbz; - - const std::array<Real, 3>& xyzmin = xyzmin_tile; - - if (np_current > 0) - { - tbx.grow(ngJ); - tby.grow(ngJ); - tbz.grow(ngJ); - - local_jx.resize(tbx); - local_jy.resize(tby); - local_jz.resize(tbz); - - local_jx = 0.0; - local_jy = 0.0; - local_jz = 0.0; - - jx_ptr = local_jx.dataPtr(); - jy_ptr = local_jy.dataPtr(); - jz_ptr = local_jz.dataPtr(); - - jxntot = local_jx.length(); - jyntot = local_jy.length(); - jzntot = local_jz.length(); - - warpx_current_deposition( - jx_ptr, &ngJ, jxntot, - jy_ptr, &ngJ, jyntot, - jz_ptr, &ngJ, jzntot, - &np_current, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), - giv.data(), wp.data(), &this->charge, - &xyzmin[0], &xyzmin[1], &xyzmin[2], - &dt, &dx[0], &dx[1], &dx[2], 
- &WarpX::nox,&WarpX::noy,&WarpX::noz, - &lvect,&WarpX::current_deposition_algo); - - BL_PROFILE_VAR_STOP(blp_pxr_cd); - - BL_PROFILE_VAR_START(blp_accumulate); - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jx), - BL_TO_FORTRAN_3D(jxfab), ncomp); - - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jy), - BL_TO_FORTRAN_3D(jyfab), ncomp); - - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jz), - BL_TO_FORTRAN_3D(jzfab), ncomp); - BL_PROFILE_VAR_STOP(blp_accumulate); - } - - if (np_current < np) - { - const IntVect& ref_ratio = WarpX::RefRatio(lev-1); - const Box& ctilebox = amrex::coarsen(pti.tilebox(),ref_ratio); - const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1); - - tbx = amrex::convert(ctilebox, WarpX::jx_nodal_flag); - tby = amrex::convert(ctilebox, WarpX::jy_nodal_flag); - tbz = amrex::convert(ctilebox, WarpX::jz_nodal_flag); - tbx.grow(ngJ); - tby.grow(ngJ); - tbz.grow(ngJ); - - local_jx.resize(tbx); - local_jy.resize(tby); - local_jz.resize(tbz); - - local_jx = 0.0; - local_jy = 0.0; - local_jz = 0.0; - - jx_ptr = local_jx.dataPtr(); - jy_ptr = local_jy.dataPtr(); - jz_ptr = local_jz.dataPtr(); - - jxntot = local_jx.length(); - jyntot = local_jy.length(); - jzntot = local_jz.length(); - - long ncrse = np - nfine_current; - warpx_current_deposition( - jx_ptr, &ngJ, jxntot, - jy_ptr, &ngJ, jyntot, - jz_ptr, &ngJ, jzntot, - &ncrse, xp.data()+nfine_current, yp.data()+nfine_current, zp.data()+nfine_current, - uxp.data()+nfine_current, uyp.data()+nfine_current, uzp.data()+nfine_current, - giv.data()+nfine_current, wp.data()+nfine_current, &this->charge, - &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2], - &dt, &cdx[0], &cdx[1], &cdx[2], - &WarpX::nox,&WarpX::noy,&WarpX::noz, - &lvect,&WarpX::current_deposition_algo); - - FArrayBox& cjxfab = (*cjx)[pti]; - FArrayBox& cjyfab = (*cjy)[pti]; - FArrayBox& cjzfab = (*cjz)[pti]; - - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jx), - BL_TO_FORTRAN_3D(cjxfab), ncomp); - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jy), - BL_TO_FORTRAN_3D(cjyfab), ncomp); - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_jz), - BL_TO_FORTRAN_3D(cjzfab), ncomp); - } - + DepositCurrent(pti, wp, uxp, uyp, uzp, jx, jy, jz, + cjx, cjy, cjz, np_current, np, thread_num, lev, dt); + // // copy particle data back // BL_PROFILE_VAR_START(blp_copy); - pti.SetPosition(xp, yp, zp); + pti.SetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]); BL_PROFILE_VAR_STOP(blp_copy); } - - if (rho) depositCharge(rho, crho, 1); + + if (rho) DepositCharge(pti, wp, rho, crho, 1, np_current, np, thread_num, lev); if (cost) { const Box& tbx = pti.tilebox(); wt = (amrex::second() - wt) / tbx.d_numPts(); - (*cost)[pti].plus(wt, tbx); + FArrayBox* costfab = cost->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box, + { + costfab->plus(wt, work_box); + }); } } } @@ -1304,8 +1430,10 @@ PhysicalParticleContainer::Evolve (int lev, void PhysicalParticleContainer::PushPX(WarpXParIter& pti, - Vector<Real>& xp, Vector<Real>& yp, Vector<Real>& zp, - Vector<Real>& giv, + Cuda::DeviceVector<Real>& xp, + Cuda::DeviceVector<Real>& yp, + Cuda::DeviceVector<Real>& zp, + Cuda::DeviceVector<Real>& giv, Real dt) { @@ -1330,15 +1458,19 @@ PhysicalParticleContainer::PushPX(WarpXParIter& pti, auto& uypold = attribs[PIdx::uyold]; auto& uzpold = attribs[PIdx::uzold]; - warpx_copy_attribs(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), - xpold.data(), ypold.data(), 
zpold.data(), - uxpold.data(), uypold.data(), uzpold.data()); + warpx_copy_attribs(&np, xp.dataPtr(), yp.dataPtr(), zp.dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + xpold.dataPtr(), ypold.dataPtr(), zpold.dataPtr(), + uxpold.dataPtr(), uypold.dataPtr(), uzpold.dataPtr()); #endif - warpx_particle_pusher(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), giv.data(), + warpx_particle_pusher(&np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + giv.dataPtr(), Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(), Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(), &this->charge, &this->mass, &dt, @@ -1351,6 +1483,8 @@ PhysicalParticleContainer::PushP (int lev, Real dt, const MultiFab& Ex, const MultiFab& Ey, const MultiFab& Ez, const MultiFab& Bx, const MultiFab& By, const MultiFab& Bz) { + BL_PROFILE("PhysicalParticleContainer::PushP"); + if (do_not_push) return; const std::array<Real,3>& dx = WarpX::CellSize(lev); @@ -1359,8 +1493,11 @@ PhysicalParticleContainer::PushP (int lev, Real dt, #pragma omp parallel #endif { - Vector<Real> xp, yp, zp, giv; - +#ifdef _OPENMP + int thread_num = omp_get_thread_num(); +#else + int thread_num = 0; +#endif for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { const Box& box = pti.validbox(); @@ -1394,23 +1531,25 @@ PhysicalParticleContainer::PushP (int lev, Real dt, Byp.assign(np,WarpX::B_external[1]); Bzp.assign(np,WarpX::B_external[2]); - giv.resize(np); + m_giv[thread_num].resize(np); // // copy data from particle container to temp arrays // - pti.GetPosition(xp, yp, zp); + pti.GetPosition(m_xp[thread_num], m_yp[thread_num], m_zp[thread_num]); const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev); const int* ixyzmin_grid = box.loVect(); const int ll4symtry = false; - const int l_lower_order_in_v = true; long lvect_fieldgathe = 64; warpx_geteb_energy_conserving( - &np, xp.data(), yp.data(), zp.data(), - Exp.data(),Eyp.data(),Ezp.data(), - Bxp.data(),Byp.data(),Bzp.data(), + &np, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(), + Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(), ixyzmin_grid, &xyzmin_grid[0], &xyzmin_grid[1], &xyzmin_grid[2], &dx[0], &dx[1], &dx[2], @@ -1421,11 +1560,15 @@ PhysicalParticleContainer::PushP (int lev, Real dt, BL_TO_FORTRAN_ANYD(bxfab), BL_TO_FORTRAN_ANYD(byfab), BL_TO_FORTRAN_ANYD(bzfab), - &ll4symtry, &l_lower_order_in_v, + &ll4symtry, &WarpX::l_lower_order_in_v, &lvect_fieldgathe, &WarpX::field_gathering_algo); - warpx_particle_pusher_momenta(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), giv.data(), + warpx_particle_pusher_momenta(&np, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + m_giv[thread_num].dataPtr(), Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(), Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(), &this->charge, &this->mass, &dt, @@ -1482,7 +1625,7 @@ void PhysicalParticleContainer::GetParticleSlice(const int direction, const Real #pragma omp parallel #endif { - Vector<Real> xp_new, yp_new, zp_new; + RealVector xp_new, yp_new, zp_new; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -1615,7 +1758,7 @@ int PhysicalParticleContainer::GetRefineFac(const Real x, const Real y, const Re stretched_boxes.push_back(bx); } - BoxArray stretched_ba(stretched_boxes.data(), stretched_boxes.size()); + BoxArray 
stretched_ba(stretched_boxes.dataPtr(), stretched_boxes.size()); const int num_ghost = 0; if ( stretched_ba.intersects(Box(iv, iv), num_ghost) ) diff --git a/Source/RigidInjectedParticleContainer.H b/Source/RigidInjectedParticleContainer.H index f2440ad7a..8a9ac9e6e 100644 --- a/Source/RigidInjectedParticleContainer.H +++ b/Source/RigidInjectedParticleContainer.H @@ -43,10 +43,10 @@ public: amrex::Real dt) override; virtual void PushPX(WarpXParIter& pti, - amrex::Vector<amrex::Real>& xp, - amrex::Vector<amrex::Real>& yp, - amrex::Vector<amrex::Real>& zp, - amrex::Vector<amrex::Real>& giv, + amrex::Cuda::DeviceVector<amrex::Real>& xp, + amrex::Cuda::DeviceVector<amrex::Real>& yp, + amrex::Cuda::DeviceVector<amrex::Real>& zp, + amrex::Cuda::DeviceVector<amrex::Real>& giv, amrex::Real dt) override; virtual void PushP (int lev, amrex::Real dt, diff --git a/Source/RigidInjectedParticleContainer.cpp b/Source/RigidInjectedParticleContainer.cpp index 96bdc59a4..eb2c1c4a8 100644 --- a/Source/RigidInjectedParticleContainer.cpp +++ b/Source/RigidInjectedParticleContainer.cpp @@ -73,7 +73,7 @@ RigidInjectedParticleContainer::RemapParticles() // Note that the particles are already in the boosted frame. // This value is saved to advance the particles not injected yet - Vector<Real> xp, yp, zp; + Cuda::DeviceVector<Real> xp, yp, zp; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -134,7 +134,7 @@ RigidInjectedParticleContainer::BoostandRemapParticles() #pragma omp parallel #endif { - Vector<Real> xp, yp, zp; + Cuda::DeviceVector<Real> xp, yp, zp; for (WarpXParIter pti(*this, 0); pti.isValid(); ++pti) { @@ -205,8 +205,10 @@ RigidInjectedParticleContainer::BoostandRemapParticles() void RigidInjectedParticleContainer::PushPX(WarpXParIter& pti, - Vector<Real>& xp, Vector<Real>& yp, Vector<Real>& zp, - Vector<Real>& giv, + Cuda::DeviceVector<Real>& xp, + Cuda::DeviceVector<Real>& yp, + Cuda::DeviceVector<Real>& zp, + Cuda::DeviceVector<Real>& giv, Real dt) { @@ -231,15 +233,16 @@ RigidInjectedParticleContainer::PushPX(WarpXParIter& pti, auto& uypold = attribs[PIdx::uyold]; auto& uzpold = attribs[PIdx::uzold]; - warpx_copy_attribs(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), - xpold.data(), ypold.data(), zpold.data(), - uxpold.data(), uypold.data(), uzpold.data()); + warpx_copy_attribs(&np, xp.dataPtr(), yp.dataPtr(), zp.dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + xpold.dataPtr(), ypold.dataPtr(), zpold.dataPtr(), + uxpold.dataPtr(), uypold.dataPtr(), uzpold.dataPtr()); #endif // Save the position and momenta, making copies - Vector<Real> xp_save, yp_save, zp_save, uxp_save, uyp_save, uzp_save; + Cuda::DeviceVector<Real> xp_save, yp_save, zp_save; + RealVector uxp_save, uyp_save, uzp_save; if (!done_injecting_lev) { xp_save = xp; @@ -265,8 +268,12 @@ RigidInjectedParticleContainer::PushPX(WarpXParIter& pti, } } - warpx_particle_pusher(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), giv.data(), + warpx_particle_pusher(&np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + giv.dataPtr(), Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(), Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(), &this->charge, &this->mass, &dt, @@ -355,7 +362,7 @@ RigidInjectedParticleContainer::PushP (int lev, Real dt, #pragma omp parallel #endif { - Vector<Real> xp, yp, zp, giv; + Cuda::DeviceVector<Real> xp, yp, zp, giv; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -404,9 +411,12 @@ 
RigidInjectedParticleContainer::PushP (int lev, Real dt, const int l_lower_order_in_v = true; long lvect_fieldgathe = 64; warpx_geteb_energy_conserving( - &np, xp.data(), yp.data(), zp.data(), - Exp.data(),Eyp.data(),Ezp.data(), - Bxp.data(),Byp.data(),Bzp.data(), + &np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), + Exp.dataPtr(),Eyp.dataPtr(),Ezp.dataPtr(), + Bxp.dataPtr(),Byp.dataPtr(),Bzp.dataPtr(), ixyzmin_grid, &xyzmin_grid[0], &xyzmin_grid[1], &xyzmin_grid[2], &dx[0], &dx[1], &dx[2], @@ -425,8 +435,12 @@ RigidInjectedParticleContainer::PushP (int lev, Real dt, auto uyp_save = uyp; auto uzp_save = uzp; - warpx_particle_pusher_momenta(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), giv.data(), + warpx_particle_pusher_momenta(&np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + giv.dataPtr(), Exp.dataPtr(), Eyp.dataPtr(), Ezp.dataPtr(), Bxp.dataPtr(), Byp.dataPtr(), Bzp.dataPtr(), &this->charge, &this->mass, &dt, diff --git a/Source/WarpX.H b/Source/WarpX.H index ec41115cd..08fe657b4 100644 --- a/Source/WarpX.H +++ b/Source/WarpX.H @@ -86,6 +86,9 @@ public: static long noy; static long noz; + static bool use_fdtd_nci_corr; + static int l_lower_order_in_v; + static bool use_laser; static bool use_filter; static bool serialize_ics; @@ -105,6 +108,8 @@ public: static bool do_dynamic_scheduling; static bool refine_plasma; + static int sort_int; + // buffers static int n_field_gather_buffer; static int n_current_deposition_buffer; @@ -139,9 +144,13 @@ public: void EvolveB (int lev, amrex::Real dt); void EvolveF ( amrex::Real dt, DtType dt_type); void EvolveF (int lev, amrex::Real dt, DtType dt_type); + void EvolveB (int lev, PatchType patch_type, amrex::Real dt); + void EvolveE (int lev, PatchType patch_type, amrex::Real dt); + void EvolveF (int lev, PatchType patch_type, amrex::Real dt, DtType dt_type); void DampPML (); void DampPML (int lev); + void DampPML (int lev, PatchType patch_type); void PushParticlesandDepose (int lev, amrex::Real cur_time); void PushParticlesandDepose ( amrex::Real cur_time); @@ -248,11 +257,6 @@ private: void FillBoundaryE (int lev, PatchType patch_type); void FillBoundaryF (int lev, PatchType patch_type); - void EvolveB (int lev, PatchType patch_type, amrex::Real dt); - void EvolveE (int lev, PatchType patch_type, amrex::Real dt); - void EvolveF (int lev, PatchType patch_type, amrex::Real dt, DtType dt_type); - void DampPML (int lev, PatchType patch_type); - void OneStep_nosub (amrex::Real t); void OneStep_sub1 (amrex::Real t); @@ -350,10 +354,18 @@ private: const std::array<const amrex::MultiFab*, 3>& B, const std::array<amrex::Real,3>& dx); + static void ComputeDivB (amrex::MultiFab& divB, int dcomp, + const std::array<const amrex::MultiFab*, 3>& B, + const std::array<amrex::Real,3>& dx, int ngrow); + static void ComputeDivE (amrex::MultiFab& divE, int dcomp, const std::array<const amrex::MultiFab*, 3>& B, const std::array<amrex::Real,3>& dx); + static void ComputeDivE (amrex::MultiFab& divE, int dcomp, + const std::array<const amrex::MultiFab*, 3>& B, + const std::array<amrex::Real,3>& dx, int ngrow); + void SyncCurrent (const std::array<const amrex::MultiFab*,3>& fine, const std::array< amrex::MultiFab*,3>& crse, int ref_ratio); @@ -425,6 +437,13 @@ private: amrex::Vector<std::array< std::unique_ptr<amrex::MultiFab>, 3 > > current_buf; amrex::Vector<std::unique_ptr<amrex::MultiFab> > charge_buf; + amrex::Vector<std::array<std::unique_ptr<amrex::iMultiFab>, 3 > > 
current_fp_owner_masks; + amrex::Vector<std::array<std::unique_ptr<amrex::iMultiFab>, 3 > > current_cp_owner_masks; + + amrex::Vector<std::unique_ptr<amrex::iMultiFab> > rho_fp_owner_masks; + amrex::Vector<std::unique_ptr<amrex::iMultiFab> > rho_cp_owner_masks; + + // div E cleaning int do_dive_cleaning = 0; @@ -582,6 +601,7 @@ private: int insitu_int; int insitu_start; std::string insitu_config; + int insitu_pin_mesh; }; #endif diff --git a/Source/WarpX.cpp b/Source/WarpX.cpp index 44eff9cd4..1828ebcec 100644 --- a/Source/WarpX.cpp +++ b/Source/WarpX.cpp @@ -43,11 +43,16 @@ long WarpX::nox = 1; long WarpX::noy = 1; long WarpX::noz = 1; +bool WarpX::use_fdtd_nci_corr = false; +int WarpX::l_lower_order_in_v = true; + bool WarpX::use_laser = false; bool WarpX::use_filter = false; bool WarpX::serialize_ics = false; bool WarpX::refine_plasma = false; +int WarpX::sort_int = -1; + bool WarpX::do_boosted_frame_diagnostic = false; int WarpX::num_snapshots_lab = std::numeric_limits<int>::lowest(); Real WarpX::dt_snapshots_lab = std::numeric_limits<Real>::lowest(); @@ -91,6 +96,8 @@ int WarpX::n_current_deposition_buffer = -1; WarpX* WarpX::m_instance = nullptr; + + WarpX& WarpX::GetInstance () { @@ -169,6 +176,11 @@ WarpX::WarpX () current_buf.resize(nlevs_max); charge_buf.resize(nlevs_max); + current_fp_owner_masks.resize(nlevs_max); + current_cp_owner_masks.resize(nlevs_max); + rho_fp_owner_masks.resize(nlevs_max); + rho_cp_owner_masks.resize(nlevs_max); + pml.resize(nlevs_max); #ifdef WARPX_DO_ELECTROSTATIC @@ -335,6 +347,7 @@ WarpX::ReadParameters () pp.query("do_dive_cleaning", do_dive_cleaning); pp.query("n_field_gather_buffer", n_field_gather_buffer); pp.query("n_current_deposition_buffer", n_current_deposition_buffer); + pp.query("sort_int", sort_int); pp.query("do_pml", do_pml); pp.query("pml_ncell", pml_ncell); @@ -450,11 +463,13 @@ WarpX::ReadParameters () insitu_start = 0; insitu_int = 0; insitu_config = ""; + insitu_pin_mesh = 0; ParmParse pp("insitu"); pp.query("int", insitu_int); pp.query("start", insitu_start); pp.query("config", insitu_config); + pp.query("pin_mesh", insitu_pin_mesh); } } @@ -492,8 +507,14 @@ WarpX::ClearLevel (int lev) Efield_cax[lev][i].reset(); Bfield_cax[lev][i].reset(); current_buf[lev][i].reset(); + + current_fp_owner_masks[lev][i].reset(); + current_cp_owner_masks[lev][i].reset(); } + rho_fp_owner_masks[lev].reset(); + rho_cp_owner_masks[lev].reset(); + charge_buf[lev].reset(); current_buffer_masks[lev].reset(); @@ -550,7 +571,7 @@ WarpX::AllocLevelData (int lev, const BoxArray& ba, const DistributionMapping& d int ngy = (ngy_tmp % 2) ? ngy_tmp+1 : ngy_tmp; // Always even number int ngz_nonci = (ngz_tmp % 2) ? ngz_tmp+1 : ngz_tmp; // Always even number int ngz; - if (warpx_use_fdtd_nci_corr()) { + if (WarpX::use_fdtd_nci_corr) { int ng = ngz_tmp + (mypc->nstencilz_fdtd_nci_corr-1); ngz = (ng % 2) ? 
ng+1 : ng; } else { @@ -613,16 +634,24 @@ WarpX::AllocLevelData (int lev, const BoxArray& ba, const DistributionMapping& d current_fp[lev][1].reset( new MultiFab(amrex::convert(ba,jy_nodal_flag),dm,1,ngJ)); current_fp[lev][2].reset( new MultiFab(amrex::convert(ba,jz_nodal_flag),dm,1,ngJ)); - if (do_subcycling == 1 && lev == 0) { + const auto& period = Geom(lev).periodicity(); + current_fp_owner_masks[lev][0] = std::move(current_fp[lev][0]->OwnerMask(period)); + current_fp_owner_masks[lev][1] = std::move(current_fp[lev][1]->OwnerMask(period)); + current_fp_owner_masks[lev][2] = std::move(current_fp[lev][2]->OwnerMask(period)); + + if (do_dive_cleaning || plot_rho) + { + rho_fp[lev].reset(new MultiFab(amrex::convert(ba,IntVect::TheUnitVector()),dm,2,ngRho)); + rho_fp_owner_masks[lev] = std::move(rho_fp[lev]->OwnerMask(period)); + } + + if (do_subcycling == 1 && lev == 0) + { current_store[lev][0].reset( new MultiFab(amrex::convert(ba,jx_nodal_flag),dm,1,ngJ)); current_store[lev][1].reset( new MultiFab(amrex::convert(ba,jy_nodal_flag),dm,1,ngJ)); current_store[lev][2].reset( new MultiFab(amrex::convert(ba,jz_nodal_flag),dm,1,ngJ)); } - if (do_dive_cleaning || plot_rho){ - rho_fp[lev].reset(new MultiFab(amrex::convert(ba,IntVect::TheUnitVector()),dm,2,ngRho)); - } - if (do_dive_cleaning) { F_fp[lev].reset (new MultiFab(amrex::convert(ba,IntVect::TheUnitVector()),dm,1, ngF)); @@ -631,6 +660,7 @@ WarpX::AllocLevelData (int lev, const BoxArray& ba, const DistributionMapping& d else { rho_fp[lev].reset(new MultiFab(amrex::convert(ba,IntVect::TheUnitVector()),dm,2,ngRho)); + rho_fp_owner_masks[lev] = std::move(rho_fp[lev]->OwnerMask(period)); } #endif @@ -678,8 +708,14 @@ WarpX::AllocLevelData (int lev, const BoxArray& ba, const DistributionMapping& d current_cp[lev][1].reset( new MultiFab(amrex::convert(cba,jy_nodal_flag),dm,1,ngJ)); current_cp[lev][2].reset( new MultiFab(amrex::convert(cba,jz_nodal_flag),dm,1,ngJ)); + const auto& cperiod = Geom(lev).periodicity(); + current_cp_owner_masks[lev][0] = std::move(current_cp[lev][0]->OwnerMask(cperiod)); + current_cp_owner_masks[lev][1] = std::move(current_cp[lev][1]->OwnerMask(cperiod)); + current_cp_owner_masks[lev][2] = std::move(current_cp[lev][2]->OwnerMask(cperiod)); + if (do_dive_cleaning || plot_rho){ rho_cp[lev].reset(new MultiFab(amrex::convert(cba,IntVect::TheUnitVector()),dm,2,ngRho)); + rho_cp_owner_masks[lev] = std::move(rho_cp[lev]->OwnerMask(cperiod)); } if (do_dive_cleaning) { @@ -689,6 +725,7 @@ WarpX::AllocLevelData (int lev, const BoxArray& ba, const DistributionMapping& d else { rho_cp[lev].reset(new MultiFab(amrex::convert(cba,IntVect::TheUnitVector()),dm,2,ngRho)); + rho_cp_owner_masks[lev] = std::move(rho_cp[lev]->OwnerMask(cperiod)); } #endif } @@ -808,6 +845,26 @@ WarpX::ComputeDivB (MultiFab& divB, int dcomp, } void +WarpX::ComputeDivB (MultiFab& divB, int dcomp, + const std::array<const MultiFab*, 3>& B, + const std::array<Real,3>& dx, int ngrow) +{ +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MFIter mfi(divB, true); mfi.isValid(); ++mfi) + { + Box bx = mfi.growntilebox(ngrow); + WRPX_COMPUTE_DIVB(bx.loVect(), bx.hiVect(), + BL_TO_FORTRAN_N_ANYD(divB[mfi],dcomp), + BL_TO_FORTRAN_ANYD((*B[0])[mfi]), + BL_TO_FORTRAN_ANYD((*B[1])[mfi]), + BL_TO_FORTRAN_ANYD((*B[2])[mfi]), + dx.data()); + } +} + +void WarpX::ComputeDivE (MultiFab& divE, int dcomp, const std::array<const MultiFab*, 3>& E, const std::array<Real,3>& dx) @@ -828,6 +885,26 @@ WarpX::ComputeDivE (MultiFab& divE, int dcomp, } void +WarpX::ComputeDivE (MultiFab& 
divE, int dcomp, + const std::array<const MultiFab*, 3>& E, + const std::array<Real,3>& dx, int ngrow) +{ +#ifdef _OPENMP +#pragma omp parallel +#endif + for (MFIter mfi(divE, true); mfi.isValid(); ++mfi) + { + Box bx = mfi.growntilebox(ngrow); + WRPX_COMPUTE_DIVE(bx.loVect(), bx.hiVect(), + BL_TO_FORTRAN_N_ANYD(divE[mfi],dcomp), + BL_TO_FORTRAN_ANYD((*E[0])[mfi]), + BL_TO_FORTRAN_ANYD((*E[1])[mfi]), + BL_TO_FORTRAN_ANYD((*E[2])[mfi]), + dx.data()); + } +} + +void WarpX::applyFilter (MultiFab& dstmf, const MultiFab& srcmf, int scomp, int dcomp, int ncomp) { ncomp = std::min(ncomp, srcmf.nComp()); diff --git a/Source/WarpXComm.cpp b/Source/WarpXComm.cpp index f225f1015..32b9e80f5 100644 --- a/Source/WarpXComm.cpp +++ b/Source/WarpXComm.cpp @@ -491,16 +491,16 @@ WarpX::SyncCurrent () for (int lev = 0; lev <= finest_level; ++lev) { const auto& period = Geom(lev).periodicity(); - current_fp[lev][0]->OverrideSync(period); - current_fp[lev][1]->OverrideSync(period); - current_fp[lev][2]->OverrideSync(period); + current_fp[lev][0]->OverrideSync(*current_fp_owner_masks[lev][0], period); + current_fp[lev][1]->OverrideSync(*current_fp_owner_masks[lev][1], period); + current_fp[lev][2]->OverrideSync(*current_fp_owner_masks[lev][2],period); } for (int lev = 1; lev <= finest_level; ++lev) { const auto& cperiod = Geom(lev-1).periodicity(); - current_cp[lev][0]->OverrideSync(cperiod); - current_cp[lev][1]->OverrideSync(cperiod); - current_cp[lev][2]->OverrideSync(cperiod); + current_cp[lev][0]->OverrideSync(*current_cp_owner_masks[lev][0], cperiod); + current_cp[lev][1]->OverrideSync(*current_cp_owner_masks[lev][1], cperiod); + current_cp[lev][2]->OverrideSync(*current_cp_owner_masks[lev][2], cperiod); } } @@ -645,12 +645,12 @@ WarpX::SyncRho (amrex::Vector<std::unique_ptr<amrex::MultiFab> >& rhof, for (int lev = 0; lev <= finest_level; ++lev) { const auto& period = Geom(lev).periodicity(); - rhof[lev]->OverrideSync(period); + rhof[lev]->OverrideSync(*rho_fp_owner_masks[lev], period); } for (int lev = 1; lev <= finest_level; ++lev) { const auto& cperiod = Geom(lev-1).periodicity(); - rhoc[lev]->OverrideSync(cperiod); + rhoc[lev]->OverrideSync(*rho_cp_owner_masks[lev], cperiod); } } diff --git a/Source/WarpXEvolve.cpp b/Source/WarpXEvolve.cpp index 7c1629e94..6d917e36d 100644 --- a/Source/WarpXEvolve.cpp +++ b/Source/WarpXEvolve.cpp @@ -155,6 +155,12 @@ WarpX::EvolveEM (int numsteps) mypc->Redistribute(); } + bool to_sort = (sort_int > 0) && ((step+1) % sort_int == 0); + if (to_sort) { + amrex::Print() << "re-sorting particles \n"; + mypc->SortParticlesByCell(); + } + amrex::Print()<< "STEP " << step+1 << " ends." 
<< " TIME = " << cur_time << " DT = " << dt[0] << "\n"; Real walltime_end_step = amrex::second(); @@ -492,9 +498,9 @@ WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt) // Loop through the grids, and over the tiles within each grid #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for ( MFIter mfi(*Bx,true); mfi.isValid(); ++mfi ) + for ( MFIter mfi(*Bx, TilingIfNotGPU()); mfi.isValid(); ++mfi ) { Real wt = amrex::second(); @@ -519,8 +525,12 @@ WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt) if (cost) { Box cbx = mfi.tilebox(IntVect{AMREX_D_DECL(0,0,0)}); if (patch_type == PatchType::coarse) cbx.refine(rr); - wt = (amrex::second() - wt) / cbx.d_numPts(); - (*cost)[mfi].plus(wt, cbx); + wt = (amrex::second() - wt) / cbx.d_numPts();\ + FArrayBox* costfab = cost->fabPtr(mfi); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( cbx, work_box, + { + costfab->plus(wt, work_box); + }); } } @@ -530,9 +540,9 @@ WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt) const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp(); #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for ( MFIter mfi(*pml_B[0],true); mfi.isValid(); ++mfi ) + for ( MFIter mfi(*pml_B[0], TilingIfNotGPU()); mfi.isValid(); ++mfi ) { const Box& tbx = mfi.tilebox(Bx_nodal_flag); const Box& tby = mfi.tilebox(By_nodal_flag); @@ -617,9 +627,9 @@ WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt) // Loop through the grids, and over the tiles within each grid #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for ( MFIter mfi(*Ex,true); mfi.isValid(); ++mfi ) + for ( MFIter mfi(*Ex, TilingIfNotGPU()); mfi.isValid(); ++mfi ) { Real wt = amrex::second(); @@ -662,7 +672,11 @@ WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt) Box cbx = mfi.tilebox(IntVect{AMREX_D_DECL(0,0,0)}); if (patch_type == PatchType::coarse) cbx.refine(rr); wt = (amrex::second() - wt) / cbx.d_numPts(); - (*cost)[mfi].plus(wt, cbx); + FArrayBox* costfab = cost->fabPtr(mfi); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( cbx, work_box, + { + costfab->plus(wt, work_box); + }); } } @@ -674,9 +688,9 @@ WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt) const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp(); const auto& pml_F = (patch_type == PatchType::fine) ? pml[lev]->GetF_fp() : pml[lev]->GetF_cp(); #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for ( MFIter mfi(*pml_E[0],true); mfi.isValid(); ++mfi ) + for ( MFIter mfi(*pml_E[0], TilingIfNotGPU()); mfi.isValid(); ++mfi ) { const Box& tex = mfi.tilebox(Ex_nodal_flag); const Box& tey = mfi.tilebox(Ey_nodal_flag); @@ -775,9 +789,9 @@ WarpX::EvolveF (int lev, PatchType patch_type, Real dt, DtType dt_type) const auto& pml_E = (patch_type == PatchType::fine) ? 
pml[lev]->GetE_fp() : pml[lev]->GetE_cp(); #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for ( MFIter mfi(*pml_F,true); mfi.isValid(); ++mfi ) + for ( MFIter mfi(*pml_F, TilingIfNotGPU()); mfi.isValid(); ++mfi ) { const Box& bx = mfi.tilebox(); WRPX_PUSH_PML_F(bx.loVect(), bx.hiVect(), @@ -821,9 +835,9 @@ WarpX::DampPML (int lev, PatchType patch_type) : pml[lev]->GetMultiSigmaBox_cp(); #ifdef _OPENMP -#pragma omp parallel +#pragma omp parallel if (Gpu::notInLaunchRegion()) #endif - for ( MFIter mfi(*pml_E[0],true); mfi.isValid(); ++mfi ) + for ( MFIter mfi(*pml_E[0], TilingIfNotGPU()); mfi.isValid(); ++mfi ) { const Box& tex = mfi.tilebox(Ex_nodal_flag); const Box& tey = mfi.tilebox(Ey_nodal_flag); diff --git a/Source/WarpXIO.cpp b/Source/WarpXIO.cpp index 70985ac7e..93a359d7c 100644 --- a/Source/WarpXIO.cpp +++ b/Source/WarpXIO.cpp @@ -488,11 +488,11 @@ WarpX::UpdateInSitu () const Vector<const MultiFab*> srcmf(AMREX_SPACEDIM); PackPlotDataPtrs(srcmf, current_fp[lev]); int dcomp = 0; - amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); - amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *current_fp[lev][1], 0, 1); + amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *current_fp[lev][1], 0, 1, ngrow); #endif if (lev == 0) { @@ -503,10 +503,10 @@ WarpX::UpdateInSitu () const dcomp += 3; PackPlotDataPtrs(srcmf, Efield_aux[lev]); - amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); - amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *Efield_aux[lev][1], 0, 1); + amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *Efield_aux[lev][1], 0, 1, ngrow); #endif if (lev == 0) { @@ -517,10 +517,10 @@ WarpX::UpdateInSitu () const dcomp += 3; PackPlotDataPtrs(srcmf, Bfield_aux[lev]); - amrex::average_face_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_face_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); - MultiFab::Copy(mf[lev], *Bfield_aux[lev][1], 0, dcomp+1, 1, ngrow); + MultiFab::Copy(mf[lev], *Bfield_aux[lev][1], 0, dcomp+1, 1, ngrow); #endif if (lev == 0) { @@ -532,16 +532,14 @@ WarpX::UpdateInSitu () const if (plot_part_per_cell) { - MultiFab temp_dat(grids[lev],mf[lev].DistributionMap(),1,0); - temp_dat.setVal(0); + MultiFab temp_dat(grids[lev], mf[lev].DistributionMap(), 1, ngrow); + temp_dat.setVal(0, ngrow); // MultiFab containing number of particles in each cell mypc->Increment(temp_dat, lev); - MultiFab::Copy(mf[lev], temp_dat, 0, dcomp, 1, 0); + MultiFab::Copy(mf[lev], temp_dat, 0, dcomp, 1, ngrow); if (lev == 0) - { varnames.push_back("part_per_cell"); - } dcomp += 1; } @@ -554,14 +552,13 @@ WarpX::UpdateInSitu () const #ifdef _OPENMP #pragma omp parallel #endif - for (MFIter mfi(mf[lev]); mfi.isValid(); ++mfi) { - (mf[lev])[mfi].setVal(static_cast<Real>(npart_in_grid[mfi.index()]), dcomp); - } - if (lev == 0) - { - varnames.push_back("part_per_grid"); - } - dcomp += 1; + for (MFIter mfi(mf[lev]); mfi.isValid(); ++mfi) + (mf[lev])[mfi].setVal(static_cast<Real>(npart_in_grid[mfi.index()]), dcomp); + + if (lev == 0) + varnames.push_back("part_per_grid"); + + dcomp += 1; } if (plot_part_per_proc) @@ -570,30 +567,30 @@ 
WarpX::UpdateInSitu () const #ifdef _OPENMP #pragma omp parallel reduction(+:n_per_proc) #endif - for (MFIter mfi(mf[lev]); mfi.isValid(); ++mfi) { + for (MFIter mfi(mf[lev]); mfi.isValid(); ++mfi) n_per_proc += npart_in_grid[mfi.index()]; - } - mf[lev].setVal(static_cast<Real>(n_per_proc), dcomp,1); + + mf[lev].setVal(static_cast<Real>(n_per_proc), dcomp, ngrow); + if (lev == 0) - { varnames.push_back("part_per_proc"); - } + dcomp += 1; } } if (plot_proc_number) { + Real procid = static_cast<Real>(ParallelDescriptor::MyProc()); #ifdef _OPENMP #pragma omp parallel #endif - for (MFIter mfi(mf[lev]); mfi.isValid(); ++mfi) { - (mf[lev])[mfi].setVal(static_cast<Real>(ParallelDescriptor::MyProc()), dcomp); - } + for (MFIter mfi(mf[lev]); mfi.isValid(); ++mfi) + (mf[lev])[mfi].setVal(procid, dcomp); + if (lev == 0) - { varnames.push_back("proc_number"); - } + dcomp += 1; } @@ -601,56 +598,56 @@ WarpX::UpdateInSitu () const { ComputeDivB(mf[lev], dcomp, {Bfield_aux[lev][0].get(),Bfield_aux[lev][1].get(),Bfield_aux[lev][2].get()}, - WarpX::CellSize(lev)); + WarpX::CellSize(lev), ngrow); if (lev == 0) - { varnames.push_back("divB"); - } + dcomp += 1; } if (plot_dive) { const BoxArray& ba = amrex::convert(boxArray(lev),IntVect::TheUnitVector()); - MultiFab dive(ba,DistributionMap(lev),1,0); + MultiFab dive(ba, DistributionMap(lev), 1, ngrow); + ComputeDivE(dive, 0, {Efield_aux[lev][0].get(), Efield_aux[lev][1].get(), Efield_aux[lev][2].get()}, - WarpX::CellSize(lev)); - amrex::average_node_to_cellcenter(mf[lev], dcomp, dive, 0, 1); + WarpX::CellSize(lev), ngrow); + + amrex::average_node_to_cellcenter(mf[lev], dcomp, dive, 0, 1, ngrow); + if (lev == 0) - { varnames.push_back("divE"); - } + dcomp += 1; } if (plot_rho) { - amrex::average_node_to_cellcenter(mf[lev], dcomp, *rho_fp[lev], 0, 1); + amrex::average_node_to_cellcenter(mf[lev], dcomp, *rho_fp[lev], 0, 1, ngrow); if (lev == 0) - { varnames.push_back("rho"); - } + dcomp += 1; } if (plot_F) { - amrex::average_node_to_cellcenter(mf[lev], dcomp, *F_fp[lev], 0, 1); + amrex::average_node_to_cellcenter(mf[lev], dcomp, *F_fp[lev], 0, 1, ngrow); + if (lev == 0) - { varnames.push_back("F"); - } + dcomp += 1; } if (plot_finepatch) { PackPlotDataPtrs(srcmf, Efield_fp[lev]); - amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); - amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *Efield_fp[lev][1], 0, 1); + amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *Efield_fp[lev][1], 0, 1, ngrow); #endif if (lev == 0) { @@ -661,7 +658,7 @@ WarpX::UpdateInSitu () const dcomp += 3; PackPlotDataPtrs(srcmf, Bfield_fp[lev]); - amrex::average_face_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_face_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); MultiFab::Copy(mf[lev], *Bfield_fp[lev][1], 0, dcomp+1, 1, ngrow); @@ -686,10 +683,10 @@ WarpX::UpdateInSitu () const { std::array<std::unique_ptr<MultiFab>, 3> E = getInterpolatedE(lev); PackPlotDataPtrs(srcmf, E); - amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); - amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *E[1], 0, 1); + amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *E[1], 0, 1, ngrow); #endif } 
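The ``UpdateInSitu`` hunks above thread an ``ngrow`` argument through the cell-centering helpers so that the ghost cells of the in-situ ``MultiFab`` are filled along with the valid region. A minimal sketch of the pattern, using only the signatures visible in this diff (``mf``, ``Efield_aux``, ``dcomp`` and ``lev`` are assumed to be set up as in the surrounding code; the value of ``ngrow`` is illustrative):

::

    const int ngrow = 1;  // number of ghost cells to fill
    amrex::Vector<const amrex::MultiFab*> srcmf(AMREX_SPACEDIM);
    PackPlotDataPtrs(srcmf, Efield_aux[lev]);
    // Average edge-centered E onto cell centers, ghost cells included
    amrex::average_edge_to_cellcenter(mf[lev], dcomp, srcmf, ngrow);
#if (AMREX_SPACEDIM == 2)
    // In 2-D the y-component is node-centered and is averaged separately
    amrex::average_node_to_cellcenter(mf[lev], dcomp+1, *Efield_aux[lev][1], 0, 1, ngrow);
#endif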
if (lev == 0) @@ -709,7 +706,7 @@ WarpX::UpdateInSitu () const { std::array<std::unique_ptr<MultiFab>, 3> B = getInterpolatedB(lev); PackPlotDataPtrs(srcmf, B); - amrex::average_face_to_cellcenter(mf[lev], dcomp, srcmf); + amrex::average_face_to_cellcenter(mf[lev], dcomp, srcmf, ngrow); #if (AMREX_SPACEDIM == 2) MultiFab::Copy(mf[lev], mf[lev], dcomp+1, dcomp+2, 1, ngrow); MultiFab::Copy(mf[lev], *B[1], 0, dcomp+1, 1, ngrow); diff --git a/Source/WarpXInitData.cpp b/Source/WarpXInitData.cpp index a6b63a92d..496b14e7a 100644 --- a/Source/WarpXInitData.cpp +++ b/Source/WarpXInitData.cpp @@ -5,7 +5,6 @@ #include <WarpX.H> #include <WarpX_f.H> -#include <WarpXWrappers.h> #ifdef BL_USE_SENSEI_INSITU #include <AMReX_AmrMeshInSituBridge.H> @@ -34,7 +33,7 @@ WarpX::InitData () ComputePMLFactors(); - if (warpx_use_fdtd_nci_corr()) { + if (WarpX::use_fdtd_nci_corr) { WarpX::InitNCICorrector(); } @@ -51,6 +50,7 @@ WarpX::InitData () insitu_bridge = new amrex::AmrMeshInSituBridge; insitu_bridge->setEnabled(insitu_int > 0 ? 1 : 0); insitu_bridge->setConfig(insitu_config); + insitu_bridge->setPinMesh(insitu_pin_mesh); if (insitu_bridge->initialize()) { amrex::ErrorStream() @@ -153,7 +153,7 @@ WarpX::ComputePMLFactors () void WarpX::InitNCICorrector () { - if (warpx_use_fdtd_nci_corr()) + if (WarpX::use_fdtd_nci_corr) { mypc->fdtd_nci_stencilz_ex.resize(max_level+1); mypc->fdtd_nci_stencilz_by.resize(max_level+1); @@ -161,7 +161,6 @@ WarpX::InitNCICorrector () { const Geometry& gm = Geom(lev); const Real* dx = gm.CellSize(); - const int l_lower_order_in_v = warpx_l_lower_order_in_v(); amrex::Real dz, cdtodz; if (AMREX_SPACEDIM == 3){ dz = dx[2]; @@ -172,7 +171,7 @@ WarpX::InitNCICorrector () WRPX_PXR_NCI_CORR_INIT( (mypc->fdtd_nci_stencilz_ex)[lev].data(), (mypc->fdtd_nci_stencilz_by)[lev].data(), mypc->nstencilz_fdtd_nci_corr, cdtodz, - l_lower_order_in_v); + WarpX::l_lower_order_in_v); } } } diff --git a/Source/WarpXMove.cpp b/Source/WarpXMove.cpp index 401211554..908c70573 100644 --- a/Source/WarpXMove.cpp +++ b/Source/WarpXMove.cpp @@ -40,8 +40,8 @@ WarpX::MoveWindow (bool move_j) Real new_hi[AMREX_SPACEDIM]; const Real* current_lo = geom[0].ProbLo(); const Real* current_hi = geom[0].ProbHi(); - const Real* dx = geom[0].CellSize(); - int num_shift_base = static_cast<int>((moving_window_x - current_lo[dir]) / dx[dir]); + const Real* cdx = geom[0].CellSize(); + int num_shift_base = static_cast<int>((moving_window_x - current_lo[dir]) / cdx[dir]); if (num_shift_base == 0) return 0; @@ -51,8 +51,8 @@ WarpX::MoveWindow (bool move_j) new_lo[i] = current_lo[i]; new_hi[i] = current_hi[i]; } - new_lo[dir] = current_lo[dir] + num_shift_base * dx[dir]; - new_hi[dir] = current_hi[dir] + num_shift_base * dx[dir]; + new_lo[dir] = current_lo[dir] + num_shift_base * cdx[dir]; + new_hi[dir] = current_hi[dir] + num_shift_base * cdx[dir]; RealBox new_box(new_lo, new_hi); Geometry::ProbDomain(new_box); @@ -181,6 +181,7 @@ WarpX::MoveWindow (bool move_j) void WarpX::shiftMF (MultiFab& mf, const Geometry& geom, int num_shift, int dir) { + BL_PROFILE("WarpX::shiftMF()"); const BoxArray& ba = mf.boxArray(); const DistributionMapping& dm = mf.DistributionMap(); const int nc = mf.nComp(); @@ -221,25 +222,24 @@ WarpX::shiftMF (MultiFab& mf, const Geometry& geom, int num_shift, int dir) #endif for (MFIter mfi(tmpmf); mfi.isValid(); ++mfi ) { - FArrayBox& srcfab = tmpmf[mfi]; - - Box outbox = mfi.fabbox(); - outbox &= adjBox; - if (outbox.ok()) { // outbox is the region that the window moved into - srcfab.setVal(0.0, outbox, 0, 
nc); - } - - FArrayBox& dstfab = mf[mfi]; - dstfab.setVal(0.0); - - Box dstBox = dstfab.box(); - - if (num_shift > 0) { - dstBox.growHi(dir, -num_shift); - } else { - dstBox.growLo(dir, num_shift); - } - - dstfab.copy(srcfab, amrex::shift(dstBox,dir,num_shift), 0, dstBox, 0, nc); + FArrayBox* dstfab = mf.fabPtr(mfi); + FArrayBox* srcfab = tmpmf.fabPtr(mfi); + Box outbox = mfi.fabbox(); + outbox &= adjBox; + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(outbox, toutbox, + { + srcfab->setVal(0.0, toutbox, 0, nc); + }); + Box dstBox = mf[mfi].box(); + if (num_shift > 0) { + dstBox.growHi(dir, -num_shift); + } else { + dstBox.growLo(dir, num_shift); + } + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(dstBox, tdstBox, + { + dstfab->setVal(0.0, tdstBox, 0, nc); + dstfab->copy(*srcfab, amrex::shift(tdstBox,dir,num_shift), 0, tdstBox, 0, nc); + }); } } diff --git a/Source/WarpXPML.H b/Source/WarpXPML.H index 0cf367284..7a86bf0da 100644 --- a/Source/WarpXPML.H +++ b/Source/WarpXPML.H @@ -62,6 +62,13 @@ namespace amrex { virtual void destroy (SigmaBox* fab) const final { delete fab; } +#ifdef AMREX_USE_GPU + virtual SigmaBox* createHostAlias (SigmaBox const& src) const final + { + return const_cast<SigmaBox*>(&src); + } + virtual void destroyHostAlias (SigmaBox* fab) const final {} +#endif virtual FabFactory<SigmaBox>* clone () const { return new FabFactory<SigmaBox>(*this); } diff --git a/Source/WarpXParticleContainer.H b/Source/WarpXParticleContainer.H index a2d5c0fac..b879f49d9 100644 --- a/Source/WarpXParticleContainer.H +++ b/Source/WarpXParticleContainer.H @@ -36,27 +36,27 @@ public: WarpXParIter (ContainerType& pc, int level); #if (AMREX_SPACEDIM == 2) - void GetPosition (amrex::Vector<amrex::Real>& x, - amrex::Vector<amrex::Real>& y, - amrex::Vector<amrex::Real>& z) const; - void SetPosition (const amrex::Vector<amrex::Real>& x, - const amrex::Vector<amrex::Real>& y, - const amrex::Vector<amrex::Real>& z); + void GetPosition (amrex::Cuda::DeviceVector<amrex::Real>& x, + amrex::Cuda::DeviceVector<amrex::Real>& y, + amrex::Cuda::DeviceVector<amrex::Real>& z) const; + void SetPosition (const amrex::Cuda::DeviceVector<amrex::Real>& x, + const amrex::Cuda::DeviceVector<amrex::Real>& y, + const amrex::Cuda::DeviceVector<amrex::Real>& z); #endif - const std::array<amrex::Vector<amrex::Real>, PIdx::nattribs>& GetAttribs () const { + const std::array<RealVector, PIdx::nattribs>& GetAttribs () const { return GetStructOfArrays().GetRealData(); } - std::array<amrex::Vector<amrex::Real>, PIdx::nattribs>& GetAttribs () { + std::array<RealVector, PIdx::nattribs>& GetAttribs () { return GetStructOfArrays().GetRealData(); } - const amrex::Vector<amrex::Real>& GetAttribs (int comp) const { + const RealVector& GetAttribs (int comp) const { return GetStructOfArrays().GetRealData(comp); } - amrex::Vector<amrex::Real>& GetAttribs (int comp) { + RealVector& GetAttribs (int comp) { return GetStructOfArrays().GetRealData(comp); } }; @@ -141,6 +141,33 @@ public: bool local = false); std::unique_ptr<amrex::MultiFab> GetChargeDensity(int lev, bool local = false); + virtual void DepositCharge(WarpXParIter& pti, + RealVector& wp, + amrex::MultiFab* rhomf, + amrex::MultiFab* crhomf, + int icomp, + const long np_current, + const long np, + int thread_num, + int lev ); + + virtual void DepositCurrent(WarpXParIter& pti, + RealVector& wp, + RealVector& uxp, + RealVector& uyp, + RealVector& uzp, + amrex::MultiFab& jx, + amrex::MultiFab& jy, + amrex::MultiFab& jz, + amrex::MultiFab* cjx, + amrex::MultiFab* cjy, + amrex::MultiFab* cjz, + const long 
np_current, + const long np, + int thread_num, + int lev, + amrex::Real dt ); + /// /// This returns the total charge for all the particles in this ParticleContainer. /// This is needed when solving Poisson's equation with periodic boundary conditions. @@ -182,6 +209,14 @@ protected: bool deposit_on_main_grid = false; static int do_not_push; + + amrex::Vector<std::unique_ptr<amrex::FArrayBox> > local_rho; + amrex::Vector<std::unique_ptr<amrex::FArrayBox> > local_jx; + amrex::Vector<std::unique_ptr<amrex::FArrayBox> > local_jy; + amrex::Vector<std::unique_ptr<amrex::FArrayBox> > local_jz; + + amrex::Vector<amrex::Cuda::DeviceVector<amrex::Real> > m_xp, m_yp, m_zp, m_giv; + }; #endif diff --git a/Source/WarpXParticleContainer.cpp b/Source/WarpXParticleContainer.cpp index ad6979462..a9699071c 100644 --- a/Source/WarpXParticleContainer.cpp +++ b/Source/WarpXParticleContainer.cpp @@ -18,14 +18,14 @@ WarpXParIter::WarpXParIter (ContainerType& pc, int level) #if (AMREX_SPACEDIM == 2) void -WarpXParIter::GetPosition (Vector<Real>& x, Vector<Real>& y, Vector<Real>& z) const +WarpXParIter::GetPosition (Cuda::DeviceVector<Real>& x, Cuda::DeviceVector<Real>& y, Cuda::DeviceVector<Real>& z) const { amrex::ParIter<0,0,PIdx::nattribs>::GetPosition(x, z); y.resize(x.size(), std::numeric_limits<Real>::quiet_NaN()); } void -WarpXParIter::SetPosition (const Vector<Real>& x, const Vector<Real>& y, const Vector<Real>& z) +WarpXParIter::SetPosition (const Cuda::DeviceVector<Real>& x, const Cuda::DeviceVector<Real>& y, const Cuda::DeviceVector<Real>& z) { amrex::ParIter<0,0,PIdx::nattribs>::SetPosition(x, z); } @@ -40,6 +40,30 @@ WarpXParticleContainer::WarpXParticleContainer (AmrCore* amr_core, int ispecies) } SetParticleSize(); ReadParameters(); + + // Initialize temporary local arrays for charge/current deposition + int num_threads = 1; + #ifdef _OPENMP + #pragma omp parallel + #pragma omp single + num_threads = omp_get_num_threads(); + #endif + local_rho.resize(num_threads); + local_jx.resize(num_threads); + local_jy.resize(num_threads); + local_jz.resize(num_threads); + m_xp.resize(num_threads); + m_yp.resize(num_threads); + m_zp.resize(num_threads); + m_giv.resize(num_threads); + for (int i = 0; i < num_threads; ++i) + { + local_rho[i].reset(nullptr); + local_jx[i].reset(nullptr); + local_jy[i].reset(nullptr); + local_jz[i].reset(nullptr); + } + } void @@ -50,10 +74,14 @@ WarpXParticleContainer::ReadParameters () { ParmParse pp("particles"); - do_tiling = true; // because the default in amrex is false +#ifdef AMREX_USE_GPU + do_tiling = false; // By default, tiling is off on GPU +#else + do_tiling = true; +#endif pp.query("do_tiling", do_tiling); pp.query("do_not_push", do_not_push); - + initialized = true; } } @@ -73,7 +101,7 @@ WarpXParticleContainer::AddOneParticle (int lev, int grid, int tile, const std::array<Real,PIdx::nattribs>& attribs) { auto& particle_tile = GetParticles(lev)[std::make_pair(grid,tile)]; - AddOneParticle(particle_tile, x, y, z, attribs); + AddOneParticle(particle_tile, x, y, z, attribs); } void @@ -92,7 +120,7 @@ WarpXParticleContainer::AddOneParticle (ParticleTileType& particle_tile, p.pos(0) = x; p.pos(1) = z; #endif - + particle_tile.push_back(p); particle_tile.push_back_real(attribs); } @@ -153,16 +181,341 @@ WarpXParticleContainer::AddNParticles (int lev, particle_tile.push_back_real(PIdx::ux, vx + ibegin, vx + iend); particle_tile.push_back_real(PIdx::uy, vy + ibegin, vy + iend); particle_tile.push_back_real(PIdx::uz, vz + ibegin, vz + iend); - + for (int comp = PIdx::uz+1; 
comp < PIdx::nattribs; ++comp) { particle_tile.push_back_real(comp, np, 0.0); } - } + } Redistribute(); } + +void +WarpXParticleContainer::DepositCurrent(WarpXParIter& pti, + RealVector& wp, RealVector& uxp, + RealVector& uyp, RealVector& uzp, + MultiFab& jx, MultiFab& jy, MultiFab& jz, + MultiFab* cjx, MultiFab* cjy, MultiFab* cjz, + const long np_current, const long np, + int thread_num, int lev, Real dt ) +{ + Real *jx_ptr, *jy_ptr, *jz_ptr; + const int *jxntot, *jyntot, *jzntot; + const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev); + const std::array<Real,3>& dx = WarpX::CellSize(lev); + const std::array<Real,3>& cdx = WarpX::CellSize(std::max(lev-1,0)); + const std::array<Real, 3>& xyzmin = xyzmin_tile; + const long lvect = 8; + + BL_PROFILE_VAR_NS("PICSAR::CurrentDeposition", blp_pxr_cd); + BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate); + + Box tbx = convert(pti.tilebox(), WarpX::jx_nodal_flag); + Box tby = convert(pti.tilebox(), WarpX::jy_nodal_flag); + Box tbz = convert(pti.tilebox(), WarpX::jz_nodal_flag); + + // WarpX assumes the same number of guard cells for Jx, Jy, Jz + long ngJ = jx.nGrow(); + + // Deposit current for particles that are not in the current buffers + if (np_current > 0) + { + tbx.grow(ngJ); + tby.grow(ngJ); + tbz.grow(ngJ); + + local_jx[thread_num]->resize(tbx); + local_jy[thread_num]->resize(tby); + local_jz[thread_num]->resize(tbz); + + jx_ptr = local_jx[thread_num]->dataPtr(); + jy_ptr = local_jy[thread_num]->dataPtr(); + jz_ptr = local_jz[thread_num]->dataPtr(); + + FArrayBox* local_jx_ptr = local_jx[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbx, b, + { + local_jx_ptr->setVal(0.0, b, 0, 1); + }); + + FArrayBox* local_jy_ptr = local_jy[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tby, b, + { + local_jy_ptr->setVal(0.0, b, 0, 1); + }); + + FArrayBox* local_jz_ptr = local_jz[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbz, b, + { + local_jz_ptr->setVal(0.0, b, 0, 1); + }); + + jxntot = local_jx[thread_num]->length(); + jyntot = local_jy[thread_num]->length(); + jzntot = local_jz[thread_num]->length(); + + BL_PROFILE_VAR_START(blp_pxr_cd); + warpx_current_deposition( + jx_ptr, &ngJ, jxntot, + jy_ptr, &ngJ, jyntot, + jz_ptr, &ngJ, jzntot, + &np_current, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + m_giv[thread_num].dataPtr(), + wp.dataPtr(), &this->charge, + &xyzmin[0], &xyzmin[1], &xyzmin[2], + &dt, &dx[0], &dx[1], &dx[2], + &WarpX::nox,&WarpX::noy,&WarpX::noz, + &lvect,&WarpX::current_deposition_algo); + BL_PROFILE_VAR_STOP(blp_pxr_cd); + + BL_PROFILE_VAR_START(blp_accumulate); + + FArrayBox const* local_jx_const_ptr = local_jx[thread_num].get(); + FArrayBox* global_jx_ptr = jx.fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbx, thread_bx, + { + global_jx_ptr->atomicAdd(*local_jx_const_ptr, thread_bx, thread_bx, 0, 0, 1); + }); + + FArrayBox const* local_jy_const_ptr = local_jy[thread_num].get(); + FArrayBox* global_jy_ptr = jy.fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tby, thread_bx, + { + global_jy_ptr->atomicAdd(*local_jy_const_ptr, thread_bx, thread_bx, 0, 0, 1); + }); + + FArrayBox const* local_jz_const_ptr = local_jz[thread_num].get(); + FArrayBox* global_jz_ptr = jz.fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbz, thread_bx, + { + global_jz_ptr->atomicAdd(*local_jz_const_ptr, thread_bx, thread_bx, 0, 0, 1); + }); + + BL_PROFILE_VAR_STOP(blp_accumulate); + } + + // Deposit current for
particles that are in the current buffers + if (np_current < np) + { + const IntVect& ref_ratio = WarpX::RefRatio(lev-1); + const Box& ctilebox = amrex::coarsen(pti.tilebox(),ref_ratio); + const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1); + + tbx = amrex::convert(ctilebox, WarpX::jx_nodal_flag); + tby = amrex::convert(ctilebox, WarpX::jy_nodal_flag); + tbz = amrex::convert(ctilebox, WarpX::jz_nodal_flag); + tbx.grow(ngJ); + tby.grow(ngJ); + tbz.grow(ngJ); + + local_jx[thread_num]->resize(tbx); + local_jy[thread_num]->resize(tby); + local_jz[thread_num]->resize(tbz); + + jx_ptr = local_jx[thread_num]->dataPtr(); + jy_ptr = local_jy[thread_num]->dataPtr(); + jz_ptr = local_jz[thread_num]->dataPtr(); + + FArrayBox* local_jx_ptr = local_jx[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbx, b, + { + local_jx_ptr->setVal(0.0, b, 0, 1); + }); + + FArrayBox* local_jy_ptr = local_jy[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tby, b, + { + local_jy_ptr->setVal(0.0, b, 0, 1); + }); + + FArrayBox* local_jz_ptr = local_jz[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbz, b, + { + local_jz_ptr->setVal(0.0, b, 0, 1); + }); + jxntot = local_jx[thread_num]->length(); + jyntot = local_jy[thread_num]->length(); + jzntot = local_jz[thread_num]->length(); + + long ncrse = np - np_current; + BL_PROFILE_VAR_START(blp_pxr_cd); + warpx_current_deposition( + jx_ptr, &ngJ, jxntot, + jy_ptr, &ngJ, jyntot, + jz_ptr, &ngJ, jzntot, + &ncrse, + m_xp[thread_num].dataPtr() +np_current, + m_yp[thread_num].dataPtr() +np_current, + m_zp[thread_num].dataPtr() +np_current, + uxp.dataPtr()+np_current, + uyp.dataPtr()+np_current, + uzp.dataPtr()+np_current, + m_giv[thread_num].dataPtr()+np_current, + wp.dataPtr()+np_current, &this->charge, + &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2], + &dt, &cdx[0], &cdx[1], &cdx[2], + &WarpX::nox,&WarpX::noy,&WarpX::noz, + &lvect,&WarpX::current_deposition_algo); + BL_PROFILE_VAR_STOP(blp_pxr_cd); + + BL_PROFILE_VAR_START(blp_accumulate); + + FArrayBox const* local_jx_const_ptr = local_jx[thread_num].get(); + FArrayBox* global_jx_ptr = cjx->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbx, thread_bx, + { + global_jx_ptr->atomicAdd(*local_jx_const_ptr, thread_bx, thread_bx, 0, 0, 1); + }); + + FArrayBox const* local_jy_const_ptr = local_jy[thread_num].get(); + FArrayBox* global_jy_ptr = cjy->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tby, thread_bx, + { + global_jy_ptr->atomicAdd(*local_jy_const_ptr, thread_bx, thread_bx, 0, 0, 1); + }); + + FArrayBox const* local_jz_const_ptr = local_jz[thread_num].get(); + FArrayBox* global_jz_ptr = cjz->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tbz, thread_bx, + { + global_jz_ptr->atomicAdd(*local_jz_const_ptr, thread_bx, thread_bx, 0, 0, 1); + }); + + BL_PROFILE_VAR_STOP(blp_accumulate); + } +}; + + +void +WarpXParticleContainer::DepositCharge ( WarpXParIter& pti, RealVector& wp, + MultiFab* rhomf, MultiFab* crhomf, int icomp, + const long np_current, + const long np, int thread_num, int lev ) +{ + + BL_PROFILE_VAR_NS("PICSAR::ChargeDeposition", blp_pxr_chd); + BL_PROFILE_VAR_NS("PPC::Evolve::Accumulate", blp_accumulate); + + const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev); + const long lvect = 8; + + long ngRho = rhomf->nGrow(); + Real* data_ptr; + Box tile_box = convert(pti.tilebox(), IntVect::TheUnitVector()); + const int *rholen; + + const std::array<Real,3>& dx = WarpX::CellSize(lev); + const std::array<Real,3>& cdx = 
WarpX::CellSize(std::max(lev-1,0)); + + // Deposit charge for particles that are not in the current buffers + if (np_current > 0) + { + const std::array<Real, 3>& xyzmin = xyzmin_tile; + tile_box.grow(ngRho); + local_rho[thread_num]->resize(tile_box); + FArrayBox* local_rho_ptr = local_rho[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tile_box, b, + { + local_rho_ptr->setVal(0.0, b, 0, 1); + }); + + data_ptr = local_rho[thread_num]->dataPtr(); + rholen = local_rho[thread_num]->length(); +#if (AMREX_SPACEDIM == 3) + const long nx = rholen[0]-1-2*ngRho; + const long ny = rholen[1]-1-2*ngRho; + const long nz = rholen[2]-1-2*ngRho; +#else + const long nx = rholen[0]-1-2*ngRho; + const long ny = 0; + const long nz = rholen[1]-1-2*ngRho; +#endif + BL_PROFILE_VAR_START(blp_pxr_chd); + warpx_charge_deposition(data_ptr, &np_current, + m_xp[thread_num].dataPtr(), + m_yp[thread_num].dataPtr(), + m_zp[thread_num].dataPtr(), + wp.dataPtr(), + &this->charge, + &xyzmin[0], &xyzmin[1], &xyzmin[2], + &dx[0], &dx[1], &dx[2], &nx, &ny, &nz, + &ngRho, &ngRho, &ngRho, + &WarpX::nox,&WarpX::noy,&WarpX::noz, + &lvect, &WarpX::charge_deposition_algo); + BL_PROFILE_VAR_STOP(blp_pxr_chd); + + const int ncomp = 1; + FArrayBox const* local_fab = local_rho[thread_num].get(); + FArrayBox* global_fab = rhomf->fabPtr(pti); + BL_PROFILE_VAR_START(blp_accumulate); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tile_box, tbx, + { + global_fab->atomicAdd(*local_fab, tbx, tbx, 0, icomp, ncomp); + }); + BL_PROFILE_VAR_STOP(blp_accumulate); + } + + // Deposit charge for particles that are in the current buffers + if (np_current < np) + { + const IntVect& ref_ratio = WarpX::RefRatio(lev-1); + const Box& ctilebox = amrex::coarsen(pti.tilebox(), ref_ratio); + const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1); + + tile_box = amrex::convert(ctilebox, IntVect::TheUnitVector()); + tile_box.grow(ngRho); + local_rho[thread_num]->resize(tile_box); + FArrayBox* local_rho_ptr = local_rho[thread_num].get(); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tile_box, b, + { + local_rho_ptr->setVal(0.0, b, 0, 1); + }); + + data_ptr = local_rho[thread_num]->dataPtr(); + rholen = local_rho[thread_num]->length(); +#if (AMREX_SPACEDIM == 3) + const long nx = rholen[0]-1-2*ngRho; + const long ny = rholen[1]-1-2*ngRho; + const long nz = rholen[2]-1-2*ngRho; +#else + const long nx = rholen[0]-1-2*ngRho; + const long ny = 0; + const long nz = rholen[1]-1-2*ngRho; +#endif + + long ncrse = np - np_current; + BL_PROFILE_VAR_START(blp_pxr_chd); + warpx_charge_deposition(data_ptr, &ncrse, + m_xp[thread_num].dataPtr() + np_current, + m_yp[thread_num].dataPtr() + np_current, + m_zp[thread_num].dataPtr() + np_current, + wp.dataPtr() + np_current, + &this->charge, + &cxyzmin_tile[0], &cxyzmin_tile[1], &cxyzmin_tile[2], + &cdx[0], &cdx[1], &cdx[2], &nx, &ny, &nz, + &ngRho, &ngRho, &ngRho, + &WarpX::nox,&WarpX::noy,&WarpX::noz, + &lvect, &WarpX::charge_deposition_algo); + BL_PROFILE_VAR_STOP(blp_pxr_chd); + + const int ncomp = 1; + FArrayBox const* local_fab = local_rho[thread_num].get(); + FArrayBox* global_fab = crhomf->fabPtr(pti); + BL_PROFILE_VAR_START(blp_accumulate); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA(tile_box, tbx, + { + global_fab->atomicAdd(*local_fab, tbx, tbx, 0, icomp, ncomp); + }); + BL_PROFILE_VAR_STOP(blp_accumulate); + } +}; + void WarpXParticleContainer::DepositCharge (Vector<std::unique_ptr<MultiFab> >& rho, bool local) { @@ -172,32 +525,32 @@ WarpXParticleContainer::DepositCharge (Vector<std::unique_ptr<MultiFab> >& rho, // each level 
deposits its own particles const int ng = rho[0]->nGrow(); - for (int lev = 0; lev < num_levels; ++lev) { + for (int lev = 0; lev < num_levels; ++lev) { rho[lev]->setVal(0.0, ng); const auto& gm = m_gdb->Geom(lev); const auto& ba = m_gdb->ParticleBoxArray(lev); const auto& dm = m_gdb->DistributionMap(lev); - + const Real* dx = gm.CellSize(); const Real* plo = gm.ProbLo(); BoxArray nba = ba; nba.surroundingNodes(); - + for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { const Box& box = nba[pti]; - + auto& wp = pti.GetAttribs(PIdx::w); const auto& particles = pti.GetArrayOfStructs(); int nstride = particles.dataShape().first; const long np = pti.numParticles(); - + FArrayBox& rhofab = (*rho[lev])[pti]; - - WRPX_DEPOSIT_CIC(particles.data(), nstride, np, - wp.data(), &this->charge, - rhofab.dataPtr(), box.loVect(), box.hiVect(), + + WRPX_DEPOSIT_CIC(particles.dataPtr(), nstride, np, + wp.dataPtr(), &this->charge, + rhofab.dataPtr(), box.loVect(), box.hiVect(), plo, dx, &ng); } @@ -211,12 +564,12 @@ WarpXParticleContainer::DepositCharge (Vector<std::unique_ptr<MultiFab> >& rho, const DistributionMapping& fine_dm = rho[lev+1]->DistributionMap(); BoxArray coarsened_fine_BA = fine_BA; coarsened_fine_BA.coarsen(m_gdb->refRatio(lev)); - + MultiFab coarsened_fine_data(coarsened_fine_BA, fine_dm, 1, 0); coarsened_fine_data.setVal(0.0); - + IntVect ratio(AMREX_D_DECL(2, 2, 2)); // FIXME - + for (MFIter mfi(coarsened_fine_data); mfi.isValid(); ++mfi) { const Box& bx = mfi.validbox(); const Box& crse_box = coarsened_fine_data[mfi].box(); @@ -225,7 +578,7 @@ WarpXParticleContainer::DepositCharge (Vector<std::unique_ptr<MultiFab> >& rho, coarsened_fine_data[mfi].dataPtr(), crse_box.loVect(), crse_box.hiVect(), (*rho[lev+1])[mfi].dataPtr(), fine_box.loVect(), fine_box.hiVect()); } - + rho[lev]->copy(coarsened_fine_data, m_gdb->Geom(lev).periodicity(), FabArrayBase::ADD); } } @@ -250,19 +603,19 @@ WarpXParticleContainer::GetChargeDensity (int lev, bool local) #pragma omp parallel #endif { - Vector<Real> xp, yp, zp; + Cuda::DeviceVector<Real> xp, yp, zp; FArrayBox local_rho; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { const Box& box = pti.validbox(); - + auto& wp = pti.GetAttribs(PIdx::w); - + const long np = pti.numParticles(); pti.GetPosition(xp, yp, zp); - + const std::array<Real,3>& xyzmin_tile = WarpX::LowerCorner(pti.tilebox(), lev); const std::array<Real,3>& xyzmin_grid = WarpX::LowerCorner(box, lev); @@ -298,26 +651,26 @@ WarpXParticleContainer::GetChargeDensity (int lev, bool local) long nyg = ng; long nzg = ng; long lvect = 8; - + warpx_charge_deposition(data_ptr, - &np, xp.data(), yp.data(), zp.data(), wp.data(), - &this->charge, &xyzmin[0], &xyzmin[1], &xyzmin[2], + &np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), wp.dataPtr(), + &this->charge, &xyzmin[0], &xyzmin[1], &xyzmin[2], &dx[0], &dx[1], &dx[2], &nx, &ny, &nz, &nxg, &nyg, &nzg, &WarpX::nox,&WarpX::noy,&WarpX::noz, &lvect, &WarpX::charge_deposition_algo); - + #ifdef _OPENMP - const Box& fabbox = rhofab.box(); - const int ncomp = 1; - amrex_atomic_accumulate_fab(BL_TO_FORTRAN_3D(local_rho), - BL_TO_FORTRAN_3D(rhofab), ncomp); + rhofab.atomicAdd(local_rho); #endif } - + } if (!local) rho->SumBoundary(gm.periodicity()); - + return rho; } @@ -327,7 +680,7 @@ Real WarpXParticleContainer::sumParticleCharge(bool local) { for (int lev = 0; lev < finestLevel(); ++lev) { - + #ifdef _OPENMP #pragma omp parallel reduction(+:total_charge) #endif @@ -356,7 +709,7 @@ std::array<Real, 3>
WarpXParticleContainer::meanParticleVelocity(bool local) { amrex::Real inv_clight_sq = 1.0/PhysConst::c/PhysConst::c; for (int lev = 0; lev <= finestLevel(); ++lev) { - + #ifdef _OPENMP #pragma omp parallel reduction(+:vx_total, vy_total, vz_total, np_total) #endif @@ -365,9 +718,9 @@ std::array<Real, 3> WarpXParticleContainer::meanParticleVelocity(bool local) { auto& ux = pti.GetAttribs(PIdx::ux); auto& uy = pti.GetAttribs(PIdx::uy); auto& uz = pti.GetAttribs(PIdx::uz); - + np_total += pti.numParticles(); - + for (unsigned long i = 0; i < ux.size(); i++) { Real usq = (ux[i]*ux[i] + uy[i]*uy[i] + uz[i]*uz[i])*inv_clight_sq; Real gaminv = 1.0/std::sqrt(1.0 + usq); @@ -377,14 +730,14 @@ std::array<Real, 3> WarpXParticleContainer::meanParticleVelocity(bool local) { } } } - + if (!local) { ParallelDescriptor::ReduceRealSum(vx_total); ParallelDescriptor::ReduceRealSum(vy_total); ParallelDescriptor::ReduceRealSum(vz_total); ParallelDescriptor::ReduceLongSum(np_total); } - + std::array<Real, 3> mean_v; if (np_total > 0) { mean_v[0] = vx_total / np_total; @@ -415,7 +768,7 @@ Real WarpXParticleContainer::maxParticleVelocity(bool local) { } } } - + if (!local) ParallelDescriptor::ReduceRealMax(max_v); return max_v; } @@ -427,23 +780,23 @@ WarpXParticleContainer::PushXES (Real dt) int num_levels = finestLevel() + 1; - for (int lev = 0; lev < num_levels; ++lev) { + for (int lev = 0; lev < num_levels; ++lev) { const auto& gm = m_gdb->Geom(lev); const RealBox& prob_domain = gm.ProbDomain(); for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { auto& particles = pti.GetArrayOfStructs(); - int nstride = particles.dataShape().first; + int nstride = particles.dataShape().first; const long np = pti.numParticles(); - - auto& attribs = pti.GetAttribs(); + + auto& attribs = pti.GetAttribs(); auto& uxp = attribs[PIdx::ux]; auto& uyp = attribs[PIdx::uy]; auto& uzp = attribs[PIdx::uz]; - - WRPX_PUSH_LEAPFROG_POSITIONS(particles.data(), nstride, np, - uxp.data(), uyp.data(), + + WRPX_PUSH_LEAPFROG_POSITIONS(particles.dataPtr(), nstride, np, + uxp.dataPtr(), uyp.dataPtr(), #if AMREX_SPACEDIM == 3 - uzp.data(), + uzp.dataPtr(), #endif &dt, prob_domain.lo(), prob_domain.hi()); @@ -474,7 +827,7 @@ WarpXParticleContainer::PushX (int lev, Real dt) #pragma omp parallel #endif { - Vector<Real> xp, yp, zp, giv; + Cuda::DeviceVector<Real> xp, yp, zp, giv; for (WarpXParIter pti(*this, lev); pti.isValid(); ++pti) { @@ -485,26 +838,30 @@ WarpXParticleContainer::PushX (int lev, Real dt) auto& uxp = attribs[PIdx::ux]; auto& uyp = attribs[PIdx::uy]; auto& uzp = attribs[PIdx::uz]; - + const long np = pti.numParticles(); - + giv.resize(np); - + // // copy data from particle container to temp arrays // BL_PROFILE_VAR_START(blp_copy); pti.GetPosition(xp, yp, zp); BL_PROFILE_VAR_STOP(blp_copy); - + // // Particle Push // BL_PROFILE_VAR_START(blp_pxr_pp); - warpx_particle_pusher_positions(&np, xp.data(), yp.data(), zp.data(), - uxp.data(), uyp.data(), uzp.data(), giv.data(), &dt); + warpx_particle_pusher_positions(&np, + xp.dataPtr(), + yp.dataPtr(), + zp.dataPtr(), + uxp.dataPtr(), uyp.dataPtr(), uzp.dataPtr(), + giv.dataPtr(), &dt); BL_PROFILE_VAR_STOP(blp_pxr_pp); - + // // copy particle data back // @@ -515,9 +872,12 @@ WarpXParticleContainer::PushX (int lev, Real dt) if (cost) { const Box& tbx = pti.tilebox(); wt = (amrex::second() - wt) / tbx.d_numPts(); - (*cost)[pti].plus(wt, tbx); + FArrayBox* costfab = cost->fabPtr(pti); + AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( tbx, work_box, + { + costfab->plus(wt, work_box); + }); } } } } - 
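The per-thread `local_jx`/`local_jy`/`local_jz` and `local_rho` buffers introduced above all follow the same accumulation idiom: each thread deposits into a private FArrayBox over the grown tile box, then adds it atomically into the global MultiFab. A minimal hedged sketch of that idiom (the function and variable names here are illustrative, not part of the diff)::

    #include <AMReX_FArrayBox.H>
    #include <AMReX_MultiFab.H>

    // Sketch only: accumulate a thread-local deposition buffer into the
    // global fab of the current grid. atomicAdd makes the update safe when
    // several threads (or GPU blocks) write to overlapping guard cells.
    void accumulate_local (amrex::FArrayBox const* local_fab, // per-thread scratch
                           amrex::FArrayBox* global_fab,      // e.g. jx.fabPtr(pti)
                           amrex::Box const& tb)              // grown tile box
    {
        AMREX_LAUNCH_HOST_DEVICE_LAMBDA (tb, b,
        {
            // source component 0 -> destination component 0, one component
            global_fab->atomicAdd(*local_fab, b, b, 0, 0, 1);
        });
    }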
diff --git a/Source/WarpXRegrid.cpp b/Source/WarpXRegrid.cpp index d21e2ad9c..8d7873041 100644 --- a/Source/WarpXRegrid.cpp +++ b/Source/WarpXRegrid.cpp @@ -40,6 +40,7 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa // Fine patch + const auto& period = Geom(lev).periodicity(); for (int idim=0; idim < 3; ++idim) { { @@ -60,8 +61,8 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa const IntVect& ng = current_fp[lev][idim]->nGrowVect(); auto pmf = std::unique_ptr<MultiFab>(new MultiFab(current_fp[lev][idim]->boxArray(), dm, 1, ng)); - // pmf->Redistribute(*current_fp[lev][idim], 0, 0, 1, ng); current_fp[lev][idim] = std::move(pmf); + current_fp_owner_masks[lev][idim] = std::move(current_fp[lev][idim]->OwnerMask(period)); } if (current_store[lev][idim]) { @@ -86,8 +87,8 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa const IntVect& ng = rho_fp[lev]->nGrowVect(); auto pmf = std::unique_ptr<MultiFab>(new MultiFab(rho_fp[lev]->boxArray(), dm, nc, ng)); - // pmf->Redistribute(*rho_fp[lev], 0, 0, nc, ng); rho_fp[lev] = std::move(pmf); + rho_fp_owner_masks[lev] = std::move(rho_fp[lev]->OwnerMask(period)); } // Aux patch @@ -120,6 +121,7 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa // Coarse patch if (lev > 0) { + const auto& cperiod = Geom(lev-1).periodicity(); for (int idim=0; idim < 3; ++idim) { { @@ -138,10 +140,11 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa } { const IntVect& ng = current_cp[lev][idim]->nGrowVect(); - auto pmf = std::unique_ptr<MultiFab>(new MultiFab(current_cp[lev][idim]->boxArray(), - dm, 1, ng)); - // pmf->Redistribute(*current_cp[lev][idim], 0, 0, 1, ng); + auto pmf = std::unique_ptr<MultiFab>( new MultiFab(current_cp[lev][idim]->boxArray(), + dm, 1, ng)); current_cp[lev][idim] = std::move(pmf); + current_cp_owner_masks[lev][idim] = std::move( + current_cp[lev][idim]->OwnerMask(cperiod)); } } @@ -158,8 +161,8 @@ WarpX::RemakeLevel (int lev, Real time, const BoxArray& ba, const DistributionMa const IntVect& ng = rho_cp[lev]->nGrowVect(); auto pmf = std::unique_ptr<MultiFab>(new MultiFab(rho_cp[lev]->boxArray(), dm, nc, ng)); - // pmf->Redistribute(*rho_cp[lev], 0, 0, nc, ng); rho_cp[lev] = std::move(pmf); + rho_cp_owner_masks[lev] = std::move(rho_cp[lev]->OwnerMask(cperiod)); } } diff --git a/Source/WarpXUtil.cpp b/Source/WarpXUtil.cpp index eb0eec92b..4a884330a 100644 --- a/Source/WarpXUtil.cpp +++ b/Source/WarpXUtil.cpp @@ -38,9 +38,8 @@ void ReadBoostedFrameParameters(Real& gamma_boost, Real& beta_boost, void ConvertLabParamsToBoost() { - - Real gamma_boost, beta_boost; - int max_level; + Real gamma_boost = 1., beta_boost = 0.; + int max_level = 0; Vector<int> boost_direction {0,0,0}; ReadBoostedFrameParameters(gamma_boost, beta_boost, boost_direction); diff --git a/Source/WarpXWrappers.cpp b/Source/WarpXWrappers.cpp index 198cd818d..6ee06a7d5 100644 --- a/Source/WarpXWrappers.cpp +++ b/Source/WarpXWrappers.cpp @@ -56,14 +56,12 @@ extern "C" bool warpx_use_fdtd_nci_corr() { - auto & mypc = WarpX::GetInstance().GetPartContainer(); - return mypc.Use_fdtd_nci_corr(); + return WarpX::use_fdtd_nci_corr; } int warpx_l_lower_order_in_v() { - auto & mypc = WarpX::GetInstance().GetPartContainer(); - return mypc.L_lower_order_in_v(); + return WarpX::l_lower_order_in_v; } int warpx_nComps() diff --git a/Source/WarpX_f.H b/Source/WarpX_f.H index e709c1332..578785a14 100644 --- a/Source/WarpX_f.H +++ 
b/Source/WarpX_f.H @@ -180,6 +180,22 @@ extern "C" void parse_function_laser( const long* np, amrex::Real* Xp, amrex::Real* Yp, amrex::Real* t, amrex::Real* amplitude, const int parser_instance_number ); + void calculate_laser_plane_coordinates( const long* np, + amrex::Real* xp, amrex::Real* yp, amrex::Real* zp, + amrex::Real* plane_Xp, amrex::Real* plane_Yp, + amrex::Real* u_Xx, amrex::Real* u_Xy, amrex::Real* u_Xz, + amrex::Real* u_Yx, amrex::Real* u_Yy, amrex::Real* u_Yz, + amrex::Real* positionx, amrex::Real* positiony, amrex::Real* positionz ); + + void update_laser_particle( const long* np, + amrex::Real* xp, amrex::Real* yp, amrex::Real* zp, + amrex::Real* uxp, amrex::Real* uyp, amrex::Real* uzp, + amrex::Real* giv, amrex::Real* wp, amrex::Real* amplitude_E, + amrex::Real* p_Xx, amrex::Real* p_Xy, amrex::Real* p_Xz, + amrex::Real* nvecx, amrex::Real* nvecy, amrex::Real* nvecz, + amrex::Real* mobility, amrex::Real* dt, const amrex::Real* c, + const amrex::Real* beta_boost, const amrex::Real* gamma_boost ); + // Maxwell solver void warpx_push_evec( diff --git a/Source/WarpX_fft.F90 b/Source/WarpX_fft.F90 index 8d697d184..e605b2be0 100644 --- a/Source/WarpX_fft.F90 +++ b/Source/WarpX_fft.F90 @@ -198,7 +198,7 @@ contains call c_f_pointer(fft_data(21), rhof, shp) fft_data(22) = fftw_alloc_complex(sz) call c_f_pointer(fft_data(22), rhooldf, shp) - +!$acc enter data create (exf,eyf,ezf,bxf,byf,bzf,jxf,jyf,jzf,rhof,rhooldf) if (ndata < 22) then call amrex_abort("size of fft_data is too small") end if
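For context, a hedged sketch of how the two prototypes added to WarpX_f.H above could be driven from C++ (the real call site, in the laser particle container, is not part of this diff; the geometry values below are purely illustrative)::

    #include <AMReX_REAL.H>
    #include "WarpX_f.H"

    // Sketch only: project laser "antenna" particles onto the emission
    // plane, then push them. All arrays are assumed device-accessible
    // when the Fortran routines are compiled with OpenACC.
    void push_laser_particles (long np,
                               amrex::Real* xp, amrex::Real* yp, amrex::Real* zp,
                               amrex::Real* uxp, amrex::Real* uyp, amrex::Real* uzp,
                               amrex::Real* giv, amrex::Real* wp,
                               amrex::Real* plane_Xp, amrex::Real* plane_Yp,
                               amrex::Real* amplitude_E, amrex::Real dt)
    {
        // Illustrative geometry: polarization along x, propagation along z,
        // antenna at the origin; WarpX derives these from the laser input block.
        amrex::Real u_Xx = 1., u_Xy = 0., u_Xz = 0.;
        amrex::Real u_Yx = 0., u_Yy = 1., u_Yz = 0.;
        amrex::Real pos_x = 0., pos_y = 0., pos_z = 0.;
        amrex::Real p_Xx = 1., p_Xy = 0., p_Xz = 0.;
        amrex::Real nvecx = 0., nvecy = 0., nvecz = 1.;
        amrex::Real mobility = 0.;   // set from the laser amplitude in WarpX
        amrex::Real c = 299792458., beta_boost = 0., gamma_boost = 1.;

        // Coordinates of each particle in the plane of the laser.
        calculate_laser_plane_coordinates(&np, xp, yp, zp, plane_Xp, plane_Yp,
                                          &u_Xx, &u_Xy, &u_Xz, &u_Yx, &u_Yy, &u_Yz,
                                          &pos_x, &pos_y, &pos_z);

        // (amplitude_E would be filled here, e.g. by warpx_gaussian_laser.)

        // Convert the E amplitude into antenna velocities and push positions.
        update_laser_particle(&np, xp, yp, zp, uxp, uyp, uzp, giv, wp, amplitude_E,
                              &p_Xx, &p_Xy, &p_Xz, &nvecx, &nvecy, &nvecz,
                              &mobility, &dt, &c, &beta_boost, &gamma_boost);
    }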
diff --git a/Source/WarpX_laser.F90 b/Source/WarpX_laser.F90 index a99cec8c7..b08413019 100644 --- a/Source/WarpX_laser.F90 +++ b/Source/WarpX_laser.F90 @@ -1,3 +1,10 @@ +#if defined(AMREX_USE_GPU) && defined(__PGIC__) && (__PGIC__ < 18 || (__PGIC__ == 18 && __PGIC_MINOR__ <= 7)) +#define USE_ACC_LASER_SUBROUTINE +! Define a separate subroutine that contains the OpenACC pragmas +! This is because Fortran subroutines containing OpenACC pragmas +! ignore the `bind(C,` statement, and so cannot directly be called from C++ +! (at least until pgi version 18.7) +#endif module warpx_laser_module @@ -27,6 +34,31 @@ contains stc_exponent, stcfactor complex*16, parameter :: j=cmplx(0., 1.) +#ifdef USE_ACC_LASER_SUBROUTINE + call warpx_gaussian_laser_acc( np, Xp, Yp, t, & + wavelength, e_max, waist, duration, t_peak, f, amplitude, & + zeta, beta, phi2, theta_stc ) + + end subroutine warpx_gaussian_laser + + subroutine warpx_gaussian_laser_acc( np, Xp, Yp, t, & + wavelength, e_max, waist, duration, t_peak, f, amplitude, & + zeta, beta, phi2, theta_stc ) + + integer(c_long), intent(in) :: np + real(amrex_real), intent(in) :: Xp(np),Yp(np) + real(amrex_real), intent(in) :: e_max, t, t_peak, wavelength, duration, f, waist + real(amrex_real), intent(in) :: zeta, beta, phi2, theta_stc + real(amrex_real), intent(inout) :: amplitude(np) + + integer(c_long) :: i + real(amrex_real) :: k0, oscillation_phase, inv_tau2 + complex*16 :: diffract_factor, exp_argument, prefactor, & + inv_complex_waist_2, stretch_factor, & + stc_exponent, stcfactor + complex*16, parameter :: j=cmplx(0., 1.) +#endif + ! This function uses the complex expression of a Gaussian laser ! (Including Gouy phase and laser oscillations) @@ -38,13 +70,13 @@ contains ! laser diffraction, and phase front curvature diffract_factor = 1 + j * f * 2./(k0*waist**2) inv_complex_waist_2 = 1./( waist**2 * diffract_factor ) - + ! Time stretching due to STCs and phi2 complex envelope ! (1 if zeta=0, beta=0, phi2=0) - stretch_factor = 1. + \ - 4*(zeta + beta*f)**2 * (inv_tau2*inv_complex_waist_2) + \ + stretch_factor = 1. + & + 4*(zeta + beta*f)**2 * (inv_tau2*inv_complex_waist_2) + & 2*j*(phi2 - beta**2*k0*f) * inv_tau2 - + ! Amplitude and monochromatic oscillations prefactor = e_max * exp( j * oscillation_phase ) @@ -58,21 +90,29 @@ contains #endif ! Loop through the macroparticles to calculate the proper amplitude + !$acc parallel deviceptr(amplitude, Xp, Yp) + !$acc loop do i = 1, np ! Exp argument for the temporal gaussian envelope + STCs - stc_exponent = 1./stretch_factor * inv_tau2 * \ - (t - t_peak - beta*k0*(Xp(i)*cos(theta_stc) + Yp(i)*sin(theta_stc)) -\ - 2*j*(Xp(i)*cos(theta_stc) + Yp(i)*sin(theta_stc))*( zeta - beta*f ) *\ - inv_complex_waist_2)**2 - ! stcfactor = everything but complex transverse envelope + stc_exponent = 1./stretch_factor * inv_tau2 * & + (t - t_peak - beta*k0*(Xp(i)*cos(theta_stc) + Yp(i)*sin(theta_stc)) - & + 2*j*(Xp(i)*cos(theta_stc) + Yp(i)*sin(theta_stc))*( zeta - beta*f ) * & + inv_complex_waist_2)**2 + ! stcfactor = everything but complex transverse envelope stcfactor = prefactor * exp( - stc_exponent ) ! Exp argument for transverse envelope exp_argument = - ( Xp(i)*Xp(i) + Yp(i)*Yp(i) ) * inv_complex_waist_2 ! stcfactor + transverse envelope amplitude(i) = DREAL( stcfactor * exp( exp_argument ) ) - enddo + enddo + !$acc end loop + !$acc end parallel +#ifdef USE_ACC_LASER_SUBROUTINE + end subroutine warpx_gaussian_laser_acc +#else end subroutine warpx_gaussian_laser +#endif ! Harris function for the laser temporal profile subroutine warpx_harris_laser( np, Xp, Yp, t, & wavelength, e_max, waist, duration, f, amplitude ) @@ -88,6 +128,25 @@ contains real(amrex_real) :: space_envelope, time_envelope, arg_osc, arg_env real(amrex_real) :: omega0, zR, wz, inv_Rz, oscillations, inv_wz_2 +#ifdef USE_ACC_LASER_SUBROUTINE + call warpx_harris_laser_acc( np, Xp, Yp, t, & + wavelength, e_max, waist, duration, f, amplitude ) + + end subroutine warpx_harris_laser + + subroutine warpx_harris_laser_acc( np, Xp, Yp, t, & + wavelength, e_max, waist, duration, f, amplitude ) + + integer(c_long), intent(in) :: np + real(amrex_real), intent(in) :: Xp(np),Yp(np) + real(amrex_real), intent(in) :: e_max, t, wavelength, duration, f, waist + real(amrex_real), intent(inout) :: amplitude(np) + + integer(c_long) :: i + real(amrex_real) :: space_envelope, time_envelope, arg_osc, arg_env + real(amrex_real) :: omega0, zR, wz, inv_Rz, oscillations, inv_wz_2 +#endif + ! This function uses the Harris function as the temporal profile of the pulse omega0 = 2*pi*clight/wavelength zR = pi * waist**2 / wavelength @@ -110,14 +169,22 @@ contains end if ! Loop through the macroparticles to calculate the proper amplitude + !$acc parallel deviceptr(amplitude, Xp, Yp) + !$acc loop do i = 1, np space_envelope = exp(- ( Xp(i)*Xp(i) + Yp(i)*Yp(i) ) * inv_wz_2) arg_osc = omega0*t - omega0/clight*(Xp(i)*Xp(i) + Yp(i)*Yp(i)) * inv_Rz / 2 oscillations = cos(arg_osc) amplitude(i) = e_max * time_envelope * space_envelope * oscillations enddo + !$acc end loop + !$acc end parallel +#ifdef USE_ACC_LASER_SUBROUTINE + end subroutine warpx_harris_laser_acc +#else end subroutine warpx_harris_laser +#endif ! 
Parse function from the input script for the laser temporal profile subroutine parse_function_laser( np, Xp, Yp, t, amplitude, parser_instance_number ) bind(C, name="parse_function_laser") @@ -136,4 +203,135 @@ contains enddo end subroutine parse_function_laser + + subroutine calculate_laser_plane_coordinates(np, xp, yp, zp, & + plane_Xp, plane_Yp, u_Xx, u_Xy, u_Xz, u_Yx, u_Yy, u_Yz, & + positionx, positiony, positionz ) & + bind(C, name="calculate_laser_plane_coordinates") + integer(c_long), intent(in) :: np + real(amrex_real), intent(in) :: xp(np), yp(np), zp(np) + real(amrex_real), intent(in) :: u_Xx, u_Xy, u_Xz + real(amrex_real), intent(in) :: u_Yx, u_Yy, u_Yz + real(amrex_real), intent(in) :: positionx, positiony, positionz + real(amrex_real), intent(out) :: plane_Xp(np), plane_Yp(np) + integer :: ip + +#ifdef USE_ACC_LASER_SUBROUTINE + call calculate_laser_plane_coordinates_acc(np, xp, yp, zp, & + plane_Xp, plane_Yp, u_Xx, u_Xy, u_Xz, u_Yx, u_Yy, u_Yz, & + positionx, positiony, positionz ) + + end subroutine calculate_laser_plane_coordinates + + subroutine calculate_laser_plane_coordinates_acc(np, xp, yp, zp, & + plane_Xp, plane_Yp, u_Xx, u_Xy, u_Xz, u_Yx, u_Yy, u_Yz, & + positionx, positiony, positionz ) + integer(c_long), intent(in) :: np + real(amrex_real), intent(in) :: xp(np), yp(np), zp(np) + real(amrex_real), intent(in) :: u_Xx, u_Xy, u_Xz + real(amrex_real), intent(in) :: u_Yx, u_Yy, u_Yz + real(amrex_real), intent(in) :: positionx, positiony, positionz + real(amrex_real), intent(out) :: plane_Xp(np), plane_Yp(np) + integer :: ip +#endif + + !$acc parallel deviceptr(xp, yp, zp, plane_Xp, plane_Yp) + !$acc loop + do ip = 1, np +#if (BL_SPACEDIM == 3) + plane_Xp(ip) = u_Xx*(xp(ip) - positionx) & + + u_Xy*(yp(ip) - positiony) & + + u_Xz*(zp(ip) - positionz) + plane_Yp(ip) = u_Yx*(xp(ip) - positionx) & + + u_Yy*(yp(ip) - positiony) & + + u_Yz*(zp(ip) - positionz) +#elif (AMREX_SPACEDIM == 2) + plane_Xp(ip) = u_Xx*(xp(ip) - positionx) & + + u_Xz*(zp(ip) - positionz) + plane_Yp(ip) = 0 +#endif + enddo + !$acc end loop + !$acc end parallel + +#ifdef USE_ACC_LASER_SUBROUTINE + end subroutine calculate_laser_plane_coordinates_acc +#else + end subroutine calculate_laser_plane_coordinates +#endif + + subroutine update_laser_particle( np, & + xp, yp, zp, uxp, uyp, uzp, giv, wp, amplitude_E, p_Xx, p_Xy, p_Xz, & + nvecx, nvecy, nvecz, mobility, dt, c, beta_boost, gamma_boost ) & + bind(C, name="update_laser_particle") + + integer(c_long), intent(in) :: np + real(amrex_real), intent(in) :: amplitude_E(np), wp(np) + real(amrex_real), intent(inout) :: xp(np), yp(np), zp(np) + real(amrex_real), intent(out) :: uxp(np), uyp(np), uzp(np), giv(np) + real(amrex_real), intent(in) :: p_Xx, p_Xy, p_Xz + real(amrex_real), intent(in) :: nvecx, nvecy, nvecz + real(amrex_real), intent(in) :: c, beta_boost, gamma_boost, mobility, dt + +#ifdef USE_ACC_LASER_SUBROUTINE + call update_laser_particle_acc( np, & + xp, yp, zp, uxp, uyp, uzp, giv, wp, amplitude_E, p_Xx, p_Xy, p_Xz, & + nvecx, nvecy, nvecz, mobility, dt, c, beta_boost, gamma_boost ) + + end subroutine update_laser_particle + + subroutine update_laser_particle_acc( np, & + xp, yp, zp, uxp, uyp, uzp, giv, wp, amplitude_E, p_Xx, p_Xy, p_Xz, & + nvecx, nvecy, nvecz, mobility, dt, c, beta_boost, gamma_boost ) + + integer(c_long), intent(in) :: np + real(amrex_real), intent(in) :: amplitude_E(np), wp(np) + real(amrex_real), intent(inout) :: xp(np), yp(np), zp(np) + real(amrex_real), intent(out) :: uxp(np), uyp(np), uzp(np), giv(np) + real(amrex_real), 
intent(in) :: p_Xx, p_Xy, p_Xz + real(amrex_real), intent(in) :: nvecx, nvecy, nvecz + real(amrex_real), intent(in) :: c, beta_boost, gamma_boost, mobility, dt +#endif + + real(amrex_real) :: vx, vy, vz, v_over_c, sign_charge, gamma + integer :: ip + + !$acc parallel deviceptr(amplitude_E, wp, xp, yp, zp, uxp, uyp, uzp, giv) + !$acc loop + do ip = 1, np + ! Calculate the velocity according to the amplitude of E + sign_charge = SIGN( 1.0_amrex_real, wp(ip) ) + v_over_c = sign_charge * mobility * amplitude_E(ip) + ! The velocity is along the laser polarization p_X + vx = c * v_over_c * p_Xx + vy = c * v_over_c * p_Xy + vz = c * v_over_c * p_Xz + ! When running in the boosted frame, there is an additional velocity along nvec + if (gamma_boost > 1.) then + vx = vx - c * beta_boost * nvecx + vy = vy - c * beta_boost * nvecy + vz = vz - c * beta_boost * nvecz + endif + ! Get the corresponding momenta + gamma = gamma_boost/sqrt(1 - v_over_c*v_over_c) + giv(ip) = 1./gamma + uxp(ip) = gamma * vx + uyp(ip) = gamma * vy + uzp(ip) = gamma * vz + ! Push the particle positions + xp(ip) = xp(ip) + vx * dt +#if (BL_SPACEDIM == 3) + yp(ip) = yp(ip) + vy * dt +#endif + zp(ip) = zp(ip) + vz * dt + enddo + !$acc end loop + !$acc end parallel + +#ifdef USE_ACC_LASER_SUBROUTINE + end subroutine update_laser_particle_acc +#else + end subroutine update_laser_particle +#endif + end module warpx_laser_module diff --git a/Source/WarpX_picsar.F90 b/Source/WarpX_picsar.F90 index 91a2f08e7..13168ceca 100644 --- a/Source/WarpX_picsar.F90 +++ b/Source/WarpX_picsar.F90 @@ -294,7 +294,8 @@ subroutine warpx_charge_deposition(rho,np,xp,yp,zp,w,q,xmin,ymin,zmin,dx,dy,dz,n jy,jy_nguards,jy_nvalid, & jz,jz_nguards,jz_nvalid, & np,xp,yp,zp,uxp,uyp,uzp,gaminv,w,q, & - xmin,zmin,dt,dx,dz,nox,noz,lvect) + xmin,zmin,dt,dx,dz,nox,noz,lvect, & + current_depo_algo) #endif end subroutine warpx_current_deposition
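The new script Tools/parallel_postproc.py below distributes the lab-frame snapshots over MPI ranks, computes the mean beam energy and the normalized laser amplitude a0 per snapshot, and has rank 0 plot their history. A typical invocation (the rank count is illustrative)::

    mpirun -n 8 python parallel_postproc.py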
diff --git a/Tools/parallel_postproc.py b/Tools/parallel_postproc.py new file mode 100644 index 000000000..0942118ac --- /dev/null +++ b/Tools/parallel_postproc.py @@ -0,0 +1,83 @@ +import matplotlib +matplotlib.use('Agg') +from mpi4py import MPI +import glob, read_raw_data +import numpy as np +import scipy.constants as scc +import matplotlib.pyplot as plt + +species = 'electrons' +fieldname = 'Ey' +lambda0 = .81e-6 + +# --- custom functions --- # +# ------------------------ # +omega0 = 2*np.pi*scc.c/lambda0 +# Read field fieldname and return normalized max +def get_a0(res_dir, snapshot): + header = res_dir + '/Header' + print( snapshot ) + allrd, info = read_raw_data.read_lab_snapshot(snapshot, header) + F = allrd[ fieldname ] + return info['z'][-1], np.max(np.abs(F)) * scc.e/(scc.m_e*omega0*scc.c) +# Convert elements of a list to numpy arrays +def convert_to_np_array(list_in): + list_out = [] + for elem in list_in: + list_out.append( np.array( elem ) ) + return list_out + +# --- MPI parallelization --- # +# --------------------------- # +# Get ordered list of snapshot files +res_dir = './lab_frame_data/' +file_list = glob.glob(res_dir + '/snapshot?????') +file_list.sort() +# Total number of files +nfiles = len(file_list) +# Each file has a unique number +number_list = range(nfiles) +comm_world = MPI.COMM_WORLD +me = comm_world.Get_rank() +nrank = comm_world.Get_size() +# List of files to process for current proc (integer division for slice bounds) +my_list = file_list[ (me*nfiles)//nrank : ((me+1)*nfiles)//nrank ] +# List of file numbers for current proc +my_number_list = number_list[ (me*nfiles)//nrank : ((me+1)*nfiles)//nrank ] + +# --- Run parallel analysis --- # +# ----------------------------- # +# Each MPI rank reads roughly (nb snapshots)/(nb ranks) snapshots. +# Works with any number of snapshots; per-snapshot results are +# collected in lists so that nothing is lost when a rank owns +# several snapshots. +zwin_list = []; a0_list = []; uzmean_list = [] +for filename in my_list: + zwin, a0 = get_a0( res_dir, filename ) + uzlab = read_raw_data.get_particle_field(filename, species, 'uz')/scc.c + select_particles = (uzlab > 5.) + uzlab = uzlab[ select_particles ] + zwin_list.append( zwin ) + a0_list.append( a0 ) + uzmean_list.append( np.mean(uzlab) ) + +# --- gather and rank 0 plots --- # +# ------------------------------- # +# Gather per-snapshot quantities to rank 0 to plot their history. +UZMEAN = comm_world.gather(uzmean_list, root=0) +ZWIN = comm_world.gather(zwin_list, root=0) +A0 = comm_world.gather(a0_list, root=0) +# Rank 0 does the plot. +if me == 0: + # Concatenate the per-rank lists, then convert to numpy arrays + UZMEAN, ZWIN, A0 = convert_to_np_array( + [sum(UZMEAN, []), sum(ZWIN, []), sum(A0, [])]) + # Plot and save + fig = plt.figure() + plt.subplot(2,1,1) + plt.plot(ZWIN*1.e3, UZMEAN) + plt.xlabel('z (mm)') + plt.ylabel('uz') + plt.title( 'beam energy' ) + plt.grid() + plt.subplot(2,1,2) + plt.plot(ZWIN*1.e3, A0) + plt.ylabel('a0') + plt.title( 'laser amplitude' ) + plt.grid() + fig.savefig('./image.png', bbox_inches='tight') + plt.close()
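For a quick serial look at a single snapshot, the same reader the script relies on can be used directly; a hedged sketch with illustrative paths::

    import read_raw_data
    # Inspect one lab-frame snapshot without MPI.
    snapshot = './lab_frame_data/snapshot00010'
    header = './lab_frame_data/Header'
    allrd, info = read_raw_data.read_lab_snapshot(snapshot, header)
    print( info['z'][0], info['z'][-1] )  # longitudinal extent of the window
    print( allrd['Ey'].shape )            # raw lab-frame field array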