 Docs/source/visualization/plot_parallel.rst   |  63
 Docs/source/visualization/visualization.rst   |   1
 Docs/source/visualization/yt.rst              |   8
 Source/BoundaryConditions/WarpX_PML_kernels.H |   2
 Source/Diagnostics/FieldIO.H                  |   2
 Source/Parallelization/Make.package           |   1
 Source/Parallelization/WarpXComm.cpp          | 147
 Source/Parallelization/WarpXComm_K.H          | 161
 Source/Parallelization/WarpXSumGuardCells.H   |   6
 Tools/cori_postproc_script.sh                 |  15
 Tools/plot_parallel.py                        | 252
 11 files changed, 557 insertions(+), 101 deletions(-)
diff --git a/Docs/source/visualization/plot_parallel.rst b/Docs/source/visualization/plot_parallel.rst
new file mode 100644
index 000000000..75f8559e1
--- /dev/null
+++ b/Docs/source/visualization/plot_parallel.rst
@@ -0,0 +1,63 @@
+Out-of-the-box plotting script
+==============================
+
+A ready-to-use Python script for plotting simulation results is available at
+:download:`plot_parallel.py<../../../Tools/plot_parallel.py>`. Feel free to
+use it out-of-the-box or to modify it to suit your needs.
+
+Dependencies
+------------
+
+Most of its dependencies are standard Python packages that come with a default
+Anaconda installation or can be installed with ``pip`` or ``conda``:
+`os, glob, sys, argparse, matplotlib, numpy, scipy`.
+
+Additional dependencies are ``yt >= 3.5`` (or ``yt >= 3.6`` if you are using
+rigid injection, see section :doc:`yt` on how to install ``yt``), and ``mpi4py``.
+
+Run serial
+----------
+
+Executing the script with
+
+::
+
+ python plot_parallel.py
+
+will loop through plotfiles named ``plt?????`` (e.g., ``plt00000``, ``plt00100``, etc.)
+and save one image per plotfile. For a 2D simulation, a 2D colormap of the ``Ez``
+field is plotted by default, along with 1/20 of the particles of each species
+(each species in a different color). For a 3D simulation, a 2D colormap of the
+central slice in `y` is plotted, and particles are handled the same way.
+
+The script reads command-line options (which field and particle species, rendering with
+`yt` or `matplotlib`, etc.). For the full list of options, run
+
+::
+
+ python plot_parallel.py --help
+
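+For example, the following call plots the ``By`` field with only the ``beam``
+species (a placeholder species name; the options themselves are all defined in
+the script's argument parser), rendered with `matplotlib`:
+
+::
+
+    python plot_parallel.py --field By --species beam --plotlib matplotlib
+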
+In particular, option ``--plot_Ey_max_evolution`` shows you how to plot the evolution of
+a scalar quantity over time (by default, the max of the Ey field). Feel free to modify it
+to plot the evolution of other quantities.
+
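+As a minimal sketch (reusing the names from the script's ``get_field_max``),
+the reduction over the field array can be swapped for any other scalar:
+
+::
+
+    # e.g., mean absolute value of the field instead of its max
+    maxF = np.mean(np.abs(F))
+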
+Run parallel
+------------
+
+To execute the script in parallel, you can run for instance
+
+::
+
+ mpirun -np 4 python plot_parallel.py --parallel
+
+In this case, the plotfiles to process are distributed as evenly as possible
+among the MPI ranks.
+Note that each plotfile is still processed in serial. When option
+``--plot_Ey_max_evolution`` is on, the scalar quantity is gathered to rank 0, and
+rank 0 plots the image.
+
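+The split is the contiguous one used in the script itself; as a sketch, with
+``rank`` and ``size`` obtained from ``mpi4py``:
+
+::
+
+    # each rank gets a contiguous, near-equal share of the sorted file list
+    my_list = file_list[ (rank*nfiles)//size : ((rank+1)*nfiles)//size ]
+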
+If all dependencies are satisfied, the script can be used on Summit or Cori. For
+instance, the following batch script illustrates how to submit a post-processing
+batch job on Cori Haswell with some options:
+
+.. literalinclude:: ../../../Tools/cori_postproc_script.sh
+   :language: bash
\ No newline at end of file
diff --git a/Docs/source/visualization/visualization.rst b/Docs/source/visualization/visualization.rst
index 1ddd8218f..76acd1bb6 100644
--- a/Docs/source/visualization/visualization.rst
+++ b/Docs/source/visualization/visualization.rst
@@ -21,6 +21,7 @@ This section describes some of the tools available to visualize the data:
picviewer
openpmdviewer
advanced
+ plot_parallel
In addition, WarpX also has In-Situ Visualization capabilities (i.e.
diff --git a/Docs/source/visualization/yt.rst b/Docs/source/visualization/yt.rst
index cfc859ffa..7c52f81f8 100644
--- a/Docs/source/visualization/yt.rst
+++ b/Docs/source/visualization/yt.rst
@@ -20,6 +20,14 @@ or with the `Anaconda distribution <https://anaconda.org/>`__ of python (recomme
conda install -c conda-forge yt
+The latest version of `yt` may be required for advanced options (e.g., rigid
+injection for particles). To build `yt` directly from source, you can use
+
+::
+
+ pip install git+https://github.com/yt-project/yt.git
+
+
Visualizing the data
--------------------
diff --git a/Source/BoundaryConditions/WarpX_PML_kernels.H b/Source/BoundaryConditions/WarpX_PML_kernels.H
index 23d19e2e8..8f779a5c2 100644
--- a/Source/BoundaryConditions/WarpX_PML_kernels.H
+++ b/Source/BoundaryConditions/WarpX_PML_kernels.H
@@ -99,7 +99,7 @@ void warpx_push_pml_bx_ckc(int i, int j, int k, Array4<Real> const&Bx,
- Ey(i+1,j+1,k ,0) - Ey(i+1,j+1,k ,1) - Ey(i+1,j+1,k ,2)
+ Ey(i-1,j+1,k+1,0) + Ey(i-1,j+1,k+1,1) + Ey(i-1,j+1,k+1,2)
- Ey(i-1,j+1,k ,0) - Ey(i-1,j+1,k ,1) - Ey(i-1,j+1,k ,2)
- + Ey(i+1,j-1,k+1,0) + Ey(i+1,j-1,k+1,1) + Ey(i+1,j-2,k+1,2)
+ + Ey(i+1,j-1,k+1,0) + Ey(i+1,j-1,k+1,1) + Ey(i+1,j-1,k+1,2)
- Ey(i+1,j-1,k ,0) - Ey(i+1,j-1,k ,1) - Ey(i+1,j-1,k ,2)
+ Ey(i-1,j-1,k+1,0) + Ey(i-1,j-1,k+1,1) + Ey(i-1,j-1,k+1,2)
- Ey(i-1,j-1,k ,0) - Ey(i-1,j-1,k ,1) - Ey(i-1,j-1,k ,2));
diff --git a/Source/Diagnostics/FieldIO.H b/Source/Diagnostics/FieldIO.H
index f08d85f2d..7cdc9b710 100644
--- a/Source/Diagnostics/FieldIO.H
+++ b/Source/Diagnostics/FieldIO.H
@@ -99,7 +99,7 @@ getReversedVec( const amrex::Real* v );
void
WriteOpenPMDFields( const std::string& filename,
const std::vector<std::string>& varnames,
- const amrex::MultiFab& mf, const Geometry& geom,
+ const amrex::MultiFab& mf, const amrex::Geometry& geom,
const int iteration, const double time );
#endif // WARPX_USE_OPENPMD
diff --git a/Source/Parallelization/Make.package b/Source/Parallelization/Make.package
index 3d1fcf1da..c74583522 100644
--- a/Source/Parallelization/Make.package
+++ b/Source/Parallelization/Make.package
@@ -1,6 +1,7 @@
CEXE_sources += WarpXComm.cpp
CEXE_sources += WarpXRegrid.cpp
CEXE_headers += WarpXSumGuardCells.H
+CEXE_headers += WarpXComm_K.H
INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Parallelization
VPATH_LOCATIONS += $(WARPX_HOME)/Source/Parallelization
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index e24dd772c..990d0f988 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -1,9 +1,8 @@
+#include <WarpXComm_K.H>
#include <WarpX.H>
#include <WarpX_f.H>
#include <WarpXSumGuardCells.H>
-#include <AMReX_FillPatchUtil_F.H>
-
#include <algorithm>
#include <cstdlib>
@@ -52,8 +51,6 @@ WarpX::UpdateAuxilaryData ()
{
BL_PROFILE("UpdateAuxilaryData()");
- const int use_limiter = 0;
-
for (int lev = 1; lev <= finest_level; ++lev)
{
const auto& crse_period = Geom(lev-1).periodicity();
@@ -81,57 +78,37 @@ WarpX::UpdateAuxilaryData ()
MultiFab::Subtract(dBy, *Bfield_cp[lev][1], 0, 0, Bfield_cp[lev][1]->nComp(), ng);
MultiFab::Subtract(dBz, *Bfield_cp[lev][2], 0, 0, Bfield_cp[lev][2]->nComp(), ng);
- const Real* dx = Geom(lev-1).CellSize();
const int refinement_ratio = refRatio(lev-1)[0];
+ AMREX_ALWAYS_ASSERT(refinement_ratio == 2);
+
#ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
#endif
+ for (MFIter mfi(*Bfield_aux[lev][0]); mfi.isValid(); ++mfi)
{
- std::array<FArrayBox,3> bfab;
- for (MFIter mfi(*Bfield_aux[lev][0]); mfi.isValid(); ++mfi)
+ Array4<Real> const& bx_aux = Bfield_aux[lev][0]->array(mfi);
+ Array4<Real> const& by_aux = Bfield_aux[lev][1]->array(mfi);
+ Array4<Real> const& bz_aux = Bfield_aux[lev][2]->array(mfi);
+ Array4<Real const> const& bx_fp = Bfield_fp[lev][0]->const_array(mfi);
+ Array4<Real const> const& by_fp = Bfield_fp[lev][1]->const_array(mfi);
+ Array4<Real const> const& bz_fp = Bfield_fp[lev][2]->const_array(mfi);
+ Array4<Real const> const& bx_c = dBx.const_array(mfi);
+ Array4<Real const> const& by_c = dBy.const_array(mfi);
+ Array4<Real const> const& bz_c = dBz.const_array(mfi);
+
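+            // Interpolate the coarse-patch correction to the fine grid and
+            // add the fine-patch field, storing the result in the aux field
+            // (one lambda per field component).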
+ amrex::ParallelFor(Box(bx_aux), Box(by_aux), Box(bz_aux),
+ [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
{
- Box ccbx = mfi.fabbox();
- ccbx.enclosedCells();
- ccbx.coarsen(refinement_ratio).refine(refinement_ratio); // so that ccbx is coarsenable
-
- const FArrayBox& cxfab = dBx[mfi];
- const FArrayBox& cyfab = dBy[mfi];
- const FArrayBox& czfab = dBz[mfi];
- bfab[0].resize(amrex::convert(ccbx,Bx_nodal_flag));
- bfab[1].resize(amrex::convert(ccbx,By_nodal_flag));
- bfab[2].resize(amrex::convert(ccbx,Bz_nodal_flag));
-
-#if (AMREX_SPACEDIM == 3)
- amrex_interp_div_free_bfield(ccbx.loVect(), ccbx.hiVect(),
- BL_TO_FORTRAN_ANYD(bfab[0]),
- BL_TO_FORTRAN_ANYD(bfab[1]),
- BL_TO_FORTRAN_ANYD(bfab[2]),
- BL_TO_FORTRAN_ANYD(cxfab),
- BL_TO_FORTRAN_ANYD(cyfab),
- BL_TO_FORTRAN_ANYD(czfab),
- dx, &refinement_ratio,&use_limiter);
-#else
- amrex_interp_div_free_bfield(ccbx.loVect(), ccbx.hiVect(),
- BL_TO_FORTRAN_ANYD(bfab[0]),
- BL_TO_FORTRAN_ANYD(bfab[2]),
- BL_TO_FORTRAN_ANYD(cxfab),
- BL_TO_FORTRAN_ANYD(czfab),
- dx, &refinement_ratio,&use_limiter);
- amrex_interp_cc_bfield(ccbx.loVect(), ccbx.hiVect(),
- BL_TO_FORTRAN_ANYD(bfab[1]),
- BL_TO_FORTRAN_ANYD(cyfab),
- &refinement_ratio,&use_limiter);
-#endif
-
- for (int idim = 0; idim < 3; ++idim)
- {
- FArrayBox& aux = (*Bfield_aux[lev][idim])[mfi];
- FArrayBox& fp = (*Bfield_fp[lev][idim])[mfi];
- const Box& bx = aux.box();
- aux.copy(fp, bx, 0, bx, 0, 1);
- aux.plus(bfab[idim], bx, bx, 0, 0, 1);
- }
- }
+ warpx_interp_bfield_x(j,k,l, bx_aux, bx_fp, bx_c);
+ },
+ [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+ {
+ warpx_interp_bfield_y(j,k,l, by_aux, by_fp, by_c);
+ },
+ [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+ {
+ warpx_interp_bfield_z(j,k,l, bz_aux, bz_fp, bz_c);
+ });
}
}
@@ -156,56 +133,34 @@ WarpX::UpdateAuxilaryData ()
MultiFab::Subtract(dEy, *Efield_cp[lev][1], 0, 0, Efield_cp[lev][1]->nComp(), ng);
MultiFab::Subtract(dEz, *Efield_cp[lev][2], 0, 0, Efield_cp[lev][2]->nComp(), ng);
- const int refinement_ratio = refRatio(lev-1)[0];
#ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
#endif
+ for (MFIter mfi(*Efield_aux[lev][0]); mfi.isValid(); ++mfi)
{
- std::array<FArrayBox,3> efab;
- for (MFIter mfi(*Efield_aux[lev][0]); mfi.isValid(); ++mfi)
+ Array4<Real> const& ex_aux = Efield_aux[lev][0]->array(mfi);
+ Array4<Real> const& ey_aux = Efield_aux[lev][1]->array(mfi);
+ Array4<Real> const& ez_aux = Efield_aux[lev][2]->array(mfi);
+ Array4<Real const> const& ex_fp = Efield_fp[lev][0]->const_array(mfi);
+ Array4<Real const> const& ey_fp = Efield_fp[lev][1]->const_array(mfi);
+ Array4<Real const> const& ez_fp = Efield_fp[lev][2]->const_array(mfi);
+ Array4<Real const> const& ex_c = dEx.const_array(mfi);
+ Array4<Real const> const& ey_c = dEy.const_array(mfi);
+ Array4<Real const> const& ez_c = dEz.const_array(mfi);
+
+ amrex::ParallelFor(Box(ex_aux), Box(ey_aux), Box(ez_aux),
+ [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
{
- Box ccbx = mfi.fabbox();
- ccbx.enclosedCells();
- ccbx.coarsen(refinement_ratio).refine(refinement_ratio); // so that ccbx is coarsenable
-
- const FArrayBox& cxfab = dEx[mfi];
- const FArrayBox& cyfab = dEy[mfi];
- const FArrayBox& czfab = dEz[mfi];
- efab[0].resize(amrex::convert(ccbx,Ex_nodal_flag));
- efab[1].resize(amrex::convert(ccbx,Ey_nodal_flag));
- efab[2].resize(amrex::convert(ccbx,Ez_nodal_flag));
-
-#if (AMREX_SPACEDIM == 3)
- amrex_interp_efield(ccbx.loVect(), ccbx.hiVect(),
- BL_TO_FORTRAN_ANYD(efab[0]),
- BL_TO_FORTRAN_ANYD(efab[1]),
- BL_TO_FORTRAN_ANYD(efab[2]),
- BL_TO_FORTRAN_ANYD(cxfab),
- BL_TO_FORTRAN_ANYD(cyfab),
- BL_TO_FORTRAN_ANYD(czfab),
- &refinement_ratio,&use_limiter);
-#else
- amrex_interp_efield(ccbx.loVect(), ccbx.hiVect(),
- BL_TO_FORTRAN_ANYD(efab[0]),
- BL_TO_FORTRAN_ANYD(efab[2]),
- BL_TO_FORTRAN_ANYD(cxfab),
- BL_TO_FORTRAN_ANYD(czfab),
- &refinement_ratio,&use_limiter);
- amrex_interp_nd_efield(ccbx.loVect(), ccbx.hiVect(),
- BL_TO_FORTRAN_ANYD(efab[1]),
- BL_TO_FORTRAN_ANYD(cyfab),
- &refinement_ratio);
-#endif
-
- for (int idim = 0; idim < 3; ++idim)
- {
- FArrayBox& aux = (*Efield_aux[lev][idim])[mfi];
- FArrayBox& fp = (*Efield_fp[lev][idim])[mfi];
- const Box& bx = aux.box();
- aux.copy(fp, bx, 0, bx, 0, Efield_fp[lev][idim]->nComp());
- aux.plus(efab[idim], bx, bx, 0, 0, Efield_fp[lev][idim]->nComp());
- }
- }
+ warpx_interp_efield_x(j,k,l, ex_aux, ex_fp, ex_c);
+ },
+ [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+ {
+ warpx_interp_efield_y(j,k,l, ey_aux, ey_fp, ey_c);
+ },
+ [=] AMREX_GPU_DEVICE (int j, int k, int l) noexcept
+ {
+ warpx_interp_efield_z(j,k,l, ez_aux, ez_fp, ez_c);
+ });
}
}
}
diff --git a/Source/Parallelization/WarpXComm_K.H b/Source/Parallelization/WarpXComm_K.H
new file mode 100644
index 000000000..093323ec3
--- /dev/null
+++ b/Source/Parallelization/WarpXComm_K.H
@@ -0,0 +1,161 @@
+#ifndef WARPX_COMM_K_H_
+#define WARPX_COMM_K_H_
+
+#include <AMReX_FArrayBox.H>
+
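+// The kernels below interpolate the coarse-patch correction (e.g. Bxc) to a
+// fine-grid point (j,k,l) and add it to the fine-patch value (e.g. Bxf),
+// storing the result in the aux array (e.g. Bxa). With refinement ratio 2
+// (asserted by the caller), the parent coarse index is jg = j/2, and the
+// linear-interpolation weight along a direction is 0 when the fine point
+// coincides with a coarse point and 0.5 when it lies halfway between two.
+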
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_bfield_x (int j, int k, int l,
+ amrex::Array4<amrex::Real> const& Bxa,
+ amrex::Array4<amrex::Real const> const& Bxf,
+ amrex::Array4<amrex::Real const> const& Bxc)
+{
+ using namespace amrex;
+
+ int lg = amrex::coarsen(l,2);
+ int kg = amrex::coarsen(k,2);
+ int jg = amrex::coarsen(j,2);
+
+ Real wx = (j == jg*2) ? 0.0 : 0.5;
+ Real owx = 1.0-wx;
+ Bxa(j,k,l) = owx * Bxc(jg,kg,lg) + wx * Bxc(jg+1,kg,lg) + Bxf(j,k,l);
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_bfield_y (int j, int k, int l,
+ amrex::Array4<amrex::Real> const& Bya,
+ amrex::Array4<amrex::Real const> const& Byf,
+ amrex::Array4<amrex::Real const> const& Byc)
+{
+ using namespace amrex;
+
+ int lg = amrex::coarsen(l,2);
+ int kg = amrex::coarsen(k,2);
+ int jg = amrex::coarsen(j,2);
+
+ // Note that for 2d, l=0, because the amrex convention is used here.
+
+#if (AMREX_SPACEDIM == 3)
+ Real wy = (k == kg*2) ? 0.0 : 0.5;
+ Real owy = 1.0-wy;
+ Bya(j,k,l) = owy * Byc(jg,kg,lg) + wy * Byc(jg,kg+1,lg) + Byf(j,k,l);
+#else
+ Bya(j,k,l) = Byc(jg,kg,lg) + Byf(j,k,l);
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_bfield_z (int j, int k, int l,
+ amrex::Array4<amrex::Real> const& Bza,
+ amrex::Array4<amrex::Real const> const& Bzf,
+ amrex::Array4<amrex::Real const> const& Bzc)
+{
+ using namespace amrex;
+
+ int lg = amrex::coarsen(l,2);
+ int kg = amrex::coarsen(k,2);
+ int jg = amrex::coarsen(j,2);
+
+ // Note that for 2d, l=0, because the amrex convention is used here.
+
+#if (AMREX_SPACEDIM == 3)
+ Real wz = (l == lg*2) ? 0.0 : 0.5;
+ Real owz = 1.0-wz;
+    Bza(j,k,l) = owz * Bzc(jg,kg,lg) + wz * Bzc(jg,kg,lg+1) + Bzf(j,k,l);
+#else
+ Real wy = (k == kg*2) ? 0.0 : 0.5;
+ Real owy = 1.0-wy;
+    Bza(j,k,l) = owy * Bzc(jg,kg,lg) + wy * Bzc(jg,kg+1,lg) + Bzf(j,k,l);
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_efield_x (int j, int k, int l,
+ amrex::Array4<amrex::Real> const& Exa,
+ amrex::Array4<amrex::Real const> const& Exf,
+ amrex::Array4<amrex::Real const> const& Exc)
+{
+ using namespace amrex;
+
+ int lg = amrex::coarsen(l,2);
+ int kg = amrex::coarsen(k,2);
+ int jg = amrex::coarsen(j,2);
+
+ Real wy = (k == kg*2) ? 0.0 : 0.5;
+ Real owy = 1.0-wy;
+
+#if (AMREX_SPACEDIM == 3)
+ Real wz = (l == lg*2) ? 0.0 : 0.5;
+ Real owz = 1.0-wz;
+ Exa(j,k,l) = owy * owz * Exc(jg ,kg ,lg )
+ + wy * owz * Exc(jg ,kg+1,lg )
+ + owy * wz * Exc(jg ,kg ,lg+1)
+ + wy * wz * Exc(jg ,kg+1,lg+1)
+ + Exf(j,k,l);
+#else
+ Exa(j,k,l) = owy * Exc(jg,kg,lg) + wy * Exc(jg,kg+1,lg) + Exf(j,k,l);
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_efield_y (int j, int k, int l,
+ amrex::Array4<amrex::Real> const& Eya,
+ amrex::Array4<amrex::Real const> const& Eyf,
+ amrex::Array4<amrex::Real const> const& Eyc)
+{
+ using namespace amrex;
+
+ int lg = amrex::coarsen(l,2);
+ int kg = amrex::coarsen(k,2);
+ int jg = amrex::coarsen(j,2);
+
+ Real wx = (j == jg*2) ? 0.0 : 0.5;
+ Real owx = 1.0-wx;
+
+#if (AMREX_SPACEDIM == 3)
+ Real wz = (l == lg*2) ? 0.0 : 0.5;
+ Real owz = 1.0-wz;
+ Eya(j,k,l) = owx * owz * Eyc(jg ,kg ,lg )
+ + wx * owz * Eyc(jg+1,kg ,lg )
+ + owx * wz * Eyc(jg ,kg ,lg+1)
+ + wx * wz * Eyc(jg+1,kg ,lg+1)
+ + Eyf(j,k,l);
+#else
+ Real wy = (k == kg*2) ? 0.0 : 0.5;
+ Real owy = 1.0-wy;
+ Eya(j,k,l) = owx * owy * Eyc(jg ,kg ,lg)
+ + wx * owy * Eyc(jg+1,kg ,lg)
+ + owx * wy * Eyc(jg ,kg+1,lg)
+ + wx * wy * Eyc(jg+1,kg+1,lg)
+ + Eyf(j,k,l);
+#endif
+}
+
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+void warpx_interp_efield_z (int j, int k, int l,
+ amrex::Array4<amrex::Real> const& Eza,
+ amrex::Array4<amrex::Real const> const& Ezf,
+ amrex::Array4<amrex::Real const> const& Ezc)
+{
+ using namespace amrex;
+
+ int lg = amrex::coarsen(l,2);
+ int kg = amrex::coarsen(k,2);
+ int jg = amrex::coarsen(j,2);
+
+ Real wx = (j == jg*2) ? 0.0 : 0.5;
+ Real owx = 1.0-wx;
+
+#if (AMREX_SPACEDIM == 3)
+ Real wy = (k == kg*2) ? 0.0 : 0.5;
+ Real owy = 1.0-wy;
+ Eza(j,k,l) = owx * owy * Ezc(jg ,kg ,lg )
+ + wx * owy * Ezc(jg+1,kg ,lg )
+ + owx * wy * Ezc(jg ,kg+1,lg )
+ + wx * wy * Ezc(jg+1,kg+1,lg )
+ + Ezf(j,k,l);
+#else
+ Eza(j,k,l) = owx * Ezc(jg,kg,lg) + wx * Ezc(jg+1,kg,lg) + Ezf(j,k,l);
+#endif
+}
+
+#endif
diff --git a/Source/Parallelization/WarpXSumGuardCells.H b/Source/Parallelization/WarpXSumGuardCells.H
index 24ad1b80f..ce353c2b6 100644
--- a/Source/Parallelization/WarpXSumGuardCells.H
+++ b/Source/Parallelization/WarpXSumGuardCells.H
@@ -15,7 +15,7 @@
* updates both the *valid* cells and *guard* cells. (This is because a
* spectral solver requires the value of the sources over a large stencil.)
*/
-void
+inline void
WarpXSumGuardCells(amrex::MultiFab& mf, const amrex::Periodicity& period,
const int icomp=0, const int ncomp=1){
#ifdef WARPX_USE_PSATD
@@ -43,7 +43,7 @@ WarpXSumGuardCells(amrex::MultiFab& mf, const amrex::Periodicity& period,
* Note: `i_comp` is the component where the results will be stored in `dst`;
* The component from which we copy in `src` is always 0.
*/
-void
+inline void
WarpXSumGuardCells(amrex::MultiFab& dst, amrex::MultiFab& src,
const amrex::Periodicity& period,
const int icomp=0, const int ncomp=1){
@@ -54,7 +54,7 @@ WarpXSumGuardCells(amrex::MultiFab& dst, amrex::MultiFab& src,
// Update only the valid cells
const amrex::IntVect n_updated_guards = amrex::IntVect::TheZeroVector();
#endif
- src.SumBoundary(icomp, ncomp, n_updated_guards, period);
+ src.SumBoundary(0, ncomp, n_updated_guards, period);
amrex::Copy( dst, src, 0, icomp, ncomp, n_updated_guards );
}
diff --git a/Tools/cori_postproc_script.sh b/Tools/cori_postproc_script.sh
new file mode 100644
index 000000000..5e847c7cc
--- /dev/null
+++ b/Tools/cori_postproc_script.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+#SBATCH --job-name=postproc
+#SBATCH --time=00:20:00
+#SBATCH -C haswell
+#SBATCH -N 8
+#SBATCH -q regular
+#SBATCH -e postproce.txt
+#SBATCH -o postproco.txt
+#SBATCH --mail-type=end
+#SBATCH --account=m2852
+
+export OMP_NUM_THREADS=1
+
+# Requires Python 3 and yt >= 3.5
+srun -n 32 -c 16 python plot_parallel.py --path <path/to/plotfiles> --plotlib=yt --parallel
diff --git a/Tools/plot_parallel.py b/Tools/plot_parallel.py
new file mode 100644
index 000000000..2041e7935
--- /dev/null
+++ b/Tools/plot_parallel.py
@@ -0,0 +1,252 @@
+import os, glob, matplotlib, sys, argparse
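+# Reduce yt's log verbosity (50 = only critical messages)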
+import yt ; yt.funcs.mylog.setLevel(50)
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.constants as scc
+
+'''
+This script loops over all WarpX plotfiles in a directory and, for each
+plotfile, saves an image showing the field and particles.
+
+Requires yt >= 3.5 and Python 3
+
+It can be run serial:
+> python plot_parallel.py --path <path/to/plt/files>
+or parallel
+> mpirun -np 32 python plot_parallel.py --path <path/to/plt/files> --parallel
+When running parallel, the plotfiles are distributed as evenly as possible
+between MPI ranks.
+
+This script also provides an option to plot one quantity over all timesteps.
+The data of all plotfiles are gathered to rank 0, and the quantity's evolution
+is plotted and saved to file. As an illustration, this quantity is the max
+of Ey. Use " --plot_Ey_max_evolution " to activate this option.
+
+To get help, run
+> python plot_parallel.py --help
+'''
+
+# Parse command line for options.
+parser = argparse.ArgumentParser()
+parser.add_argument('--path', dest='path', default='.',
+                    help='Path to plotfiles. Plotfile names must be plt?????')
+parser.add_argument('--plotlib', dest='plotlib', default='yt',
+ choices=['yt','matplotlib'],
+ help='Plotting library to use')
+parser.add_argument('--field', dest='field', default='Ez',
+ help='Which field to plot, e.g., Ez, By, jx or rho. The central slice in y is plotted')
+parser.add_argument('--pjump', dest='pjump', default=20, type=int,
+                    help='When plotlib=matplotlib, plot only one out of every pjump particles')
+parser.add_argument('--use_vmax', dest='use_vmax', action='store_true', default=False,
+                    help='Whether to bound the field colormap')
+parser.add_argument('--vmax', dest='vmax', default=1.e12, type=float,
+                    help='If --use_vmax is passed, the colormap will have bounds [-vmax, vmax]')
+parser.add_argument('--slicewidth', dest='slicewidth', default=10.e-6, type=float,
+                    help='Only particles with -slicewidth/2<y<slicewidth/2 are plotted')
+parser.add_argument('--parallel', dest='parallel', action='store_true', default=False,
+ help='whether or not to do the analysis in parallel (e.g., 1 plotfile per MPI rank)')
+parser.add_argument('--species', dest='pslist', nargs='+', type=str, default=None,
+ help='Species to be plotted, e.g., " --species beam plasma_e ". By default, all species in the simulation are shown')
+parser.add_argument('--plot_Ey_max_evolution', dest='plot_Ey_max_evolution', action='store_true', default=False,
+ help='Whether to plot evolution of max(Ey) to illustrate how to plot one quantity across all plotfiles.')
+args = parser.parse_args()
+
+# Sanity check
+if sys.version_info.major != 3:
+    print('WARNING: Parallel analysis was only tested with Python 3')
+
+matplotlib.rcParams.update({'font.size': 14})
+pscolor = ['r','g','b','k','m','c','y','w']
+pssize = 1.
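+# dataset dimensionality -> yt slice axis: axis 1 (y) for 3D data, axis 2 (z) for 2D data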
+yt_slicedir = {2:2, 3:1}
+yt_aspect = {2:.05, 3:20}
+
+# Get list of particle species.
+def get_species(a_file_list):
+ # if user-specified, just return the user list
+ if args.pslist is not None:
+ return args.pslist
+ # otherwise, loop over all plotfiles to get particle species list
+ pslist = []
+ for filename in a_file_list:
+ ds = yt.load( filename )
+ # get list of species in current plotfile
+ pslist_plotfile = list( set( [x[0] for x in ds.field_list
+ if x[1][:9]=='particle_'] ) )
+ # append species in current plotfile to pslist, and uniquify
+ pslist = list( set( pslist + pslist_plotfile ) )
+ pslist.sort()
+ return pslist
+
+def plot_snapshot(filename):
+ print( filename )
+ # Load plotfile
+ ds = yt.load( filename )
+    # Get the number of dimensions
+    dim = ds.dimensionality
+    # Data container used below to read particle quantities
+    ad = ds.all_data()
+
+ # Plot field colormap
+ if plotlib == 'matplotlib':
+ plt.figure(figsize=(12,7))
+ # Read field quantities from yt dataset
+ all_data_level_0 = ds.covering_grid(level=0,left_edge=ds.domain_left_edge, dims=ds.domain_dimensions)
+ F = all_data_level_0['boxlib', args.field].v.squeeze()
+ if dim == 3:
+        F = F[:,F.shape[1]//2,:]
+ extent = [ds.domain_left_edge[dim-1], ds.domain_right_edge[dim-1],
+ ds.domain_left_edge[0], ds.domain_right_edge[0]]
+ # Plot field quantities with matplotlib
+ plt.imshow(F, aspect='auto', extent=extent, origin='lower')
+ plt.colorbar()
+ plt.xlim(ds.domain_left_edge[dim-1], ds.domain_right_edge[dim-1])
+ plt.ylim(ds.domain_left_edge[0], ds.domain_right_edge[0])
+ if args.use_vmax:
+ plt.clim(-args.vmax, args.vmax)
+ if plotlib == 'yt':
+ # Directly plot with yt
+ sl = yt.SlicePlot(ds, yt_slicedir[dim], args.field, aspect=yt_aspect[dim])
+
+ # Plot particle quantities
+ for ispecies, pspecies in enumerate(pslist):
+ if pspecies in [x[0] for x in ds.field_list]:
+ if plotlib == 'matplotlib':
+ # Read particle quantities from yt dataset
+ xp = ad[pspecies, 'particle_position_x'].v
+ if dim == 3:
+ yp = ad[pspecies, 'particle_position_y'].v
+ zp = ad[pspecies, 'particle_position_z'].v
+ select = yp**2<(args.slicewidth/2)**2
+ xp = xp[select] ; yp = yp[select] ; zp = zp[select]
+ if dim == 2:
+ zp = ad[pspecies, 'particle_position_y'].v
+ # Select randomly one every pjump particles
+ random_indices = np.random.choice(xp.shape[0], int(xp.shape[0]/args.pjump))
+ if dim == 2:
+ xp=xp[random_indices] ; zp=zp[random_indices]
+ if dim == 3:
+ xp=xp[random_indices] ; yp=yp[random_indices] ; zp=zp[random_indices]
+ plt.scatter(zp,xp,c=pscolor[ispecies],s=pssize, linewidth=pssize,marker=',')
+ if plotlib == 'yt':
+ # Directly plot particles with yt
+ sl.annotate_particles(width=(args.slicewidth, 'm'), p_size=pssize,
+ ptype=pspecies, col=pscolor[ispecies])
+ # Add labels to plot and save
+ iteration = int(filename[-5:])
+ if plotlib == 'matplotlib':
+ plt.xlabel('z (m)')
+ plt.ylabel('x (m)')
+ plt.title(args.field + ' at iteration ' + str(iteration) +
+ ', time = ' + str(ds.current_time))
+ plt.savefig(args.path + '/plt_' + args.field + '_' + plotlib + '_' +
+ str(iteration).zfill(5) + '.png', bbox_inches='tight', dpi=300)
+ plt.close()
+ if plotlib == 'yt':
+ sl.annotate_grids()
+ sl.save(args.path + '/plt_' + args.field + '_' + plotlib + '_' +
+ str(iteration).zfill(5) + '.png')
+
+# Compute max of field a_field in plotfile filename
+def get_field_max( filename, a_field ):
+ # Load plotfile
+ ds = yt.load( filename )
+    # Get the number of dimensions
+ dim = ds.dimensionality
+ # Read field quantities from yt dataset
+ all_data_level_0 = ds.covering_grid(level=0,left_edge=ds.domain_left_edge, dims=ds.domain_dimensions)
+ F = all_data_level_0['boxlib', a_field].v.squeeze()
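+    # z position of the center of the simulation domain (tracks the moving window)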
+ zwin = (ds.domain_left_edge[dim-1]+ds.domain_right_edge[dim-1])/2
+ maxF = np.amax(F)
+ return zwin, maxF
+
+def plot_field_max():
+ plt.figure()
+ plt.plot(zwin_arr, maxF_arr)
+ plt.xlabel('z (m)')
+ plt.ylabel('Field (S.I.)')
+ plt.title('Field max evolution')
+ plt.savefig('max_field_evolution.pdf', bbox_inches='tight')
+
+### Analysis ###
+
+# Get list of plotfiles
+plotlib = args.plotlib
+plot_Ey_max_evolution = args.plot_Ey_max_evolution
+file_list = glob.glob(args.path + '/plt?????')
+file_list.sort()
+nfiles = len(file_list)
+number_list = range(nfiles)
+
+if args.parallel:
+ ### Parallel analysis ###
+ # Split plotfile list among MPI ranks
+ from mpi4py import MPI
+ comm_world = MPI.COMM_WORLD
+ rank = comm_world.Get_rank()
+ size = comm_world.Get_size()
+ max_buf_size = nfiles//size+1
+ if rank == 0:
+ print('Parallel analysis')
+        print('Number of MPI ranks: ' + str(size))
+ print('Number of plotfiles: %s' %nfiles)
+ # List of files processed by current MPI rank
+ my_list = file_list[ (rank*nfiles)//size : ((rank+1)*nfiles)//size ]
+ my_number_list = number_list[ (rank*nfiles)//size : ((rank+1)*nfiles)//size ]
+ my_nfiles = len( my_list )
+    nfiles_list = comm_world.gather(my_nfiles, root=0)
+ # Get list of particles to plot
+    pslist = get_species(file_list)
+ if rank == 0:
+ print('list of species: ', pslist)
+ if plot_Ey_max_evolution:
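+        # Pad the per-rank buffers to a common size so that comm_world.Gather
+        # can use fixed-size messages; unused trailing entries stay zero and
+        # are dropped on rank 0 using nfiles_list.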
+ my_zwin = np.zeros( max_buf_size )
+ my_maxF = np.zeros( max_buf_size )
+ # Loop over files and
+ # - plot field snapshot
+ # - store window position and field max in arrays
+ for count, filename in enumerate(my_list):
+ plot_snapshot( filename )
+ if plot_Ey_max_evolution:
+ my_zwin[count], my_maxF[count] = get_field_max( filename, 'Ey' )
+
+ if plot_Ey_max_evolution:
+ # Gather window position and field max arrays to rank 0
+ zwin_rbuf = None
+ maxF_rbuf = None
+ if rank == 0:
+ zwin_rbuf = np.empty([size, max_buf_size], dtype='d')
+ maxF_rbuf = np.empty([size, max_buf_size], dtype='d')
+ comm_world.Gather(my_zwin, zwin_rbuf, root=0)
+ comm_world.Gather(my_maxF, maxF_rbuf, root=0)
+ # Re-format 2D arrays zwin_rbuf and maxF_rbuf on rank 0
+ # into 1D arrays, and plot them
+ if rank == 0:
+ zwin_arr = np.zeros( nfiles )
+ maxF_arr = np.zeros( nfiles )
+ istart = 0
+ for i in range(size):
+ nelem = nfiles_list[i]
+ zwin_arr[istart:istart+nelem] = zwin_rbuf[i,0:nelem]
+ maxF_arr[istart:istart+nelem] = maxF_rbuf[i,0:nelem]
+ istart += nelem
+ # Plot evolution of field max
+ plot_field_max()
+else:
+ ### Serial analysis ###
+ print('Serial analysis')
+ print('Number of plotfiles: %s' %nfiles)
+    pslist = get_species(file_list)
+ print('list of species: ', pslist)
+ if plot_Ey_max_evolution:
+ zwin_arr = np.zeros( nfiles )
+ maxF_arr = np.zeros( nfiles )
+ # Loop over files and
+ # - plot field snapshot
+ # - store window position and field max in arrays
+ for count, filename in enumerate(file_list):
+ plot_snapshot( filename )
+ if plot_Ey_max_evolution:
+ zwin_arr[count], maxF_arr[count] = get_field_max( filename, 'Ey' )
+ # Plot evolution of field max
+ if plot_Ey_max_evolution:
+ plot_field_max()