7 files changed, 1445 insertions, 0 deletions
diff --git a/Source/FieldSolver/Make.package b/Source/FieldSolver/Make.package
new file mode 100644
index 000000000..42eb6a6f0
--- /dev/null
+++ b/Source/FieldSolver/Make.package
@@ -0,0 +1,15 @@
+CEXE_headers += WarpX_K.H
+CEXE_sources += WarpXPushFieldsEM.cpp
+ifeq ($(USE_PSATD),TRUE)
+  CEXE_sources += WarpXFFT.cpp
+  F90EXE_sources += WarpX_fft.F90
+endif
+ifeq ($(USE_OPENBC_POISSON),TRUE)
+  F90EXE_sources += openbc_poisson_solver.F90
+endif
+ifeq ($DO_ELECTROSTATIC,TRUE)
+  F90EXE_sources += solve_E_nodal.F90
+endif
+
+INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/FieldSolver
+VPATH_LOCATIONS   += $(WARPX_HOME)/Source/FieldSolver
diff --git a/Source/FieldSolver/WarpXFFT.cpp b/Source/FieldSolver/WarpXFFT.cpp
new file mode 100644
index 000000000..f09290f29
--- /dev/null
+++ b/Source/FieldSolver/WarpXFFT.cpp
@@ -0,0 +1,451 @@
+
+#include <WarpX.H>
+#include <WarpX_f.H>
+#include <AMReX_iMultiFab.H>
+
+using namespace amrex;
+
+constexpr int WarpX::FFTData::N;
+
+namespace {
+static std::unique_ptr<WarpX::FFTData> nullfftdata; // This for process with nz_fft=0
+
+/** \brief Returns an "owner mask" which 1 for all cells, except
+ *  for the duplicated (physical) cells of a nodal grid.
+ *
+ *  More precisely, for these cells (which are represented on several grids)
+ *  the owner mask is 1 only if these cells are at the lower left end of
+ *  the local grid - or if these cells are at the end of the physical domain
+ *  Therefore, there for these cells, there will be only one grid for
+ *  which the owner mask is non-zero.
+ */
+static iMultiFab
+BuildFFTOwnerMask (const MultiFab& mf, const Geometry& geom)
+{
+    const BoxArray& ba = mf.boxArray();
+    const DistributionMapping& dm = mf.DistributionMap();
+    iMultiFab mask(ba, dm, 1, 0);
+    const int owner = 1;
+    const int nonowner = 0;
+    mask.setVal(owner);
+
+    const Box& domain_box = amrex::convert(geom.Domain(), ba.ixType());
+
+    AMREX_ASSERT(ba.complementIn(domain_box).isEmpty());
+
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    for (MFIter mfi(mask); mfi.isValid(); ++mfi)
+    {
+        IArrayBox& fab = mask[mfi];
+        const Box& bx = fab.box();
+        Box bx2 = bx;
+        for (int idim = 0; idim < AMREX_SPACEDIM; ++idim) {
+            // Detect nodal dimensions
+            if (bx2.type(idim) == IndexType::NODE) {
+                // Make sure that this grid does not touch the end of
+                // the physical domain.
+                if (bx2.bigEnd(idim) < domain_box.bigEnd(idim)) {
+                    bx2.growHi(idim, -1);
+                }
+            }
+        }
+        const BoxList& bl = amrex::boxDiff(bx, bx2);
+        // Set owner mask in these cells
+        for (const auto& b : bl) {
+            fab.setVal(nonowner, b, 0, 1);
+        }
+    }
+
+    return mask;
+}
+
+/** \brief Copy the data from the FFT grid to the regular grid
+ *
+ * Because, for nodal grid, some cells are duplicated on several boxes,
+ * special care has to be taken in order to have consistent values on
+ * each boxes when copying this data. Here this is done by setting a
+ * mask, where, for these duplicated cells, the mask is non-zero on only
+ * one box.
+ */
+static void
+CopyDataFromFFTToValid (MultiFab& mf, const MultiFab& mf_fft, const BoxArray& ba_valid_fft, const Geometry& geom)
+{
+    auto idx_type = mf_fft.ixType();
+    MultiFab mftmp(amrex::convert(ba_valid_fft,idx_type), mf_fft.DistributionMap(), 1, 0);
+
+    const iMultiFab& mask = BuildFFTOwnerMask(mftmp, geom);
+
+    // Local copy: whenever an MPI rank owns both the data from the FFT
+    // grid and from the regular grid, for overlapping region, copy it locally
+#ifdef _OPENMP
+#pragma omp parallel
+#endif
+    for (MFIter mfi(mftmp,true); mfi.isValid(); ++mfi)
+    {
+        const Box& bx = mfi.tilebox();
+        FArrayBox& dstfab = mftmp[mfi];
+
+        const FArrayBox& srcfab = mf_fft[mfi];
+        const Box& srcbox = srcfab.box();
+
+        if (srcbox.contains(bx))
+        {
+            // Copy the interior region (without guard cells)
+            dstfab.copy(srcfab, bx, 0, bx, 0, 1);
+            // Set the value to 0 whenever the mask is 0
+            // (i.e. for nodal duplicated cells, there is a single box
+            // for which the mask is different than 0)
+            // if mask == 0, set value to zero
+            dstfab.setValIfNot(0.0, bx, mask[mfi], 0, 1);
+        }
+    }
+
+    // Global copy: Get the remaining the data from other procs
+    // Use ParallelAdd instead of ParallelCopy, so that the value from
+    // the cell that has non-zero mask is the one which is retained.
+    mf.setVal(0.0, 0);
+    mf.ParallelAdd(mftmp);
+}
+
+}
+
+void
+WarpX::AllocLevelDataFFT (int lev)
+{
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(lev == 0, "PSATD doesn't work with mesh refinement yet");
+
+    static_assert(std::is_standard_layout<FFTData>::value, "FFTData must have standard layout");
+    static_assert(sizeof(FFTData) == sizeof(void*)*FFTData::N, "sizeof FFTData is wrong");
+
+    InitFFTComm(lev);
+
+    BoxArray ba_fp_fft;
+    DistributionMapping dm_fp_fft;
+    FFTDomainDecomposition(lev, ba_fp_fft, dm_fp_fft, ba_valid_fp_fft[lev], domain_fp_fft[lev],
+                           geom[lev].Domain());
+
+    // rho2 has one extra ghost cell, so that it's safe to deposit charge density after
+    // pushing particle.
+
+    Efield_fp_fft[lev][0].reset(new MultiFab(amrex::convert(ba_fp_fft,Ex_nodal_flag),
+                                             dm_fp_fft, 1, 0));
+    Efield_fp_fft[lev][1].reset(new MultiFab(amrex::convert(ba_fp_fft,Ey_nodal_flag),
+                                             dm_fp_fft, 1, 0));
+    Efield_fp_fft[lev][2].reset(new MultiFab(amrex::convert(ba_fp_fft,Ez_nodal_flag),
+                                             dm_fp_fft, 1, 0));
+    Bfield_fp_fft[lev][0].reset(new MultiFab(amrex::convert(ba_fp_fft,Bx_nodal_flag),
+                                             dm_fp_fft, 1, 0));
+    Bfield_fp_fft[lev][1].reset(new MultiFab(amrex::convert(ba_fp_fft,By_nodal_flag),
+                                             dm_fp_fft, 1, 0));
+    Bfield_fp_fft[lev][2].reset(new MultiFab(amrex::convert(ba_fp_fft,Bz_nodal_flag),
+                                             dm_fp_fft, 1, 0));
+    current_fp_fft[lev][0].reset(new MultiFab(amrex::convert(ba_fp_fft,jx_nodal_flag),
+                                              dm_fp_fft, 1, 0));
+    current_fp_fft[lev][1].reset(new MultiFab(amrex::convert(ba_fp_fft,jy_nodal_flag),
+                                              dm_fp_fft, 1, 0));
+    current_fp_fft[lev][2].reset(new MultiFab(amrex::convert(ba_fp_fft,jz_nodal_flag),
+                                              dm_fp_fft, 1, 0));
+    rho_fp_fft[lev].reset(new MultiFab(amrex::convert(ba_fp_fft,IntVect::TheNodeVector()),
+                                       dm_fp_fft, 2, 0));
+
+    dataptr_fp_fft[lev].reset(new LayoutData<FFTData>(ba_fp_fft, dm_fp_fft));
+
+    if (lev > 0)
+    {
+        BoxArray ba_cp_fft;
+        DistributionMapping dm_cp_fft;
+        FFTDomainDecomposition(lev, ba_cp_fft, dm_cp_fft, ba_valid_cp_fft[lev], domain_cp_fft[lev],
+                               amrex::coarsen(geom[lev].Domain(),2));
+
+        Efield_cp_fft[lev][0].reset(new MultiFab(amrex::convert(ba_cp_fft,Ex_nodal_flag),
+                                                 dm_cp_fft, 1, 0));
+        Efield_cp_fft[lev][1].reset(new MultiFab(amrex::convert(ba_cp_fft,Ey_nodal_flag),
+                                                 dm_cp_fft, 1, 0));
+        Efield_cp_fft[lev][2].reset(new MultiFab(amrex::convert(ba_cp_fft,Ez_nodal_flag),
+                                                 dm_cp_fft, 1, 0));
+        Bfield_cp_fft[lev][0].reset(new MultiFab(amrex::convert(ba_cp_fft,Bx_nodal_flag),
+                                                 dm_cp_fft, 1, 0));
+        Bfield_cp_fft[lev][1].reset(new MultiFab(amrex::convert(ba_cp_fft,By_nodal_flag),
+                                                 dm_cp_fft, 1, 0));
+        Bfield_cp_fft[lev][2].reset(new MultiFab(amrex::convert(ba_cp_fft,Bz_nodal_flag),
+                                                 dm_cp_fft, 1, 0));
+        current_cp_fft[lev][0].reset(new MultiFab(amrex::convert(ba_cp_fft,jx_nodal_flag),
+                                                  dm_cp_fft, 1, 0));
+        current_cp_fft[lev][1].reset(new MultiFab(amrex::convert(ba_cp_fft,jy_nodal_flag),
+                                                  dm_cp_fft, 1, 0));
+        current_cp_fft[lev][2].reset(new MultiFab(amrex::convert(ba_cp_fft,jz_nodal_flag),
+                                                  dm_cp_fft, 1, 0));
+        rho_cp_fft[lev].reset(new MultiFab(amrex::convert(ba_cp_fft,IntVect::TheNodeVector()),
+                                           dm_cp_fft, 2, 0));
+
+        dataptr_cp_fft[lev].reset(new LayoutData<FFTData>(ba_cp_fft, dm_cp_fft));
+    }
+
+    InitFFTDataPlan(lev);
+}
+
+/** \brief Create MPI sub-communicators for each FFT group,
+ *         and put them in PICSAR module
+ *
+ * These communicators are passed to the parallel FFTW library, in order
+ * to perform a global FFT within each FFT group.
+ */
+void
+WarpX::InitFFTComm (int lev)
+{
+    int nprocs = ParallelDescriptor::NProcs();
+    ngroups_fft = std::min(ngroups_fft, nprocs);
+
+    // # of processes in the subcommunicator
+    int np_fft = nprocs / ngroups_fft;
+    AMREX_ALWAYS_ASSERT_WITH_MESSAGE(np_fft*ngroups_fft == nprocs,
+        "Number of processes must be divisible by number of FFT groups");
+
+    int myproc = ParallelDescriptor::MyProc();
+    // my color in ngroups_fft subcommunicators.  0 <= color_fft < ngroups_fft
+    color_fft[lev] = myproc / np_fft;
+    MPI_Comm_split(ParallelDescriptor::Communicator(), color_fft[lev], myproc, &comm_fft[lev]);
+
+    int fcomm = MPI_Comm_c2f(comm_fft[lev]);
+    // Set the communicator of the PICSAR module to the one we just created
+    warpx_fft_mpi_init(fcomm);
+}
+
+/** \brief Perform domain decomposition for the FFTW
+ *
+ *  Attribute one (unique) box to each proc, in such a way that:
+ *    - The global domain is divided among FFT groups,
+ *      with additional guard cells around each FFT group
+ *    - The domain associated to an FFT group (with its guard cells)
+ *      is further divided in sub-subdomains along z, so as to distribute
+ *      it among the procs within an FFT group
+ *
+ *  The attribution is done by setting (within this function):
+ *  - ba_fft: the BoxArray representing the final set of sub-domains for the FFT
+ *            (includes/covers the guard cells of the FFT groups)
+ *  - dm_fft: the mapping between these sub-domains and the corresponding proc
+ *              (imposes one unique box for each proc)
+ *  - ba_valid: the BoxArray that contains valid part of the sub-domains of ba_fft
+ *            (i.e. does not include/cover the guard cells of the FFT groups)
+ *  - domain_fft: a Box that represent the domain of the FFT group for the current proc
+ */
+void
+WarpX::FFTDomainDecomposition (int lev, BoxArray& ba_fft, DistributionMapping& dm_fft,
+                               BoxArray& ba_valid, Box& domain_fft, const Box& domain)
+{
+
+    IntVect nguards_fft(AMREX_D_DECL(nox_fft/2,noy_fft/2,noz_fft/2));
+
+    int nprocs = ParallelDescriptor::NProcs();
+
+    BoxList bl(domain, ngroups_fft);  // This does a multi-D domain decomposition for groups
+    AMREX_ALWAYS_ASSERT(bl.size() == ngroups_fft);
+    const Vector<Box>& bldata = bl.data();
+
+    // This is the domain for the FFT sub-group (including guard cells)
+    domain_fft = amrex::grow(bldata[color_fft[lev]], nguards_fft);
+    // Ask FFTW to chop the current FFT sub-group domain in the z-direction
+    // and give a chunk to each MPI rank in the current sub-group.
+    int nz_fft, z0_fft;
+
+    warpx_fft_domain_decomp(&nz_fft, &z0_fft, WARPX_TO_FORTRAN_BOX(domain_fft));
+    // Each MPI rank adds a box with its chunk of the FFT grid
+    // (given by the above decomposition) to the list `bx_fft`,
+    // then list is shared among all MPI ranks via AllGather
+    Vector<Box> bx_fft;
+    if (nz_fft > 0) {
+        Box b = domain_fft;
+        b.setRange(AMREX_SPACEDIM-1, z0_fft+domain_fft.smallEnd(AMREX_SPACEDIM-1), nz_fft);
+        bx_fft.push_back(b);
+    } else {
+        // Add empty box for the AllGather call
+        bx_fft.push_back(Box());
+    }
+    amrex::AllGatherBoxes(bx_fft);
+    AMREX_ASSERT(bx_fft.size() == ParallelDescriptor::NProcs());
+    // Build pmap and bx_fft without the empty boxes
+    Vector<int> pmap;
+    for (int i = 0; i < bx_fft.size(); ++i) {
+        if (bx_fft[i].ok()) {
+            pmap.push_back(i);
+        }
+    }
+    bx_fft.erase(std::remove_if(bx_fft.begin(),bx_fft.end(),
+                                [](Box const& b) { return b.isEmpty(); }),
+                 bx_fft.end());
+    AMREX_ASSERT(bx_fft.size() == pmap.size());
+
+    // Define the AMReX objects for the FFT grid: BoxArray and DistributionMapping
+    ba_fft.define(BoxList(std::move(bx_fft)));
+    dm_fft.define(std::move(pmap));
+
+    // For communication between WarpX normal domain and FFT domain, we need to create a
+    // special BoxArray ba_valid
+    const Box foobox(-nguards_fft-2, -nguards_fft-2);
+
+    BoxList bl_valid; // List of boxes: will be filled by the valid part of the subdomains of ba_fft
+    bl_valid.reserve(ba_fft.size());
+    int np_fft = nprocs / ngroups_fft;
+    for (int i = 0; i < ba_fft.size(); ++i)
+    {
+        int igroup = dm_fft[i] / np_fft; // This should be consistent with InitFFTComm
+        const Box& bx = ba_fft[i] & bldata[igroup]; // Intersection with the domain of
+                                                    // the FFT group *without* guard cells
+        if (bx.ok())
+        {
+            bl_valid.push_back(bx);
+        }
+        else
+        {
+            bl_valid.push_back(foobox);
+        }
+    }
+
+    ba_valid.define(std::move(bl_valid));
+}
+
+/** /brief Set all the flags and metadata of the PICSAR FFT module.
+ *         Allocate the auxiliary arrays of `fft_data`
+ *
+ * Note: dataptr_data is a stuct containing 22 pointers to arrays
+ * 1-11: padded arrays in real space ; 12-22 arrays for the fields in Fourier space
+ */
+void
+WarpX::InitFFTDataPlan (int lev)
+{
+    auto dx_fp = CellSize(lev);
+
+    if (Efield_fp_fft[lev][0]->local_size() == 1)
+      //Only one FFT patch on this MPI
+    {
+        for (MFIter mfi(*Efield_fp_fft[lev][0]); mfi.isValid(); ++mfi)
+        {
+            warpx_fft_dataplan_init(&nox_fft, &noy_fft, &noz_fft,
+                                    (*dataptr_fp_fft[lev])[mfi].data, &FFTData::N,
+                                    dx_fp.data(), &dt[lev], &fftw_plan_measure, &WarpX::do_nodal );
+        }
+    }
+    else if (Efield_fp_fft[lev][0]->local_size() == 0)
+      // No FFT patch on this MPI rank (may happen with FFTW)
+      // Still need to call the MPI-FFT initialization routines
+    {
+	nullfftdata.reset(new FFTData());
+        warpx_fft_dataplan_init(&nox_fft, &noy_fft, &noz_fft,
+                                nullfftdata->data, &FFTData::N,
+                                dx_fp.data(), &dt[lev], &fftw_plan_measure,
+                                &WarpX::do_nodal );
+    }
+    else
+    {
+      // Multiple FFT patches on this MPI rank
+        amrex::Abort("WarpX::InitFFTDataPlan: TODO");
+    }
+
+    if (lev > 0)
+    {
+        amrex::Abort("WarpX::InitFFTDataPlan: TODO");
+    }
+}
+
+void
+WarpX::FreeFFT (int lev)
+{
+    nullfftdata.reset();
+
+    warpx_fft_nullify();
+
+    if (comm_fft[lev] != MPI_COMM_NULL) {
+        MPI_Comm_free(&comm_fft[lev]);
+    }
+    comm_fft[lev] = MPI_COMM_NULL;
+}
+
+void
+WarpX::PushPSATD (amrex::Real a_dt)
+{
+    for (int lev = 0; lev <= finest_level; ++lev) {
+        AMREX_ALWAYS_ASSERT_WITH_MESSAGE(dt[lev] == a_dt, "dt must be consistent");
+        PushPSATD(lev, a_dt);
+    }
+}
+
+void
+WarpX::PushPSATD (int lev, amrex::Real /* dt */)
+{
+    BL_PROFILE_VAR_NS("WarpXFFT::CopyDualGrid", blp_copy);
+    BL_PROFILE_VAR_NS("PICSAR::FftPushEB", blp_push_eb);
+
+    auto period_fp = geom[lev].periodicity();
+
+    BL_PROFILE_VAR_START(blp_copy);
+    Efield_fp_fft[lev][0]->ParallelCopy(*Efield_fp[lev][0], 0, 0, 1, 0, 0, period_fp);
+    Efield_fp_fft[lev][1]->ParallelCopy(*Efield_fp[lev][1], 0, 0, 1, 0, 0, period_fp);
+    Efield_fp_fft[lev][2]->ParallelCopy(*Efield_fp[lev][2], 0, 0, 1, 0, 0, period_fp);
+    Bfield_fp_fft[lev][0]->ParallelCopy(*Bfield_fp[lev][0], 0, 0, 1, 0, 0, period_fp);
+    Bfield_fp_fft[lev][1]->ParallelCopy(*Bfield_fp[lev][1], 0, 0, 1, 0, 0, period_fp);
+    Bfield_fp_fft[lev][2]->ParallelCopy(*Bfield_fp[lev][2], 0, 0, 1, 0, 0, period_fp);
+    current_fp_fft[lev][0]->ParallelCopy(*current_fp[lev][0], 0, 0, 1, 0, 0, period_fp);
+    current_fp_fft[lev][1]->ParallelCopy(*current_fp[lev][1], 0, 0, 1, 0, 0, period_fp);
+    current_fp_fft[lev][2]->ParallelCopy(*current_fp[lev][2], 0, 0, 1, 0, 0, period_fp);
+    rho_fp_fft[lev]->ParallelCopy(*rho_fp[lev], 0, 0, 2, 0, 0, period_fp);
+    BL_PROFILE_VAR_STOP(blp_copy);
+
+    BL_PROFILE_VAR_START(blp_push_eb);
+    if (Efield_fp_fft[lev][0]->local_size() == 1)
+       //Only one FFT patch on this MPI
+    {
+	for (MFIter mfi(*Efield_fp_fft[lev][0]); mfi.isValid(); ++mfi)
+	{
+            warpx_fft_push_eb(WARPX_TO_FORTRAN_ANYD((*Efield_fp_fft[lev][0])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*Efield_fp_fft[lev][1])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*Efield_fp_fft[lev][2])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*Bfield_fp_fft[lev][0])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*Bfield_fp_fft[lev][1])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*Bfield_fp_fft[lev][2])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*current_fp_fft[lev][0])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*current_fp_fft[lev][1])[mfi]),
+                              WARPX_TO_FORTRAN_ANYD((*current_fp_fft[lev][2])[mfi]),
+                              WARPX_TO_FORTRAN_N_ANYD((*rho_fp_fft[lev])[mfi],0),
+                              WARPX_TO_FORTRAN_N_ANYD((*rho_fp_fft[lev])[mfi],1));
+	}
+    }
+    else if (Efield_fp_fft[lev][0]->local_size() == 0)
+      // No FFT patch on this MPI rank
+      // Still need to call the MPI-FFT routine.
+    {
+	FArrayBox fab(Box(IntVect::TheZeroVector(), IntVect::TheUnitVector()));
+	warpx_fft_push_eb(WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab),
+			  WARPX_TO_FORTRAN_ANYD(fab));	
+    }
+    else
+      // Multiple FFT patches on this MPI rank
+    {
+	amrex::Abort("WarpX::PushPSATD: TODO");
+    }
+    BL_PROFILE_VAR_STOP(blp_push_eb);
+
+    BL_PROFILE_VAR_START(blp_copy);
+    CopyDataFromFFTToValid(*Efield_fp[lev][0], *Efield_fp_fft[lev][0], ba_valid_fp_fft[lev], geom[lev]);
+    CopyDataFromFFTToValid(*Efield_fp[lev][1], *Efield_fp_fft[lev][1], ba_valid_fp_fft[lev], geom[lev]);
+    CopyDataFromFFTToValid(*Efield_fp[lev][2], *Efield_fp_fft[lev][2], ba_valid_fp_fft[lev], geom[lev]);
+    CopyDataFromFFTToValid(*Bfield_fp[lev][0], *Bfield_fp_fft[lev][0], ba_valid_fp_fft[lev], geom[lev]);
+    CopyDataFromFFTToValid(*Bfield_fp[lev][1], *Bfield_fp_fft[lev][1], ba_valid_fp_fft[lev], geom[lev]);
+    CopyDataFromFFTToValid(*Bfield_fp[lev][2], *Bfield_fp_fft[lev][2], ba_valid_fp_fft[lev], geom[lev]);
+    BL_PROFILE_VAR_STOP(blp_copy);
+
+    if (lev > 0)
+    {
+        amrex::Abort("WarpX::PushPSATD: TODO");
+    }
+}
diff --git a/Source/FieldSolver/WarpXPushFieldsEM.cpp b/Source/FieldSolver/WarpXPushFieldsEM.cpp
new file mode 100644
index 000000000..6d588ae86
--- /dev/null
+++ b/Source/FieldSolver/WarpXPushFieldsEM.cpp
@@ -0,0 +1,428 @@
+
+#include <cmath>
+#include <limits>
+
+#include <WarpX.H>
+#include <WarpXConst.H>
+#include <WarpX_f.H>
+#include <WarpX_K.H>
+#ifdef WARPX_USE_PY
+#include <WarpX_py.H>
+#endif
+
+#ifdef BL_USE_SENSEI_INSITU
+#include <AMReX_AmrMeshInSituBridge.H>
+#endif
+
+using namespace amrex;
+
+void
+WarpX::EvolveB (Real dt)
+{
+    for (int lev = 0; lev <= finest_level; ++lev) {
+        EvolveB(lev, dt);
+    }
+}
+
+void
+WarpX::EvolveB (int lev, Real dt)
+{
+    BL_PROFILE("WarpX::EvolveB()");
+    EvolveB(lev, PatchType::fine, dt);
+    if (lev > 0)
+    {
+        EvolveB(lev, PatchType::coarse, dt);
+    }
+}
+
+void
+WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt)
+{
+    const int patch_level = (patch_type == PatchType::fine) ? lev : lev-1;
+    const std::array<Real,3>& dx = WarpX::CellSize(patch_level);
+    Real dtsdx = dt/dx[0], dtsdy = dt/dx[1], dtsdz = dt/dx[2];
+
+    MultiFab *Ex, *Ey, *Ez, *Bx, *By, *Bz;
+    if (patch_type == PatchType::fine)
+    {
+        Ex = Efield_fp[lev][0].get();
+        Ey = Efield_fp[lev][1].get();
+        Ez = Efield_fp[lev][2].get();
+        Bx = Bfield_fp[lev][0].get();
+        By = Bfield_fp[lev][1].get();
+        Bz = Bfield_fp[lev][2].get();
+    }
+    else
+    {
+        Ex = Efield_cp[lev][0].get();
+        Ey = Efield_cp[lev][1].get();
+        Ez = Efield_cp[lev][2].get();
+        Bx = Bfield_cp[lev][0].get();
+        By = Bfield_cp[lev][1].get();
+        Bz = Bfield_cp[lev][2].get();
+    }
+
+    MultiFab* cost = costs[lev].get();
+    const IntVect& rr = (lev > 0) ? refRatio(lev-1) : IntVect::TheUnitVector();
+
+    // Loop through the grids, and over the tiles within each grid
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+    for ( MFIter mfi(*Bx, TilingIfNotGPU()); mfi.isValid(); ++mfi )
+    {
+        Real wt = amrex::second();
+
+        const Box& tbx  = mfi.tilebox(Bx_nodal_flag);
+        const Box& tby  = mfi.tilebox(By_nodal_flag);
+        const Box& tbz  = mfi.tilebox(Bz_nodal_flag);
+
+        if (do_nodal) {
+            auto const& Bxfab = Bx->array(mfi);
+            auto const& Byfab = By->array(mfi);
+            auto const& Bzfab = Bz->array(mfi);
+            auto const& Exfab = Ex->array(mfi);
+            auto const& Eyfab = Ey->array(mfi);
+            auto const& Ezfab = Ez->array(mfi);
+            amrex::ParallelFor(tbx,
+            [=] AMREX_GPU_DEVICE (int j, int k, int l)
+            {
+                warpx_push_bx_nodal(j,k,l,Bxfab,Eyfab,Ezfab,dtsdy,dtsdz);
+            });
+            amrex::ParallelFor(tby,
+            [=] AMREX_GPU_DEVICE (int j, int k, int l)
+            {
+                warpx_push_by_nodal(j,k,l,Byfab,Exfab,Ezfab,dtsdx,dtsdz);
+            });
+            amrex::ParallelFor(tbz,
+            [=] AMREX_GPU_DEVICE (int j, int k, int l)
+            {
+                warpx_push_bz_nodal(j,k,l,Bzfab,Exfab,Eyfab,dtsdx,dtsdy);
+            });
+        } else {
+            // Call picsar routine for each tile
+            warpx_push_bvec(
+		      tbx.loVect(), tbx.hiVect(),
+		      tby.loVect(), tby.hiVect(),
+		      tbz.loVect(), tbz.hiVect(),
+		      BL_TO_FORTRAN_3D((*Ex)[mfi]),
+		      BL_TO_FORTRAN_3D((*Ey)[mfi]),
+		      BL_TO_FORTRAN_3D((*Ez)[mfi]),
+		      BL_TO_FORTRAN_3D((*Bx)[mfi]),
+		      BL_TO_FORTRAN_3D((*By)[mfi]),
+		      BL_TO_FORTRAN_3D((*Bz)[mfi]),
+                      &dtsdx, &dtsdy, &dtsdz,
+		      &WarpX::maxwell_fdtd_solver_id);
+        }
+
+        if (cost) {
+            Box cbx = mfi.tilebox(IntVect{AMREX_D_DECL(0,0,0)});
+            if (patch_type == PatchType::coarse) cbx.refine(rr);
+            wt = (amrex::second() - wt) / cbx.d_numPts();
+            auto costfab = cost->array(mfi);
+            amrex::ParallelFor(cbx,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k)
+            {
+                costfab(i,j,k) += wt;
+            });
+        }
+    }
+
+    if (do_pml && pml[lev]->ok())
+    {
+        const auto& pml_B = (patch_type == PatchType::fine) ? pml[lev]->GetB_fp() : pml[lev]->GetB_cp();
+        const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp();
+
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+        for ( MFIter mfi(*pml_B[0], TilingIfNotGPU()); mfi.isValid(); ++mfi )
+        {
+            const Box& tbx  = mfi.tilebox(Bx_nodal_flag);
+            const Box& tby  = mfi.tilebox(By_nodal_flag);
+            const Box& tbz  = mfi.tilebox(Bz_nodal_flag);
+
+            WRPX_PUSH_PML_BVEC(
+			     tbx.loVect(), tbx.hiVect(),
+			     tby.loVect(), tby.hiVect(),
+			     tbz.loVect(), tbz.hiVect(),
+			     BL_TO_FORTRAN_3D((*pml_E[0])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_E[1])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_E[2])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_B[0])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_B[1])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_B[2])[mfi]),
+                             &dtsdx, &dtsdy, &dtsdz,
+			     &WarpX::maxwell_fdtd_solver_id);
+        }
+    }
+}
+
+void
+WarpX::EvolveE (Real dt)
+{
+    for (int lev = 0; lev <= finest_level; ++lev)
+    {
+        EvolveE(lev, dt);
+    }
+}
+
+void
+WarpX::EvolveE (int lev, Real dt)
+{
+    BL_PROFILE("WarpX::EvolveE()");
+    EvolveE(lev, PatchType::fine, dt);
+    if (lev > 0)
+    {
+        EvolveE(lev, PatchType::coarse, dt);
+    }
+}
+
+void
+WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt)
+{
+    const Real mu_c2_dt = (PhysConst::mu0*PhysConst::c*PhysConst::c) * dt;
+    const Real c2dt = (PhysConst::c*PhysConst::c) * dt;
+
+    int patch_level = (patch_type == PatchType::fine) ? lev : lev-1;
+    const std::array<Real,3>& dx = WarpX::CellSize(patch_level);
+    Real dtsdx_c2 = c2dt/dx[0], dtsdy_c2 = c2dt/dx[1], dtsdz_c2 = c2dt/dx[2];
+
+    MultiFab *Ex, *Ey, *Ez, *Bx, *By, *Bz, *jx, *jy, *jz, *F;
+    if (patch_type == PatchType::fine)
+    {
+        Ex = Efield_fp[lev][0].get();
+        Ey = Efield_fp[lev][1].get();
+        Ez = Efield_fp[lev][2].get();
+        Bx = Bfield_fp[lev][0].get();
+        By = Bfield_fp[lev][1].get();
+        Bz = Bfield_fp[lev][2].get();
+        jx = current_fp[lev][0].get();
+        jy = current_fp[lev][1].get();
+        jz = current_fp[lev][2].get();
+        F  = F_fp[lev].get();
+    }
+    else if (patch_type == PatchType::coarse)
+    {
+        Ex = Efield_cp[lev][0].get();
+        Ey = Efield_cp[lev][1].get();
+        Ez = Efield_cp[lev][2].get();
+        Bx = Bfield_cp[lev][0].get();
+        By = Bfield_cp[lev][1].get();
+        Bz = Bfield_cp[lev][2].get();
+        jx = current_cp[lev][0].get();
+        jy = current_cp[lev][1].get();
+        jz = current_cp[lev][2].get();
+        F  = F_cp[lev].get();
+    }
+
+    MultiFab* cost = costs[lev].get();
+    const IntVect& rr = (lev > 0) ? refRatio(lev-1) : IntVect::TheUnitVector();
+
+    // Loop through the grids, and over the tiles within each grid
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+    for ( MFIter mfi(*Ex, TilingIfNotGPU()); mfi.isValid(); ++mfi )
+    {
+        Real wt = amrex::second();
+
+        const Box& tex  = mfi.tilebox(Ex_nodal_flag);
+        const Box& tey  = mfi.tilebox(Ey_nodal_flag);
+        const Box& tez  = mfi.tilebox(Ez_nodal_flag);
+
+        if (do_nodal) {
+            auto const& Exfab = Ex->array(mfi);
+            auto const& Eyfab = Ey->array(mfi);
+            auto const& Ezfab = Ez->array(mfi);
+            auto const& Bxfab = Bx->array(mfi);
+            auto const& Byfab = By->array(mfi);
+            auto const& Bzfab = Bz->array(mfi);
+            auto const& jxfab = jx->array(mfi);
+            auto const& jyfab = jy->array(mfi);
+            auto const& jzfab = jz->array(mfi);
+            amrex::ParallelFor(tex,
+            [=] AMREX_GPU_DEVICE (int j, int k, int l)
+            {
+                warpx_push_ex_nodal(j,k,l,Exfab,Byfab,Bzfab,jxfab,mu_c2_dt,dtsdy_c2,dtsdz_c2);
+            });
+            amrex::ParallelFor(tey,
+            [=] AMREX_GPU_DEVICE (int j, int k, int l)
+            {
+                warpx_push_ey_nodal(j,k,l,Eyfab,Bxfab,Bzfab,jyfab,mu_c2_dt,dtsdx_c2,dtsdz_c2);
+            });
+            amrex::ParallelFor(tez,
+            [=] AMREX_GPU_DEVICE (int j, int k, int l)
+            {
+                warpx_push_ez_nodal(j,k,l,Ezfab,Bxfab,Byfab,jzfab,mu_c2_dt,dtsdx_c2,dtsdy_c2);
+            });
+        } else {
+            // Call picsar routine for each tile
+            warpx_push_evec(
+		      tex.loVect(), tex.hiVect(),
+		      tey.loVect(), tey.hiVect(),
+		      tez.loVect(), tez.hiVect(),
+		      BL_TO_FORTRAN_3D((*Ex)[mfi]),
+		      BL_TO_FORTRAN_3D((*Ey)[mfi]),
+		      BL_TO_FORTRAN_3D((*Ez)[mfi]),
+		      BL_TO_FORTRAN_3D((*Bx)[mfi]),
+		      BL_TO_FORTRAN_3D((*By)[mfi]),
+		      BL_TO_FORTRAN_3D((*Bz)[mfi]),
+		      BL_TO_FORTRAN_3D((*jx)[mfi]),
+		      BL_TO_FORTRAN_3D((*jy)[mfi]),
+		      BL_TO_FORTRAN_3D((*jz)[mfi]),
+		      &mu_c2_dt,
+		      &dtsdx_c2, &dtsdy_c2, &dtsdz_c2);
+        }
+
+        if (F)
+        {
+            warpx_push_evec_f(
+			  tex.loVect(), tex.hiVect(),
+			  tey.loVect(), tey.hiVect(),
+			  tez.loVect(), tez.hiVect(),
+			  BL_TO_FORTRAN_3D((*Ex)[mfi]),
+			  BL_TO_FORTRAN_3D((*Ey)[mfi]),
+			  BL_TO_FORTRAN_3D((*Ez)[mfi]),
+			  BL_TO_FORTRAN_3D((*F)[mfi]),
+                          &dtsdx_c2, &dtsdy_c2, &dtsdz_c2,
+			  &WarpX::maxwell_fdtd_solver_id);
+        }
+
+        if (cost) {
+            Box cbx = mfi.tilebox(IntVect{AMREX_D_DECL(0,0,0)});
+            if (patch_type == PatchType::coarse) cbx.refine(rr);
+            wt = (amrex::second() - wt) / cbx.d_numPts();
+            auto costfab = cost->array(mfi);
+            amrex::ParallelFor(cbx,
+            [=] AMREX_GPU_DEVICE (int i, int j, int k)
+            {
+                costfab(i,j,k) += wt;
+            });
+        }
+    }
+
+    if (do_pml && pml[lev]->ok())
+    {
+        if (F) pml[lev]->ExchangeF(patch_type, F);
+
+        const auto& pml_B = (patch_type == PatchType::fine) ? pml[lev]->GetB_fp() : pml[lev]->GetB_cp();
+        const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp();
+        const auto& pml_F = (patch_type == PatchType::fine) ? pml[lev]->GetF_fp() : pml[lev]->GetF_cp();
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+        for ( MFIter mfi(*pml_E[0], TilingIfNotGPU()); mfi.isValid(); ++mfi )
+        {
+            const Box& tex  = mfi.tilebox(Ex_nodal_flag);
+            const Box& tey  = mfi.tilebox(Ey_nodal_flag);
+            const Box& tez  = mfi.tilebox(Ez_nodal_flag);
+
+            WRPX_PUSH_PML_EVEC(
+			     tex.loVect(), tex.hiVect(),
+			     tey.loVect(), tey.hiVect(),
+			     tez.loVect(), tez.hiVect(),
+			     BL_TO_FORTRAN_3D((*pml_E[0])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_E[1])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_E[2])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_B[0])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_B[1])[mfi]),
+			     BL_TO_FORTRAN_3D((*pml_B[2])[mfi]),
+                             &dtsdx_c2, &dtsdy_c2, &dtsdz_c2);
+
+            if (pml_F)
+            {
+                WRPX_PUSH_PML_EVEC_F(
+				   tex.loVect(), tex.hiVect(),
+				   tey.loVect(), tey.hiVect(),
+				   tez.loVect(), tez.hiVect(),
+				   BL_TO_FORTRAN_3D((*pml_E[0])[mfi]),
+				   BL_TO_FORTRAN_3D((*pml_E[1])[mfi]),
+				   BL_TO_FORTRAN_3D((*pml_E[2])[mfi]),
+				   BL_TO_FORTRAN_3D((*pml_F   )[mfi]),
+                                   &dtsdx_c2, &dtsdy_c2, &dtsdz_c2,
+				   &WarpX::maxwell_fdtd_solver_id);
+            }
+        }
+    }
+}
+
+void
+WarpX::EvolveF (Real dt, DtType dt_type)
+{
+    if (!do_dive_cleaning) return;
+
+    for (int lev = 0; lev <= finest_level; ++lev)
+    {
+        EvolveF(lev, dt, dt_type);
+    }
+}
+
+void
+WarpX::EvolveF (int lev, Real dt, DtType dt_type)
+{
+    if (!do_dive_cleaning) return;
+
+    EvolveF(lev, PatchType::fine, dt, dt_type);
+    if (lev > 0) EvolveF(lev, PatchType::coarse, dt, dt_type);
+}
+
+void
+WarpX::EvolveF (int lev, PatchType patch_type, Real dt, DtType dt_type)
+{
+    if (!do_dive_cleaning) return;
+
+    BL_PROFILE("WarpX::EvolveF()");
+
+    static constexpr Real mu_c2 = PhysConst::mu0*PhysConst::c*PhysConst::c;
+
+    int patch_level = (patch_type == PatchType::fine) ? lev : lev-1;
+    const auto& dx = WarpX::CellSize(patch_level);
+    const std::array<Real,3> dtsdx {dt/dx[0], dt/dx[1], dt/dx[2]};
+
+    MultiFab *Ex, *Ey, *Ez, *rho, *F;
+    if (patch_type == PatchType::fine)
+    {
+        Ex = Efield_fp[lev][0].get();
+        Ey = Efield_fp[lev][1].get();
+        Ez = Efield_fp[lev][2].get();
+        rho = rho_fp[lev].get();
+        F = F_fp[lev].get();
+    }
+    else
+    {
+        Ex = Efield_cp[lev][0].get();
+        Ey = Efield_cp[lev][1].get();
+        Ez = Efield_cp[lev][2].get();
+        rho = rho_cp[lev].get();
+        F = F_cp[lev].get();
+    }
+
+    const int rhocomp = (dt_type == DtType::FirstHalf) ? 0 : 1;
+
+    MultiFab src(rho->boxArray(), rho->DistributionMap(), 1, 0);
+    ComputeDivE(src, 0, {Ex,Ey,Ez}, dx);
+    MultiFab::Saxpy(src, -mu_c2, *rho, rhocomp, 0, 1, 0);
+    MultiFab::Saxpy(*F, dt, src, 0, 0, 1, 0);
+
+    if (do_pml && pml[lev]->ok())
+    {
+        const auto& pml_F = (patch_type == PatchType::fine) ? pml[lev]->GetF_fp() : pml[lev]->GetF_cp();
+        const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp();
+
+#ifdef _OPENMP
+#pragma omp parallel if (Gpu::notInLaunchRegion())
+#endif
+        for ( MFIter mfi(*pml_F, TilingIfNotGPU()); mfi.isValid(); ++mfi )
+        {
+            const Box& bx = mfi.tilebox();
+            WRPX_PUSH_PML_F(bx.loVect(), bx.hiVect(),
+			  BL_TO_FORTRAN_ANYD((*pml_F   )[mfi]),
+			  BL_TO_FORTRAN_ANYD((*pml_E[0])[mfi]),
+			  BL_TO_FORTRAN_ANYD((*pml_E[1])[mfi]),
+			  BL_TO_FORTRAN_ANYD((*pml_E[2])[mfi]),
+			  &dtsdx[0], &dtsdx[1], &dtsdx[2]);
+        }
+    }
+}
+
diff --git a/Source/FieldSolver/WarpX_K.H b/Source/FieldSolver/WarpX_K.H
new file mode 100644
index 000000000..2772b764d
--- /dev/null
+++ b/Source/FieldSolver/WarpX_K.H
@@ -0,0 +1,97 @@
+#ifndef WARPX_K_H_
+#define WARPX_K_H_
+
+#include <AMReX_FArrayBox.H>
+
+using namespace amrex;
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_push_bx_nodal (int j, int k, int l, Array4<Real> const& Bx,
+                          Array4<Real const> const& Ey, Array4<Real const> const& Ez,
+                          Real dtsdy, Real dtsdz)
+{
+#if (AMREX_SPACEDIM == 3)
+    Bx(j,k,l) = Bx(j,k,l) - 0.5*dtsdy * (Ez(j,k+1,l  ) - Ez(j,k-1,l  ))
+                          + 0.5*dtsdz * (Ey(j,k  ,l+1) - Ey(j,k  ,l-1));
+#else
+    Bx(j,k,0) = Bx(j,k,0) + 0.5*dtsdz * (Ey(j,k+1,0) - Ey(j,k-1,0));
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_push_by_nodal (int j, int k, int l, Array4<Real> const& By,
+                          Array4<Real const> const& Ex, Array4<Real const> const& Ez,
+                          Real dtsdx, Real dtsdz)
+{
+#if (AMREX_SPACEDIM == 3)
+    By(j,k,l) = By(j,k,l) + 0.5*dtsdx * (Ez(j+1,k,l  ) - Ez(j-1,k,l  ))
+                          - 0.5*dtsdz * (Ex(j  ,k,l+1) - Ex(j  ,k,l-1));
+#else
+    By(j,k,0) = By(j,k,0) + 0.5*dtsdx * (Ez(j+1,k  ,0) - Ez(j-1,k  ,0))
+                          - 0.5*dtsdz * (Ex(j  ,k+1,0) - Ex(j  ,k-1,0));
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_push_bz_nodal (int j, int k, int l, Array4<Real> const& Bz,
+                          Array4<Real const> const& Ex, Array4<Real const> const& Ey,
+                          Real dtsdx, Real dtsdy)
+{
+#if (AMREX_SPACEDIM == 3)
+    Bz(j,k,l) = Bz(j,k,l) - 0.5*dtsdx * (Ey(j+1,k  ,l) - Ey(j-1,k  ,l))
+                          + 0.5*dtsdy * (Ex(j  ,k+1,l) - Ex(j  ,k-1,l));
+#else
+    Bz(j,k,0) = Bz(j,k,0) - 0.5*dtsdx * (Ey(j+1,k  ,0) - Ey(j-1,k  ,0));
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_push_ex_nodal (int j, int k, int l, Array4<Real> const& Ex,
+                          Array4<Real const> const& By, Array4<Real const> const& Bz,
+                          Array4<Real const> const& jx,
+                          Real mudt, Real dtsdy, Real dtsdz)
+{
+#if (AMREX_SPACEDIM == 3)
+    Ex(j,k,l) = Ex(j,k,l) + 0.5*dtsdy * (Bz(j,k+1,l  ) - Bz(j,k-1,l  ))
+                          - 0.5*dtsdz * (By(j,k  ,l+1) - By(j,k  ,l-1))
+        - mudt  * jx(j,k,l);
+#else
+    Ex(j,k,0) = Ex(j,k,0) - 0.5*dtsdz * (By(j,k+1,0) - By(j,k-1,0))
+        - mudt  * jx(j,k,0);
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_push_ey_nodal (int j, int k, int l, Array4<Real> const& Ey,
+                          Array4<Real const> const& Bx, Array4<Real const> const& Bz,
+                          Array4<Real const> const& jy,
+                          Real mudt, Real dtsdx, Real dtsdz)
+{
+#if (AMREX_SPACEDIM == 3)
+    Ey(j,k,l) = Ey(j,k,l) - 0.5*dtsdx * (Bz(j+1,k,l  ) - Bz(j-1,k,l  ))
+                          + 0.5*dtsdz * (Bx(j  ,k,l+1) - Bx(j  ,k,l-1))
+        - mudt  * jy(j,k,l);
+#else
+    Ey(j,k,0) = Ey(j,k,0) - 0.5*dtsdx * (Bz(j+1,k  ,0) - Bz(j-1,k  ,0))
+                          + 0.5*dtsdz * (Bx(j  ,k+1,0) - Bx(j  ,k-1,0))
+        - mudt  * jy(j,k,0);
+#endif
+}
+
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+void warpx_push_ez_nodal (int j, int k, int l, Array4<Real> const& Ez,
+                          Array4<Real const> const& Bx, Array4<Real const> const& By,
+                          Array4<Real const> const& jz,
+                          Real mudt, Real dtsdx, Real dtsdy)
+{
+#if (AMREX_SPACEDIM == 3)
+    Ez(j,k,l) = Ez(j,k,l) + 0.5*dtsdx * (By(j+1,k  ,l) - By(j-1,k  ,l))
+                          - 0.5*dtsdy * (Bx(j  ,k+1,l) - Bx(j  ,k-1,l))
+        - mudt  * jz(j,k,l);
+#else
+    Ez(j,k,0) = Ez(j,k,0) + 0.5*dtsdx * (By(j+1,k,0) - By(j-1,k,0))
+        - mudt  * jz(j,k,0);
+#endif
+}
+
+#endif
diff --git a/Source/FieldSolver/WarpX_fft.F90 b/Source/FieldSolver/WarpX_fft.F90
new file mode 100644
index 000000000..35357a63f
--- /dev/null
+++ b/Source/FieldSolver/WarpX_fft.F90
@@ -0,0 +1,319 @@
+
+module warpx_fft_module
+  use amrex_error_module, only : amrex_error, amrex_abort
+  use amrex_fort_module, only : amrex_real
+  use iso_c_binding
+  implicit none
+
+  include 'fftw3-mpi.f03'
+
+  private
+  public :: warpx_fft_mpi_init, warpx_fft_domain_decomp, warpx_fft_dataplan_init, warpx_fft_nullify, &
+       warpx_fft_push_eb
+
+contains
+
+!> @brief
+!! Set the communicator of the PICSAR module to the one that is passed in argument
+  subroutine warpx_fft_mpi_init (fcomm) bind(c,name='warpx_fft_mpi_init')
+    use shared_data, only : comm, rank, nproc
+    integer, intent(in), value :: fcomm
+
+    integer :: ierr, lnproc, lrank
+
+    comm = fcomm
+
+    call mpi_comm_size(comm, lnproc, ierr)
+    nproc = lnproc
+
+    call mpi_comm_rank(comm, lrank, ierr)
+    rank = lrank
+
+#ifdef _OPENMP
+    ierr = fftw_init_threads()
+    if (ierr.eq.0) call amrex_error("fftw_init_threads failed")
+#endif
+    call fftw_mpi_init()
+#ifdef _OPENMP
+    call dfftw_init_threads(ierr)
+    if (ierr.eq.0) call amrex_error("dfftw_init_threads failed")
+#endif
+  end subroutine warpx_fft_mpi_init
+
+!> @brief
+!! Ask FFTW to do domain decomposition.
+!
+! This is always a 1d domain decomposition along z ; it is typically
+! done on the *FFT sub-groups*, not the all domain
+  subroutine warpx_fft_domain_decomp (warpx_local_nz, warpx_local_z0, global_lo, global_hi) &
+       bind(c,name='warpx_fft_domain_decomp')
+    use picsar_precision, only : idp
+    use shared_data, only : comm, &
+         nx_global, ny_global, nz_global, & ! size of global FFT
+         nx, ny, nz ! size of local subdomains
+    use mpi_fftw3, only : local_nz, local_z0, fftw_mpi_local_size_3d, alloc_local
+
+    integer, intent(out) :: warpx_local_nz, warpx_local_z0
+    integer, dimension(3), intent(in) :: global_lo, global_hi
+
+    nx_global = INT(global_hi(1)-global_lo(1)+1,idp)
+    ny_global = INT(global_hi(2)-global_lo(2)+1,idp)
+    nz_global = INT(global_hi(3)-global_lo(3)+1,idp)
+
+    alloc_local = fftw_mpi_local_size_3d( &
+         INT(nz_global,C_INTPTR_T), &
+         INT(ny_global,C_INTPTR_T), &
+         INT(nx_global,C_INTPTR_T)/2+1, &
+         comm, local_nz, local_z0)
+
+    nx = nx_global
+    ny = ny_global
+    nz = local_nz
+
+    warpx_local_nz = local_nz
+    warpx_local_z0 = local_z0
+  end subroutine warpx_fft_domain_decomp
+
+
+!> @brief
+!! Set all the flags and metadata of the PICSAR FFT module.
+!! Allocate the auxiliary arrays of `fft_data`
+!!
+!! Note: fft_data is a stuct containing 22 pointers to arrays
+!! 1-11: padded arrays in real space ; 12-22 arrays for the fields in Fourier space
+  subroutine warpx_fft_dataplan_init (nox, noy, noz, fft_data, ndata, dx_wrpx, dt_wrpx, fftw_measure, do_nodal) &
+       bind(c,name='warpx_fft_dataplan_init')
+    USE picsar_precision, only: idp
+    use shared_data, only : c_dim,  p3dfft_flag, fftw_plan_measure, &
+         fftw_with_mpi, fftw_threads_ok, fftw_hybrid, fftw_mpi_transpose, &
+         nx, ny, nz, & ! size of local subdomains
+         nkx, nky, nkz, & ! size of local ffts
+         iz_min_r, iz_max_r, iy_min_r, iy_max_r, ix_min_r, ix_max_r, & ! loop bounds
+         dx, dy, dz
+    use fields, only : nxguards, nyguards, nzguards,  & ! size of guard regions
+         ex_r, ey_r, ez_r, bx_r, by_r, bz_r, &
+         jx_r, jy_r, jz_r, rho_r, rhoold_r, &
+         exf, eyf, ezf, bxf, byf, bzf, &
+         jxf, jyf, jzf, rhof, rhooldf, &
+         l_spectral, l_staggered, norderx, nordery, norderz
+    use mpi_fftw3, only : alloc_local
+    use omp_lib, only: omp_get_max_threads
+    USE gpstd_solver, only: init_gpstd
+    USE fourier_psaotd, only: init_plans_fourier_mpi
+    use params, only : dt
+
+    integer, intent(in) :: nox, noy, noz, ndata
+    integer, intent(in) :: fftw_measure
+    integer, intent(in) :: do_nodal
+    type(c_ptr), intent(inout) :: fft_data(ndata)
+    real(c_double), intent(in) :: dx_wrpx(3), dt_wrpx
+
+    integer(idp) :: nopenmp
+    integer :: nx_padded
+    integer, dimension(3) :: shp
+    integer(kind=c_size_t) :: sz
+
+    ! No need to distinguish physical and guard cells for the global FFT;
+    ! only nx+2*nxguards counts. Thus we declare 0 guard cells for simplicity
+    nxguards = 0_idp
+    nyguards = 0_idp
+    nzguards = 0_idp
+
+    ! For the calculation of the modified [k] vectors
+    if (do_nodal == 0) then
+       l_staggered = .TRUE.
+    else
+       l_staggered = .FALSE.
+    endif
+    norderx = int(nox, idp)
+    nordery = int(noy, idp)
+    norderz = int(noz, idp)
+    ! Define parameters of FFT plans
+    c_dim = INT(AMREX_SPACEDIM,idp)   ! Dimensionality of the simulation (2d/3d)
+    fftw_with_mpi = .TRUE. ! Activate MPI FFTW
+    fftw_hybrid = .FALSE.   ! FFT per MPI subgroup (instead of global)
+    fftw_mpi_transpose = .FALSE. ! Do not transpose the data
+    fftw_plan_measure = (fftw_measure .ne. 0)
+    p3dfft_flag = .FALSE.
+    l_spectral  = .TRUE.   ! Activate spectral Solver, using FFT
+#ifdef _OPENMP
+    fftw_threads_ok = .TRUE.
+    nopenmp = OMP_GET_MAX_THREADS()
+#else
+    fftw_threads_ok = .FALSE.
+    nopenmp = 1
+#endif
+
+    ! Allocate padded arrays for MPI FFTW
+    nx_padded = 2*(nx/2 + 1)
+    shp = [nx_padded, int(ny), int(nz)]
+    sz = 2*alloc_local
+    fft_data(1) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(1), ex_r, shp)
+    fft_data(2) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(2), ey_r, shp)
+    fft_data(3) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(3), ez_r, shp)
+    fft_data(4) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(4), bx_r, shp)
+    fft_data(5) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(5), by_r, shp)
+    fft_data(6) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(6), bz_r, shp)
+    fft_data(7) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(7), jx_r, shp)
+    fft_data(8) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(8), jy_r, shp)
+    fft_data(9) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(9), jz_r, shp)
+    fft_data(10) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(10), rho_r, shp)
+    fft_data(11) = fftw_alloc_real(sz)
+    call c_f_pointer(fft_data(11), rhoold_r, shp)
+
+    ! Set array bounds when copying ex to ex_r in PICSAR
+    ix_min_r = 1; ix_max_r = nx
+    iy_min_r = 1; iy_max_r = ny
+    iz_min_r = 1; iz_max_r = nz
+    ! Allocate Fourier space fields of the same size
+    nkx = nx/2 + 1
+    nky = ny
+    nkz = nz
+    shp = [int(nkx), int(nky), int(nkz)]
+    sz = alloc_local
+    fft_data(12) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(12), exf, shp)
+    fft_data(13) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(13), eyf, shp)
+    fft_data(14) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(14), ezf, shp)
+    fft_data(15) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(15), bxf, shp)
+    fft_data(16) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(16), byf, shp)
+    fft_data(17) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(17), bzf, shp)
+    fft_data(18) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(18), jxf, shp)
+    fft_data(19) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(19), jyf, shp)
+    fft_data(20) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(20), jzf, shp)
+    fft_data(21) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(21), rhof, shp)
+    fft_data(22) = fftw_alloc_complex(sz)
+    call c_f_pointer(fft_data(22), rhooldf, shp)
+!$acc enter data create (exf,eyf,ezf,bxf,byf,bzf,jxf,jyf,jzf,rhof,rhooldf)
+    if (ndata < 22) then
+       call amrex_abort("size of fft_data is too small")
+    end if
+
+    dx = dx_wrpx(1)
+    dy = dx_wrpx(2)
+    dz = dx_wrpx(3)
+    dt = dt_wrpx
+
+    ! Initialize the matrix blocks for the PSATD solver
+    CALL init_gpstd()
+    ! Initialize the plans for fftw with MPI
+    CALL init_plans_fourier_mpi(nopenmp)
+
+  end subroutine warpx_fft_dataplan_init
+
+
+  subroutine warpx_fft_nullify () bind(c,name='warpx_fft_nullify')
+    use fields, only: ex_r, ey_r, ez_r, bx_r, by_r, bz_r, &
+         jx_r, jy_r, jz_r, rho_r, rhoold_r, &
+         exf, eyf, ezf, bxf, byf, bzf, &
+         jxf, jyf, jzf, rhof, rhooldf
+    use mpi_fftw3, only : plan_r2c_mpi, plan_c2r_mpi
+    nullify(ex_r)
+    nullify(ey_r)
+    nullify(ez_r)
+    nullify(bx_r)
+    nullify(by_r)
+    nullify(bz_r)
+    nullify(jx_r)
+    nullify(jy_r)
+    nullify(jz_r)
+    nullify(rho_r)
+    nullify(rhoold_r)
+    nullify(exf)
+    nullify(eyf)
+    nullify(ezf)
+    nullify(bxf)
+    nullify(byf)
+    nullify(bzf)
+    nullify(jxf)
+    nullify(jyf)
+    nullify(jzf)
+    nullify(rhof)
+    nullify(rhooldf)
+    call fftw_destroy_plan(plan_r2c_mpi)
+    call fftw_destroy_plan(plan_c2r_mpi)
+    call fftw_mpi_cleanup()
+  end subroutine warpx_fft_nullify
+
+
+  subroutine warpx_fft_push_eb ( &
+       ex_wrpx, exlo, exhi, &
+       ey_wrpx, eylo, eyhi, &
+       ez_wrpx, ezlo, ezhi, &
+       bx_wrpx, bxlo, bxhi, &
+       by_wrpx, bylo, byhi, &
+       bz_wrpx, bzlo, bzhi, &
+       jx_wrpx, jxlo, jxhi, &
+       jy_wrpx, jylo, jyhi, &
+       jz_wrpx, jzlo, jzhi, &
+       rhoold_wrpx, r1lo, r1hi, &
+       rho_wrpx, r2lo, r2hi) &
+       bind(c,name='warpx_fft_push_eb')
+
+    use fields, only: ex, ey, ez, bx, by, bz, jx, jy, jz
+    use shared_data, only: rhoold, rho
+    use constants, only: num
+
+    integer, dimension(3), intent(in) :: exlo, exhi, eylo, eyhi, ezlo, ezhi, bxlo, bxhi, &
+         bylo, byhi, bzlo, bzhi, jxlo, jxhi, jylo, jyhi, jzlo, jzhi, r1lo, r1hi, r2lo, r2hi
+    REAL(num), INTENT(INOUT), TARGET :: ex_wrpx(0:exhi(1)-exlo(1),0:exhi(2)-exlo(2),0:exhi(3)-exlo(3))
+    REAL(num), INTENT(INOUT), TARGET :: ey_wrpx(0:eyhi(1)-eylo(1),0:eyhi(2)-eylo(2),0:eyhi(3)-eylo(3))
+    REAL(num), INTENT(INOUT), TARGET :: ez_wrpx(0:ezhi(1)-ezlo(1),0:ezhi(2)-ezlo(2),0:ezhi(3)-ezlo(3))
+    REAL(num), INTENT(INOUT), TARGET :: bx_wrpx(0:bxhi(1)-bxlo(1),0:bxhi(2)-bxlo(2),0:bxhi(3)-bxlo(3))
+    REAL(num), INTENT(INOUT), TARGET :: by_wrpx(0:byhi(1)-bylo(1),0:byhi(2)-bylo(2),0:byhi(3)-bylo(3))
+    REAL(num), INTENT(INOUT), TARGET :: bz_wrpx(0:bzhi(1)-bzlo(1),0:bzhi(2)-bzlo(2),0:bzhi(3)-bzlo(3))
+    REAL(num), INTENT(INOUT), TARGET :: jx_wrpx(0:jxhi(1)-jxlo(1),0:jxhi(2)-jxlo(2),0:jxhi(3)-jxlo(3))
+    REAL(num), INTENT(INOUT), TARGET :: jy_wrpx(0:jyhi(1)-jylo(1),0:jyhi(2)-jylo(2),0:jyhi(3)-jylo(3))
+    REAL(num), INTENT(INOUT), TARGET :: jz_wrpx(0:jzhi(1)-jzlo(1),0:jzhi(2)-jzlo(2),0:jzhi(3)-jzlo(3))
+    REAL(num), INTENT(INOUT), TARGET :: rhoold_wrpx(0:r1hi(1)-r1lo(1),0:r1hi(2)-r1lo(2),0:r1hi(3)-r1lo(3))
+    REAL(num), INTENT(INOUT), TARGET :: rho_wrpx(0:r2hi(1)-r2lo(1),0:r2hi(2)-r2lo(2),0:r2hi(3)-r2lo(3))
+
+    ! Point the fields in the PICSAR modules to the fields provided by WarpX
+    ex => ex_wrpx
+    ey => ey_wrpx
+    ez => ez_wrpx
+    bx => bx_wrpx
+    by => by_wrpx
+    bz => bz_wrpx
+    jx => jx_wrpx
+    jy => jy_wrpx
+    jz => jz_wrpx
+    rho => rho_wrpx
+    rhoold => rhoold_wrpx
+
+    ! Call the corresponding PICSAR function
+    CALL push_psatd_ebfield()
+
+    ex => null()
+    ey => null()
+    ez => null()
+    bx => null()
+    by => null()
+    bz => null()
+    jx => null()
+    jy => null()
+    jz => null()
+    rho => null()
+    rhoold => null()
+  end subroutine warpx_fft_push_eb
+
+end module warpx_fft_module
diff --git a/Source/FieldSolver/openbc_poisson_solver.F90 b/Source/FieldSolver/openbc_poisson_solver.F90
new file mode 100644
index 000000000..c18b1db24
--- /dev/null
+++ b/Source/FieldSolver/openbc_poisson_solver.F90
@@ -0,0 +1,62 @@
+
+module warpx_openbc_module
+
+  implicit none
+
+  integer, parameter :: idecomp = 0 ! 0=xyz, 1=xy, 2=yz, 3=xz, 4=x, 5=y, 6=z.
+  integer, parameter :: igfflag = 1 ! =0 for ordinary 1/r Green function; 
+                                    ! =1 for integrated Green function
+
+  integer, save :: gb_lo(3), gb_hi(3)
+  integer, save :: lc_lo(3), lc_hi(3)
+
+contains
+
+    subroutine warpx_openbc_decompose(glo, ghi, lo, hi) bind(c,name='warpx_openbc_decompose')
+
+    use mpi
+
+    integer, intent(in) :: glo(3), ghi(3)
+    integer, intent(out) :: lo(3), hi(3)
+    integer :: myrank, mprocs, ierr, npx, npy, npz
+
+    call MPI_Comm_size(MPI_COMM_WORLD, mprocs, ierr);
+    call MPI_COMM_RANK(MPI_COMM_WORLD,myrank,ierr)
+
+    gb_lo = glo + 1  ! +1 because AMReX's domain index starts with 0
+    gb_hi = ghi + 2  ! +2 to nodalize it
+
+    call procgriddecomp(mprocs, gb_lo(1),gb_hi(1), &
+         &                      gb_lo(2),gb_hi(2), &
+         &                      gb_lo(3),gb_hi(3), &
+         &                      idecomp, npx, npy, npz)
+
+    call decompose(myrank,mprocs, gb_lo(1),gb_hi(1), &
+         &                        gb_lo(2),gb_hi(2), &
+         &                        gb_lo(3),gb_hi(3), &
+         idecomp,                 lc_lo(1), lc_hi(1), &
+         &                        lc_lo(2), lc_hi(2), &
+         &                        lc_lo(3), lc_hi(3))
+
+    lo = lc_lo - 1  ! AMReX's domain index starts with zero.
+    hi = lc_hi - 1
+
+  end subroutine warpx_openbc_decompose
+
+  subroutine warpx_openbc_potential(rho, phi, dx) &
+       bind(C, name="warpx_openbc_potential")
+
+    double precision, intent(in ) :: dx(3)
+    double precision, intent(in ) :: rho(lc_lo(1):lc_hi(1),lc_lo(2):lc_hi(2),lc_lo(3):lc_hi(3))
+    double precision, intent(out) :: phi(lc_lo(1):lc_hi(1),lc_lo(2):lc_hi(2),lc_lo(3):lc_hi(3))
+
+    integer :: ierr
+
+    call openbcpotential(rho, phi, dx(1), dx(2), dx(3), &
+         lc_lo(1), lc_hi(1), lc_lo(2), lc_hi(2), lc_lo(3), lc_hi(3), &
+         gb_lo(1), gb_hi(1), gb_lo(2), gb_hi(2), gb_lo(3), gb_hi(3), &
+         idecomp, igfflag, ierr)
+
+  end subroutine warpx_openbc_potential
+
+end module warpx_openbc_module
diff --git a/Source/FieldSolver/solve_E_nodal.F90 b/Source/FieldSolver/solve_E_nodal.F90
new file mode 100644
index 000000000..e5ac50b2a
--- /dev/null
+++ b/Source/FieldSolver/solve_E_nodal.F90
@@ -0,0 +1,73 @@
+module warpx_ES_solve_E_nodal
+
+  use iso_c_binding
+  use amrex_fort_module, only : amrex_real
+
+  implicit none
+
+contains
+
+! This routine computes the node-centered electric field given a node-centered phi.
+! The gradient is computed using 2nd-order centered differences. It assumes the 
+! Boundary conditions have already been set and that you have two rows of ghost cells
+! for phi and one row of ghost cells for Ex, Ey, and Ez.
+! Note that this routine includes the minus sign in E = - grad phi.
+!
+! Arguments:
+!     lo, hi:     The corners of the valid box over which the gradient is taken
+!     Ex, Ey, Ez: The electric field in the x, y, and z directions.
+!     dx:         The cell spacing
+!
+  subroutine warpx_compute_E_nodal_3d (lo, hi, phi, Ex, Ey, Ez, dx) &
+       bind(c,name='warpx_compute_E_nodal_3d')
+    integer(c_int),   intent(in)    :: lo(3), hi(3)
+    real(amrex_real), intent(in)    :: dx(3)
+    real(amrex_real), intent(in   ) :: phi(lo(1)-2:hi(1)+2,lo(2)-2:hi(2)+2,lo(3)-2:hi(3)+2)
+    real(amrex_real), intent(inout) :: Ex (lo(1)-1:hi(1)+1,lo(2)-1:hi(2)+1,lo(3)-1:hi(3)+1)
+    real(amrex_real), intent(inout) :: Ey (lo(1)-1:hi(1)+1,lo(2)-1:hi(2)+1,lo(3)-1:hi(3)+1)
+    real(amrex_real), intent(inout) :: Ez (lo(1)-1:hi(1)+1,lo(2)-1:hi(2)+1,lo(3)-1:hi(3)+1)
+
+    integer :: i, j, k
+    real(amrex_real) :: fac(3)
+
+    fac = 0.5d0 / dx
+
+    do k = lo(3)-1, hi(3)+1
+       do j = lo(2)-1, hi(2)+1
+          do i = lo(1)-1, hi(1)+1
+             
+             Ex(i,j,k) = fac(1) * (phi(i-1,j,k) - phi(i+1,j,k))
+             Ey(i,j,k) = fac(2) * (phi(i,j-1,k) - phi(i,j+1,k))
+             Ez(i,j,k) = fac(3) * (phi(i,j,k-1) - phi(i,j,k+1))
+
+          end do
+       end do
+    end do
+
+  end subroutine warpx_compute_E_nodal_3d
+
+  subroutine warpx_compute_E_nodal_2d (lo, hi, phi, Ex, Ey, dx) &
+       bind(c,name='warpx_compute_E_nodal_2d')
+    integer(c_int),   intent(in)    :: lo(2), hi(2)
+    real(amrex_real), intent(in)    :: dx(2)
+    real(amrex_real), intent(in   ) :: phi(lo(1)-2:hi(1)+2,lo(2)-2:hi(2)+2)
+    real(amrex_real), intent(inout) :: Ex (lo(1)-1:hi(1)+1,lo(2)-1:hi(2)+1)
+    real(amrex_real), intent(inout) :: Ey (lo(1)-1:hi(1)+1,lo(2)-1:hi(2)+1)
+
+    integer :: i, j
+    real(amrex_real) :: fac(2)
+
+    fac = 0.5d0 / dx
+
+    do j = lo(2)-1, hi(2)+1
+       do i = lo(1)-1, hi(1)+1
+             
+          Ex(i,j) = fac(1) * (phi(i-1,j) - phi(i+1,j))
+          Ey(i,j) = fac(2) * (phi(i,j-1) - phi(i,j+1))
+
+       end do
+    end do
+
+  end subroutine warpx_compute_E_nodal_2d
+
+end module warpx_ES_solve_E_nodal