10 files changed, 162 insertions, 69 deletions
diff --git a/Source/Evolve/WarpXEvolveEM.cpp b/Source/Evolve/WarpXEvolveEM.cpp
index a5d68e4f9..4f33694cd 100644
--- a/Source/Evolve/WarpXEvolveEM.cpp
+++ b/Source/Evolve/WarpXEvolveEM.cpp
@@ -84,12 +84,14 @@ WarpX::EvolveEM (int numsteps)
                             *Bfield_aux[lev][0],*Bfield_aux[lev][1],*Bfield_aux[lev][2]);
             }
             is_synchronized = false;
+
         } else {
             // Beyond one step, we have E^{n} and B^{n}.
             // Particles have p^{n-1/2} and x^{n}.
             FillBoundaryE();
             FillBoundaryB();
             UpdateAuxilaryData();
+
         }
 
         if (do_subcycling == 0 || finest_level == 0) {
@@ -283,6 +285,7 @@ WarpX::OneStep_nosub (Real cur_time)
     if (warpx_py_beforedeposition) warpx_py_beforedeposition();
 #endif
     PushParticlesandDepose(cur_time);
+
 #ifdef WARPX_USE_PY
     if (warpx_py_afterdeposition) warpx_py_afterdeposition();
 #endif
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
index 0487e5226..12718e38b 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.H
@@ -8,14 +8,18 @@
  */
 class PsatdAlgorithm : public SpectralBaseAlgorithm
 {
+
     public:
         PsatdAlgorithm(const SpectralKSpace& spectral_kspace,
                          const amrex::DistributionMapping& dm,
                          const int norder_x, const int norder_y,
-                         const int norder_z, const bool nodal,
-                         const amrex::Real dt);
-        // Redefine update equation from base class
-        virtual void pushSpectralFields(SpectralFieldData& f) const override final;
+                         const int norder_z, const bool nodal, const amrex::Real dt);
+
+        void InitializeSpectralCoefficients(const SpectralKSpace& spectral_kspace,
+                                    const amrex::DistributionMapping& dm, 
+                                    const amrex::Real dt);
+
+        void pushSpectralFields(SpectralFieldData& f) const override final;
 
     private:
         SpectralCoefficients C_coef, S_ck_coef, X1_coef, X2_coef, X3_coef;
diff --git a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp
index 37892d35a..d45b01bda 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralAlgorithms/PsatdAlgorithm.cpp
@@ -22,59 +22,8 @@ PsatdAlgorithm::PsatdAlgorithm(const SpectralKSpace& spectral_kspace,
     X2_coef = SpectralCoefficients(ba, dm, 1, 0);
     X3_coef = SpectralCoefficients(ba, dm, 1, 0);
 
-    // Fill them with the right values:
-    // Loop over boxes and allocate the corresponding coefficients
-    // for each box owned by the local MPI proc
-    for (MFIter mfi(ba, dm); mfi.isValid(); ++mfi){
-
-        const Box& bx = ba[mfi];
-
-        // Extract pointers for the k vectors
-        const Real* modified_kx = modified_kx_vec[mfi].dataPtr();
-#if (AMREX_SPACEDIM==3)
-        const Real* modified_ky = modified_ky_vec[mfi].dataPtr();
-#endif
-        const Real* modified_kz = modified_kz_vec[mfi].dataPtr();
-        // Extract arrays for the coefficients
-        Array4<Real> C = C_coef[mfi].array();
-        Array4<Real> S_ck = S_ck_coef[mfi].array();
-        Array4<Real> X1 = X1_coef[mfi].array();
-        Array4<Real> X2 = X2_coef[mfi].array();
-        Array4<Real> X3 = X3_coef[mfi].array();
-
-        // Loop over indices within one box
-        ParallelFor(bx,
-        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
-        {
-            // Calculate norm of vector
-            const Real k_norm = std::sqrt(
-                std::pow(modified_kx[i], 2) +
-#if (AMREX_SPACEDIM==3)
-                std::pow(modified_ky[j], 2) +
-                std::pow(modified_kz[k], 2));
-#else
-                std::pow(modified_kz[j], 2));
-#endif
-
-            // Calculate coefficients
-            constexpr Real c = PhysConst::c;
-            constexpr Real ep0 = PhysConst::ep0;
-            if (k_norm != 0){
-                C(i,j,k) = std::cos(c*k_norm*dt);
-                S_ck(i,j,k) = std::sin(c*k_norm*dt)/(c*k_norm);
-                X1(i,j,k) = (1. - C(i,j,k))/(ep0 * c*c * k_norm*k_norm);
-                X2(i,j,k) = (1. - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
-                X3(i,j,k) = (C(i,j,k) - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
-            } else { // Handle k_norm = 0, by using the analytical limit
-                C(i,j,k) = 1.;
-                S_ck(i,j,k) = dt;
-                X1(i,j,k) = 0.5 * dt*dt / ep0;
-                X2(i,j,k) = c*c * dt*dt / (6.*ep0);
-                X3(i,j,k) = - c*c * dt*dt / (3.*ep0);
-            }
-        });
-    }
-};
+    InitializeSpectralCoefficients(spectral_kspace, dm, dt);
+}
 
 /* Advance the E and B field in spectral space (stored in `f`)
  * over one time step */
@@ -130,13 +79,14 @@ PsatdAlgorithm::pushSpectralFields(SpectralFieldData& f) const{
 #endif
             constexpr Real c2 = PhysConst::c*PhysConst::c;
             constexpr Real inv_ep0 = 1./PhysConst::ep0;
-            constexpr Complex I = Complex{0,1};
+            const Complex I = Complex{0,1};
             const Real C = C_arr(i,j,k);
             const Real S_ck = S_ck_arr(i,j,k);
             const Real X1 = X1_arr(i,j,k);
             const Real X2 = X2_arr(i,j,k);
             const Real X3 = X3_arr(i,j,k);
 
+
             // Update E (see WarpX online documentation: theory section)
             fields(i,j,k,Idx::Ex) = C*Ex_old
                         + S_ck*(c2*I*(ky*Bz_old - kz*By_old) - inv_ep0*Jx)
@@ -160,3 +110,63 @@ PsatdAlgorithm::pushSpectralFields(SpectralFieldData& f) const{
         });
     }
 };
+
+/* Fill the coefficients C, S_ck, X1, X2, X3 (allocated in the constructor) from the
+ * modified k vectors and the time step `dt` (used as a divisor: must be nonzero) */
+void PsatdAlgorithm::InitializeSpectralCoefficients(const SpectralKSpace& spectral_kspace,
+                                    const amrex::DistributionMapping& dm,
+                                    const amrex::Real dt)
+{
+    const BoxArray& ba = spectral_kspace.spectralspace_ba;
+    // Fill the coefficients with the right values (nothing is allocated here):
+    // loop over the boxes owned by the local MPI proc
+    for (MFIter mfi(ba, dm); mfi.isValid(); ++mfi){
+        const Box& bx = ba[mfi];
+
+        // Extract pointers for the k vectors
+        const Real* modified_kx = modified_kx_vec[mfi].dataPtr();
+#if (AMREX_SPACEDIM==3)
+        const Real* modified_ky = modified_ky_vec[mfi].dataPtr();
+#endif
+        const Real* modified_kz = modified_kz_vec[mfi].dataPtr();
+        // Extract arrays for the coefficients
+        Array4<Real> C = C_coef[mfi].array();
+        Array4<Real> S_ck = S_ck_coef[mfi].array();
+        Array4<Real> X1 = X1_coef[mfi].array();
+        Array4<Real> X2 = X2_coef[mfi].array();
+        Array4<Real> X3 = X3_coef[mfi].array();
+
+        // Loop over indices within one box
+        ParallelFor(bx,
+        [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept
+        {
+            // Calculate norm of vector (in 2D, index j runs along z);
+            // plain products avoid the general-purpose std::pow(x, 2)
+            const Real k_norm = std::sqrt(
+                modified_kx[i]*modified_kx[i] +
+#if (AMREX_SPACEDIM==3)
+                modified_ky[j]*modified_ky[j] +
+                modified_kz[k]*modified_kz[k]);
+#else
+                modified_kz[j]*modified_kz[j]);
+#endif
+
+            // Calculate coefficients
+            constexpr Real c = PhysConst::c;
+            constexpr Real ep0 = PhysConst::ep0;
+            if (k_norm != 0){
+                C(i,j,k) = std::cos(c*k_norm*dt);
+                S_ck(i,j,k) = std::sin(c*k_norm*dt)/(c*k_norm);
+                X1(i,j,k) = (1. - C(i,j,k))/(ep0 * c*c * k_norm*k_norm);
+                X2(i,j,k) = (1. - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
+                X3(i,j,k) = (C(i,j,k) - S_ck(i,j,k)/dt)/(ep0 * k_norm*k_norm);
+            } else { // Handle k_norm = 0, by using the analytical limit
+                C(i,j,k) = 1.;
+                S_ck(i,j,k) = dt;
+                X1(i,j,k) = 0.5 * dt*dt / ep0;
+                X2(i,j,k) = c*c * dt*dt / (6.*ep0);
+                X3(i,j,k) = - c*c * dt*dt / (3.*ep0);
+            }
+        });
+    }
+}
diff --git a/Source/FieldSolver/SpectralSolver/SpectralFieldData.H b/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
index 8e58aa1d8..7954414b8 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralFieldData.H
@@ -25,7 +25,7 @@ class SpectralFieldData
     // (plans are only initialized for the boxes that are owned by
     // the local MPI rank)
 #ifdef AMREX_USE_GPU
-    // Add cuFFT-specific code
+    using FFTplans = amrex::LayoutData<cufftHandle>;
 #else
     using FFTplans = amrex::LayoutData<fftw_plan>;
 #endif
diff --git a/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp b/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
index 02fa2015f..a2b695568 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralFieldData.cpp
@@ -53,7 +53,38 @@ SpectralFieldData::SpectralFieldData( const BoxArray& realspace_ba,
         // the FFT plan, the valid dimensions are those of the real-space box.
         IntVect fft_size = realspace_ba[mfi].length();
 #ifdef AMREX_USE_GPU
-        // Add cuFFT-specific code
+        // Create cuFFT plans
+        // Real-to-complex and complex-to-real plans, double precision
+        // (3D or 2D, depending on AMREX_SPACEDIM). This assumes CUDA.
+        // Note that D2Z is inherently a forward transform
+        // and Z2D is inherently a backward transform
+        cufftResult result;
+#if (AMREX_SPACEDIM == 3)
+        result = cufftPlan3d( &forward_plan[mfi], fft_size[2], 
+                              fft_size[1],fft_size[0], CUFFT_D2Z);
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan3d forward failed! \n";
+        }
+
+        result = cufftPlan3d( &backward_plan[mfi], fft_size[2], 
+                              fft_size[1], fft_size[0], CUFFT_Z2D);
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan3d backward failed! \n";
+        }
+#else
+        result = cufftPlan2d( &forward_plan[mfi], fft_size[1], 
+                              fft_size[0], CUFFT_D2Z );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan2d forward failed! \n";
+        }
+
+        result = cufftPlan2d( &backward_plan[mfi], fft_size[1], 
+                               fft_size[0], CUFFT_Z2D );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " cufftplan2d backward failed! \n";
+        }
+#endif
+
 #else
         // Create FFTW plans
         forward_plan[mfi] =
@@ -86,7 +117,9 @@ SpectralFieldData::~SpectralFieldData()
     if (tmpRealField.size() > 0){
         for ( MFIter mfi(tmpRealField); mfi.isValid(); ++mfi ){
 #ifdef AMREX_USE_GPU
-            // Add cuFFT-specific code
+            // Destroy cuFFT plans
+            cufftDestroy( forward_plan[mfi] );
+            cufftDestroy( backward_plan[mfi] );
 #else
             // Destroy FFTW plans
             fftw_destroy_plan( forward_plan[mfi] );
@@ -129,14 +162,25 @@ SpectralFieldData::ForwardTransform( const MultiFab& mf,
             Array4<Real> tmp_arr = tmpRealField[mfi].array();
             ParallelFor( realspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
-                tmp_arr(i,j,k) = mf_arr(i,j,k,i_comp);
+                tmp_arr(i,j,k) = mf_arr(i,j,k,i_comp);                
             });
         }
 
         // Perform Fourier transform from `tmpRealField` to `tmpSpectralField`
 #ifdef AMREX_USE_GPU
-        // Add cuFFT-specific code ; make sure that this is done on the same
-        // GPU stream as the above copy
+        // Perform the forward FFT on GPU using cuFFT;
+        // the plan is bound to amrex::Gpu::Device::cudaStream() so that
+        // this is done on the same GPU stream as the above copy
+        cufftResult result;
+        cudaStream_t stream = amrex::Gpu::Device::cudaStream(); 
+        cufftSetStream ( forward_plan[mfi], stream);
+        result = cufftExecD2Z( forward_plan[mfi], 
+                               tmpRealField[mfi].dataPtr(), 
+                               reinterpret_cast<cuDoubleComplex*>(
+                               tmpSpectralField[mfi].dataPtr()) );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " forward transform using cufftExecD2Z failed ! \n";
+        }
 #else
         fftw_execute( forward_plan[mfi] );
 #endif
@@ -155,6 +199,7 @@ SpectralFieldData::ForwardTransform( const MultiFab& mf,
             const Complex* zshift_arr = zshift_FFTfromCell[mfi].dataPtr();
             // Loop over indices within one box
             const Box spectralspace_bx = tmpSpectralField[mfi].box();
+
             ParallelFor( spectralspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
                 Complex spectral_field_value = tmp_arr(i,j,k);
@@ -207,6 +252,7 @@ SpectralFieldData::BackwardTransform( MultiFab& mf,
             const Complex* zshift_arr = zshift_FFTtoCell[mfi].dataPtr();
             // Loop over indices within one box
             const Box spectralspace_bx = tmpSpectralField[mfi].box();
+
             ParallelFor( spectralspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
                 Complex spectral_field_value = field_arr(i,j,k,field_index);
@@ -225,8 +271,19 @@ SpectralFieldData::BackwardTransform( MultiFab& mf,
 
         // Perform Fourier transform from `tmpSpectralField` to `tmpRealField`
 #ifdef AMREX_USE_GPU
-        // Add cuFFT-specific code ; make sure that this is done on the same
+        // Perform Fast Fourier Transform on GPU using cuFFT. 
+        // make sure that this is done on the same 
         // GPU stream as the above copy
+        cufftResult result;
+        cudaStream_t stream = amrex::Gpu::Device::cudaStream(); 
+        cufftSetStream ( backward_plan[mfi], stream);
+        result = cufftExecZ2D( backward_plan[mfi], 
+                               reinterpret_cast<cuDoubleComplex*>(
+                               tmpSpectralField[mfi].dataPtr()),
+                               tmpRealField[mfi].dataPtr() );
+        if ( result != CUFFT_SUCCESS ) {
+           amrex::Print() << " Backward transform using cufftexecZ2D failed! \n";
+        }
 #else
         fftw_execute( backward_plan[mfi] );
 #endif
@@ -240,6 +297,7 @@ SpectralFieldData::BackwardTransform( MultiFab& mf,
             Array4<const Real> tmp_arr = tmpRealField[mfi].array();
             // Normalization: divide by the number of points in realspace
             const Real inv_N = 1./realspace_bx.numPts();
+
             ParallelFor( realspace_bx,
             [=] AMREX_GPU_DEVICE(int i, int j, int k) noexcept {
                 // Copy and normalize field
diff --git a/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp b/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp
index 2fe78cedd..6fe5e3939 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralKSpace.cpp
@@ -142,9 +142,14 @@ SpectralKSpace::getSpectralShiftFactor( const DistributionMapping& dm,
             case ShiftType::TransformFromCellCentered: sign = -1.; break;
             case ShiftType::TransformToCellCentered: sign = 1.;
         }
-        constexpr Complex I{0,1};
+        const Complex I{0,1};
         for (int i=0; i<k.size(); i++ ){
+#ifdef AMREX_USE_GPU
+            shift[i] = thrust::exp( I*sign*k[i]*0.5*dx[i_dim] );
+#else
             shift[i] = std::exp( I*sign*k[i]*0.5*dx[i_dim] );
+#endif
+        
         }
     }
     return shift_factor;
diff --git a/Source/FieldSolver/WarpXFFT.cpp b/Source/FieldSolver/WarpXFFT.cpp
index 1cf5460f2..13d92f6f3 100644
--- a/Source/FieldSolver/WarpXFFT.cpp
+++ b/Source/FieldSolver/WarpXFFT.cpp
@@ -56,6 +56,7 @@ BuildFFTOwnerMask (const MultiFab& mf, const Geometry& geom)
         for (const auto& b : bl) {
             fab.setVal(nonowner, b, 0, 1);
         }
+
     }
 
     return mask;
@@ -89,7 +90,7 @@ CopyDataFromFFTToValid (MultiFab& mf, const MultiFab& mf_fft, const BoxArray& ba
 
         const FArrayBox& srcfab = mf_fft[mfi];
         const Box& srcbox = srcfab.box();
-
+ 
         if (srcbox.contains(bx))
         {
             // Copy the interior region (without guard cells)
@@ -107,6 +108,8 @@ CopyDataFromFFTToValid (MultiFab& mf, const MultiFab& mf_fft, const BoxArray& ba
     // the cell that has non-zero mask is the one which is retained.
     mf.setVal(0.0, 0);
     mf.ParallelAdd(mftmp);
+
+  
 }
 
 }
@@ -407,6 +410,7 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
 
     BL_PROFILE_VAR_START(blp_push_eb);
     if (fft_hybrid_mpi_decomposition){
+#ifndef AMREX_USE_CUDA // When running on CPU: use PICSAR code
         if (Efield_fp_fft[lev][0]->local_size() == 1)
            //Only one FFT patch on this MPI
         {
@@ -447,6 +451,9 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
         {
     	amrex::Abort("WarpX::PushPSATD: TODO");
         }
+#else // AMREX_USE_CUDA is defined ; running on GPU
+        amrex::Abort("The option `psatd.fft_hybrid_mpi_decomposition` does not work on GPU.");
+#endif
     } else {
         // Not using the hybrid decomposition
         auto& solver = *spectral_solver_fp[lev];
@@ -474,6 +481,7 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
         solver.BackwardTransform(*Bfield_fp_fft[lev][0], SpectralFieldIndex::Bx);
         solver.BackwardTransform(*Bfield_fp_fft[lev][1], SpectralFieldIndex::By);
         solver.BackwardTransform(*Bfield_fp_fft[lev][2], SpectralFieldIndex::Bz);
+
     }
     BL_PROFILE_VAR_STOP(blp_push_eb);
 
@@ -490,4 +498,7 @@ WarpX::PushPSATD (int lev, amrex::Real /* dt */)
     {
         amrex::Abort("WarpX::PushPSATD: TODO");
     }
+
 }
+
+
diff --git a/Source/Initialization/WarpXInitData.cpp b/Source/Initialization/WarpXInitData.cpp
index 0f33d1a0f..d583b2b0f 100644
--- a/Source/Initialization/WarpXInitData.cpp
+++ b/Source/Initialization/WarpXInitData.cpp
@@ -322,6 +322,7 @@ WarpX::InitLevelData (int lev, Real time)
 void
 WarpX::InitLevelDataFFT (int lev, Real time)
 {
+ 
     Efield_fp_fft[lev][0]->setVal(0.0);
     Efield_fp_fft[lev][1]->setVal(0.0);
     Efield_fp_fft[lev][2]->setVal(0.0);
@@ -346,6 +347,7 @@ WarpX::InitLevelDataFFT (int lev, Real time)
         current_cp_fft[lev][2]->setVal(0.0);
         rho_cp_fft[lev]->setVal(0.0);
     }
+
 }
 
 #endif
diff --git a/Source/Parallelization/WarpXComm.cpp b/Source/Parallelization/WarpXComm.cpp
index 5c9fa144f..00dcb85d0 100644
--- a/Source/Parallelization/WarpXComm.cpp
+++ b/Source/Parallelization/WarpXComm.cpp
@@ -246,7 +246,7 @@ void
 WarpX::FillBoundaryE (int lev, PatchType patch_type)
 {
     if (patch_type == PatchType::fine)
-    {
+    {        
         if (do_pml && pml[lev]->ok())
         {
     	    pml[lev]->ExchangeE(patch_type,
diff --git a/Source/Particles/WarpXParticleContainer.cpp b/Source/Particles/WarpXParticleContainer.cpp
index 9791eee80..47d57294d 100644
--- a/Source/Particles/WarpXParticleContainer.cpp
+++ b/Source/Particles/WarpXParticleContainer.cpp
@@ -669,7 +669,7 @@ WarpXParticleContainer::DepositCharge ( WarpXParIter& pti, RealVector& wp,
       const std::array<Real,3>& cxyzmin_tile = WarpX::LowerCorner(ctilebox, lev-1);
 
 #ifdef AMREX_USE_GPU
-      data_ptr = (*crhomf)[pti].dataPtr();
+      data_ptr = (*crhomf)[pti].dataPtr(icomp);
       auto rholen = (*crhomf)[pti].length();
 #else
       tile_box = amrex::convert(ctilebox, IntVect::TheUnitVector());