1 files changed, 29 insertions, 15 deletions
diff --git a/Source/WarpXEvolve.cpp b/Source/WarpXEvolve.cpp
index 7c1629e94..6d917e36d 100644
--- a/Source/WarpXEvolve.cpp
+++ b/Source/WarpXEvolve.cpp
@@ -155,6 +155,12 @@ WarpX::EvolveEM (int numsteps)
             mypc->Redistribute();
         }
 
+        bool to_sort = (sort_int > 0) && ((step+1) % sort_int == 0);
+        if (to_sort) {
+            amrex::Print() << "re-sorting particles \n";
+            mypc->SortParticlesByCell();
+        }
+
         amrex::Print()<< "STEP " << step+1 << " ends." << " TIME = " << cur_time
                       << " DT = " << dt[0] << "\n";
         Real walltime_end_step = amrex::second();
@@ -492,9 +498,9 @@ WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt)
 
     // Loop through the grids, and over the tiles within each grid
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-    for ( MFIter mfi(*Bx,true); mfi.isValid(); ++mfi )
+    for ( MFIter mfi(*Bx, TilingIfNotGPU()); mfi.isValid(); ++mfi )
     {
         Real wt = amrex::second();
 
@@ -519,8 +525,12 @@ WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt)
         if (cost) {
             Box cbx = mfi.tilebox(IntVect{AMREX_D_DECL(0,0,0)});
             if (patch_type == PatchType::coarse) cbx.refine(rr);
-            wt = (amrex::second() - wt) / cbx.d_numPts();
-            (*cost)[mfi].plus(wt, cbx);
+            wt = (amrex::second() - wt) / cbx.d_numPts();
+            FArrayBox* costfab = cost->fabPtr(mfi);
+            AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( cbx, work_box,
+            {
+                costfab->plus(wt, work_box);
+            });
         }
     }
 
@@ -530,9 +540,9 @@ WarpX::EvolveB (int lev, PatchType patch_type, amrex::Real dt)
         const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp();
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-        for ( MFIter mfi(*pml_B[0],true); mfi.isValid(); ++mfi )
+        for ( MFIter mfi(*pml_B[0], TilingIfNotGPU()); mfi.isValid(); ++mfi )
         {
             const Box& tbx  = mfi.tilebox(Bx_nodal_flag);
             const Box& tby  = mfi.tilebox(By_nodal_flag);
@@ -617,9 +627,9 @@ WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt)
 
     // Loop through the grids, and over the tiles within each grid
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-    for ( MFIter mfi(*Ex,true); mfi.isValid(); ++mfi )
+    for ( MFIter mfi(*Ex, TilingIfNotGPU()); mfi.isValid(); ++mfi )
     {
         Real wt = amrex::second();
 
@@ -662,7 +672,11 @@ WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt)
             Box cbx = mfi.tilebox(IntVect{AMREX_D_DECL(0,0,0)});
             if (patch_type == PatchType::coarse) cbx.refine(rr);
             wt = (amrex::second() - wt) / cbx.d_numPts();
-            (*cost)[mfi].plus(wt, cbx);
+            FArrayBox* costfab = cost->fabPtr(mfi);
+            AMREX_LAUNCH_HOST_DEVICE_LAMBDA ( cbx, work_box,
+            {
+                costfab->plus(wt, work_box);
+            });
         }
     }
 
@@ -674,9 +688,9 @@ WarpX::EvolveE (int lev, PatchType patch_type, amrex::Real dt)
         const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp();
         const auto& pml_F = (patch_type == PatchType::fine) ? pml[lev]->GetF_fp() : pml[lev]->GetF_cp();
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-        for ( MFIter mfi(*pml_E[0],true); mfi.isValid(); ++mfi )
+        for ( MFIter mfi(*pml_E[0], TilingIfNotGPU()); mfi.isValid(); ++mfi )
         {
             const Box& tex  = mfi.tilebox(Ex_nodal_flag);
             const Box& tey  = mfi.tilebox(Ey_nodal_flag);
@@ -775,9 +789,9 @@ WarpX::EvolveF (int lev, PatchType patch_type, Real dt, DtType dt_type)
         const auto& pml_E = (patch_type == PatchType::fine) ? pml[lev]->GetE_fp() : pml[lev]->GetE_cp();
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-        for ( MFIter mfi(*pml_F,true); mfi.isValid(); ++mfi )
+        for ( MFIter mfi(*pml_F, TilingIfNotGPU()); mfi.isValid(); ++mfi )
         {
             const Box& bx = mfi.tilebox();
             WRPX_PUSH_PML_F(bx.loVect(), bx.hiVect(),
@@ -821,9 +835,9 @@ WarpX::DampPML (int lev, PatchType patch_type)
                                                               : pml[lev]->GetMultiSigmaBox_cp();
 
 #ifdef _OPENMP
-#pragma omp parallel
+#pragma omp parallel if (Gpu::notInLaunchRegion())
 #endif
-        for ( MFIter mfi(*pml_E[0],true); mfi.isValid(); ++mfi )
+        for ( MFIter mfi(*pml_E[0], TilingIfNotGPU()); mfi.isValid(); ++mfi )
         {
             const Box& tex  = mfi.tilebox(Ex_nodal_flag);
             const Box& tey  = mfi.tilebox(Ey_nodal_flag);