2 files changed, 57 insertions, 31 deletions
diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 138f28105..b4f96476e 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -42,8 +42,6 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
 {
     BL_PROFILE("PPC::Evolve::partition");
 
-    auto& aos = pti.GetArrayOfStructs();
-
     // Initialize temporary arrays
     Gpu::ManagedDeviceVector<int> inexflag;
     inexflag.resize(np);
@@ -67,10 +65,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     // reorder the particles, and `sep` is the position in the array that
     // separates the particles that deposit/gather on the fine patch (first part)
     // and the particles that deposit/gather in the buffers (last part)
-    long n_fine = iteratorDistance(pid.begin(), sep); // Number of particles on fine patch
+    long n_fine = iteratorDistance(pid.begin(), sep);
+    // Number of particles on fine patch, i.e. outside of the larger buffer
 
     // Second, among particles that are in the larger buffer, partition
     // particles into the smaller buffer
+
     if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) {
         // No need to do anything if the buffers have the same size
         nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep);
@@ -95,6 +95,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
             amrex::ParallelFor( np - n_fine,
                fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) );
             auto sep2 = stablePartition( sep, pid.end(), inexflag );
+
             if (bmasks == gather_masks) {
                 nfine_gather = iteratorDistance(pid.begin(), sep2);
             } else {
@@ -111,39 +112,39 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
         nfine_gather = 0;
     }
 
+    // Reorder the actual particle array, using the `pid` indices
     if (nfine_current != np || nfine_gather != np)
     {
+        // Extract simple pointer
+        long* AMREX_RESTRICT pid_ptr = pid.dataPtr();
 
+        // Temporary array for particle AoS
         ParticleVector particle_tmp;
         particle_tmp.resize(np);
-
-        for (long ip = 0; ip < np; ++ip) {
-            particle_tmp[ip] = aos[pid[ip]];
-        }
+        ParticleType* AMREX_RESTRICT particle_tmp_ptr = particle_tmp.dataPtr();
+
+        // Copy the particle AoS
+        auto& aos = pti.GetArrayOfStructs();
+        ParticleType* AMREX_RESTRICT aos_ptr = &(aos()[0]);
+        amrex::ParallelFor( np,
+            [=] AMREX_GPU_DEVICE (long ip) {
+                particle_tmp_ptr[ip] = aos_ptr[pid_ptr[ip]];
+            }
+        );
         std::swap(aos(), particle_tmp);
 
+        // Temporary array for particle individual attributes
         RealVector tmp;
         tmp.resize(np);
 
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = wp[pid[ip]];
-        }
+        // Copy individual attributes
+        amrex::ParallelFor( np, copyAndReorder<Real>( wp, tmp, pid ) );
         std::swap(wp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uxp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uxp, tmp, pid ) );
         std::swap(uxp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uyp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uyp, tmp, pid ) );
         std::swap(uyp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uzp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uzp, tmp, pid ) );
         std::swap(uzp, tmp);
     }
-
 }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index a17fe85db..80fc4aab6 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -27,17 +27,16 @@ ForwardIterator stablePartition(ForwardIterator index_begin,
     // On GPU: Use thrust
     int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
     ForwardIterator sep = thrust::stable_partition(
-                    thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
-                    index_begin, index_end,
-                    AMREX_GPU_HOST_DEVICE
-                    [predicate_ptr](long i) { return predicate_ptr[i]; }
-                );
+        thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
+        index_begin, index_end,
+        [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; }
+    );
 #else
     // On CPU: Use std library
     ForwardIterator sep = std::stable_partition(
-                    index_begin, index_end,
-                    [&predicate](long i) { return predicate[i]; }
-                );
+        index_begin, index_end,
+        [&predicate](long i) { return predicate[i]; }
+    );
 #endif
     return sep;
 }
@@ -102,4 +101,30 @@ class fillBufferFlag
         long m_start_index;
 };
 
+// TODO: Add documentation
+template <typename T>
+class copyAndReorder
+{
+    public:
+        copyAndReorder(
+            amrex::Gpu::ManagedDeviceVector<T>& src,
+            amrex::Gpu::ManagedDeviceVector<T>& dst,
+            amrex::Gpu::ManagedDeviceVector<long>& indices ) {
+            // Extract simple structure that can be used directly on the GPU
+            m_src_ptr = src.dataPtr();
+            m_dst_ptr = dst.dataPtr();
+            m_indices_ptr = indices.dataPtr();
+        };
+
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long ip ) const {
+            m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ];
+        };
+
+    private:
+        T* m_src_ptr;
+        T* m_dst_ptr;
+        long* m_indices_ptr;
+};
+
 #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_