3 files changed, 281 insertions, 50 deletions
diff --git a/Source/Particles/Sorting/Make.package b/Source/Particles/Sorting/Make.package
index 750d2607e..f9c708e5b 100644
--- a/Source/Particles/Sorting/Make.package
+++ b/Source/Particles/Sorting/Make.package
@@ -1,3 +1,4 @@
+CEXE_headers += SortingUtils.H
 CEXE_sources += Partition.cpp
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Sorting
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles/Sorting
diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 683dbbd04..e88af017f 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -1,5 +1,6 @@
-#include <WarpX.H>
+#include <SortingUtils.H>
 #include <PhysicalParticleContainer.H>
+#include <WarpX.H>
 #include <AMReX_Particles.H>
 
 using namespace amrex;
@@ -41,56 +42,64 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
 {
     BL_PROFILE("PPC::Evolve::partition");
 
-    std::vector<bool> inexflag;
+    // Initialize temporary arrays
+    Gpu::DeviceVector<int> inexflag;
     inexflag.resize(np);
+    Gpu::DeviceVector<long> pid;
+    pid.resize(np);
 
-    auto& aos = pti.GetArrayOfStructs();
+    // First, partition particles into the larger buffer
 
-    // We need to partition the large buffer first
+    // - Select the larger buffer
     iMultiFab const* bmasks =
         (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ?
         gather_masks : current_masks;
-    int i = 0;
-    const auto& msk = (*bmasks)[pti];
-    for (const auto& p : aos) {
-        const IntVect& iv = Index(p, lev);
-        inexflag[i++] = msk(iv);
-    }
-
-    Vector<long> pid;
-    pid.resize(np);
-    std::iota(pid.begin(), pid.end(), 0L);
-    auto sep = std::stable_partition(pid.begin(), pid.end(),
-                                     [&inexflag](long id) { return inexflag[id]; });
+    // - For each particle, find whether it is in the larger buffer,
+    //   by looking up the mask. Store the answer in `inexflag`.
+    amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev)) );
+    // - Find the indices that reorder particles so that the last particles
+    //   are in the larger buffer
+    fillWithConsecutiveIntegers( pid );
+    auto const sep = stablePartition( pid.begin(), pid.end(), inexflag );
+    // At the end of this step, `pid` contains the indices that should be used to
+    // reorder the particles, and `sep` is the position in the array that
+    // separates the particles that deposit/gather on the fine patch (first part)
+    // and the particles that deposit/gather in the buffers (last part)
+    long const n_fine = iteratorDistance(pid.begin(), sep);
+    // Number of particles on fine patch, i.e. outside of the larger buffer
+
+    // Second, among particles that are in the larger buffer, partition
+    // particles into the smaller buffer
 
     if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) {
-        nfine_current = nfine_gather = std::distance(pid.begin(), sep);
-    } else if (sep != pid.end()) {
+        // No need to do anything if the buffers have the same size
+        nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep);
+    } else if (sep == pid.end()) {
+        // No need to do anything if there are no particles in the larger buffer
+        nfine_current = nfine_gather = np;
+    } else {
         int n_buf;
         if (bmasks == gather_masks) {
-            nfine_gather = std::distance(pid.begin(), sep);
+            nfine_gather = n_fine;
             bmasks = current_masks;
             n_buf = WarpX::n_current_deposition_buffer;
         } else {
-            nfine_current = std::distance(pid.begin(), sep);
+            nfine_current = n_fine;
             bmasks = gather_masks;
             n_buf = WarpX::n_field_gather_buffer;
         }
         if (n_buf > 0)
         {
-            const auto& msk2 = (*bmasks)[pti];
-            for (auto it = sep; it != pid.end(); ++it) {
-                const long id = *it;
-                const IntVect& iv = Index(aos[id], lev);
-                inexflag[id] = msk2(iv);
-            }
+            // - For each particle in the large buffer, find whether it is in
+            // the smaller buffer, by looking up the mask. Store the answer in `inexflag`.
+            amrex::ParallelFor( np - n_fine,
+               fillBufferFlagRemainingParticles(pti, bmasks, inexflag, Geom(lev), pid, n_fine) );
+            auto const sep2 = stablePartition( sep, pid.end(), inexflag );
 
-            auto sep2 = std::stable_partition(sep, pid.end(),
-                                              [&inexflag](long id) {return inexflag[id];});
             if (bmasks == gather_masks) {
-                nfine_gather = std::distance(pid.begin(), sep2);
+                nfine_gather = iteratorDistance(pid.begin(), sep2);
             } else {
-                nfine_current = std::distance(pid.begin(), sep2);
+                nfine_current = iteratorDistance(pid.begin(), sep2);
             }
         }
     }
@@ -103,39 +112,38 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
         nfine_gather = 0;
     }
 
+    // Reorder the actual particle array, using the `pid` indices
     if (nfine_current != np || nfine_gather != np)
     {
-
+        // Temporary array for particle AoS
         ParticleVector particle_tmp;
         particle_tmp.resize(np);
 
-        for (long ip = 0; ip < np; ++ip) {
-            particle_tmp[ip] = aos[pid[ip]];
-        }
+        // Copy particle AoS
+        auto& aos = pti.GetArrayOfStructs();
+        amrex::ParallelFor( np,
+            copyAndReorder<ParticleType>( aos(), particle_tmp, pid ) );
         std::swap(aos(), particle_tmp);
 
+        // Temporary array for particle individual attributes
         RealVector tmp;
         tmp.resize(np);
 
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = wp[pid[ip]];
-        }
+        // Copy individual attributes
+        amrex::ParallelFor( np, copyAndReorder<Real>( wp, tmp, pid ) );
         std::swap(wp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uxp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uxp, tmp, pid ) );
         std::swap(uxp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uyp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uyp, tmp, pid ) );
         std::swap(uyp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uzp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uzp, tmp, pid ) );
         std::swap(uzp, tmp);
-    }
 
+        // Make sure that the temporary arrays are not destroyed before
+        // the GPU kernels finish running
+        Gpu::streamSynchronize();
+    }
+    // Make sure that the temporary arrays are not destroyed before
+    // the GPU kernels finish running
+    Gpu::streamSynchronize();
 }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
new file mode 100644
index 000000000..28a1b73b7
--- /dev/null
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -0,0 +1,222 @@
+#ifndef WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
+#define WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
+
+#include <WarpXParticleContainer.H>
+#include <AMReX_CudaContainers.H>
+#include <AMReX_Gpu.H>
+#ifdef AMREX_USE_GPU
+    #include <thrust/partition.h>
+    #include <thrust/distance.h>
+#endif
+
+/* \brief Fill the elements of the input vector with consecutive integer,
+ *        starting from 0
+ *
+ * \param[inout] v Vector of integers, to be filled by this routine
+ */
+void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector<long>& v )
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    thrust::sequence( v.begin(), v.end() );
+#else
+    // On CPU: Use std library
+    std::iota( v.begin(), v.end(), 0L );
+#endif
+}
+
+/* \brief Find the indices that would reorder the elements of `predicate`
+ * so that the elements with non-zero value precede the other elements
+ *
+ * \param[in, out] index_begin Point to the beginning of the vector which is
+ *            to be filled with these indices
+ * \param[in, out] index_begin Point to the end of the vector which is
+ *            to be filled with these indices
+ * \param[in] Vector that indicates the elements that need to be reordered first
+ */
+template< typename ForwardIterator >
+ForwardIterator stablePartition(ForwardIterator const index_begin,
+                               ForwardIterator const index_end,
+                               amrex::Gpu::DeviceVector<int> const& predicate)
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    int const* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
+    ForwardIterator const sep = thrust::stable_partition(
+        thrust::cuda::par(amrex::Cuda::The_ThrustCachedAllocator()),
+        index_begin, index_end,
+        [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; }
+    );
+#else
+    // On CPU: Use std library
+    ForwardIterator const sep = std::stable_partition(
+        index_begin, index_end,
+        [&predicate](long i) { return predicate[i]; }
+    );
+#endif
+    return sep;
+}
+
+/* \brief Return the number of elements between `first` and `last`
+ *
+ * \param[in] fist Points to a position in a vector
+ * \param[in] last Points to another position in a vector
+ * \return The number of elements between `first` and `last`
+ */
+template< typename ForwardIterator >
+int iteratorDistance(ForwardIterator const first,
+                     ForwardIterator const last)
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    return thrust::distance( first, last );
+#else
+    return std::distance( first, last );
+#endif
+}
+
+/* \brief Functor that fills the elements of the particle array `inexflag`
+ *  with the value of the spatial array `bmasks`, at the corresponding particle position.
+ *
+ * \param[in] pti Contains information on the particle positions
+ * \param[in] bmasks Spatial array, that contains a flag indicating
+ *         whether each cell is part of the gathering/deposition buffers
+ * \param[out] inexflag Vector to be filled with the value of `bmasks`
+ * \param[in] geom Geometry object, necessary to locate particles within the array `bmasks`
+ *
+ */
+class fillBufferFlag
+{
+    public:
+        fillBufferFlag( WarpXParIter const& pti, amrex::iMultiFab const* bmasks,
+                        amrex::Gpu::DeviceVector<int>& inexflag,
+                        amrex::Geometry const& geom ) {
+
+            // Extract simple structure that can be used directly on the GPU
+            m_particles = &(pti.GetArrayOfStructs()[0]);
+            m_buffer_mask = (*bmasks)[pti].array();
+            m_inexflag_ptr = inexflag.dataPtr();
+            m_domain = geom.Domain();
+            for (int idim=0; idim<AMREX_SPACEDIM; idim++) {
+                m_prob_lo[idim] = geom.ProbLo(idim);
+                m_inv_cell_size[idim] = geom.InvCellSize(idim);
+            }
+        };
+
+
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long i ) const {
+            // Select a particle
+            auto const& p = m_particles[i];
+            // Find the index of the cell where this particle is located
+            amrex::IntVect const iv = amrex::getParticleCell( p,
+                                m_prob_lo, m_inv_cell_size, m_domain );
+            // Find the value of the buffer flag in this cell and
+            // store it at the corresponding particle position in the array `inexflag`
+            m_inexflag_ptr[i] = m_buffer_mask(iv);
+        };
+
+    private:
+        amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> m_prob_lo;
+        amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> m_inv_cell_size;
+        amrex::Box m_domain;
+        int* m_inexflag_ptr;
+        WarpXParticleContainer::ParticleType const* m_particles;
+        amrex::Array4<int const> m_buffer_mask;
+};
+
+/* \brief Functor that fills the elements of the particle array `inexflag`
+ *  with the value of the spatial array `bmasks`, at the corresponding particle position.
+ *
+ * Contrary to `fillBufferFlag`, here this is done only for the particles that
+ * the last elements of `particle_indices` point to (from the element at
+ * index `start_index` in `particle_indices`, to the last element of `particle_indices`)
+ *
+ * \param[in] pti Contains information on the particle positions
+ * \param[in] bmasks Spatial array, that contains a flag indicating
+ *         whether each cell is part of the gathering/deposition buffers
+ * \param[out] inexflag Vector to be filled with the value of `bmasks`
+ * \param[in] geom Geometry object, necessary to locate particles within the array `bmasks`
+ * \param[in] start_index Index that which elements start to be modified
+ */
+class fillBufferFlagRemainingParticles
+{
+    public:
+        fillBufferFlagRemainingParticles(
+                        WarpXParIter const& pti,
+                        amrex::iMultiFab const* bmasks,
+                        amrex::Gpu::DeviceVector<int>& inexflag,
+                        amrex::Geometry const& geom,
+                        amrex::Gpu::DeviceVector<long> const& particle_indices,
+                        long const start_index ) :
+                        m_start_index(start_index) {
+
+            // Extract simple structure that can be used directly on the GPU
+            m_particles = &(pti.GetArrayOfStructs()[0]);
+            m_buffer_mask = (*bmasks)[pti].array();
+            m_inexflag_ptr = inexflag.dataPtr();
+            m_indices_ptr = particle_indices.dataPtr();
+            m_domain = geom.Domain();
+            for (int idim=0; idim<AMREX_SPACEDIM; idim++) {
+                m_prob_lo[idim] = geom.ProbLo(idim);
+                m_inv_cell_size[idim] = geom.InvCellSize(idim);
+            }
+        };
+
+
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long i ) const {
+            // Select a particle
+            auto const& p = m_particles[m_indices_ptr[i+m_start_index]];
+            // Find the index of the cell where this particle is located
+            amrex::IntVect const iv = amrex::getParticleCell( p,
+                                m_prob_lo, m_inv_cell_size, m_domain );
+            // Find the value of the buffer flag in this cell and
+            // store it at the corresponding particle position in the array `inexflag`
+            m_inexflag_ptr[m_indices_ptr[i+m_start_index]] = m_buffer_mask(iv);
+        };
+
+    private:
+        amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> m_prob_lo;
+        amrex::GpuArray<amrex::Real,AMREX_SPACEDIM> m_inv_cell_size;
+        amrex::Box m_domain;
+        int* m_inexflag_ptr;
+        WarpXParticleContainer::ParticleType const* m_particles;
+        amrex::Array4<int const> m_buffer_mask;
+        long const m_start_index;
+        long const* m_indices_ptr;
+};
+
+/* \brief Functor that copies the elements of `src` into `dst`,
+ *       while reordering them according to `indices`
+ *
+ * \param[in] src Source vector
+ * \param[out] dst Destination vector
+ * \param[in] indices Array of indices that indicate how to reorder elements
+ */
+template <typename T>
+class copyAndReorder
+{
+    public:
+        copyAndReorder(
+            amrex::Gpu::ManagedDeviceVector<T> const& src,
+            amrex::Gpu::ManagedDeviceVector<T>& dst,
+            amrex::Gpu::DeviceVector<long> const& indices ) {
+            // Extract simple structure that can be used directly on the GPU
+            m_src_ptr = src.dataPtr();
+            m_dst_ptr = dst.dataPtr();
+            m_indices_ptr = indices.dataPtr();
+        };
+
+        AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long ip ) const {
+            m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ];
+        };
+
+    private:
+        T const* m_src_ptr;
+        T* m_dst_ptr;
+        long const* m_indices_ptr;
+};
+
+#endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_