From a7d4ebb9cb2b7528b7b1891eccb794fb6b70cf09 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Wed, 25 Sep 2019 00:50:41 -0700
Subject: Start GPU conversion

---
 Source/Particles/Sorting/Partition.cpp | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')
diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 683dbbd04..29e187821 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -40,22 +40,25 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     RealVector& uxp, RealVector& uyp, RealVector& uzp, RealVector& wp)
 {
     BL_PROFILE("PPC::Evolve::partition");
-
-    std::vector<bool> inexflag;
+    Gpu::ManagedDeviceVector<int> inexflag;
     inexflag.resize(np);
 
-    auto& aos = pti.GetArrayOfStructs();
-
     // We need to partition the large buffer first
     iMultiFab const* bmasks =
         (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ?
         gather_masks : current_masks;
-    int i = 0;
-    const auto& msk = (*bmasks)[pti];
-    for (const auto& p : aos) {
-        const IntVect& iv = Index(p, lev);
-        inexflag[i++] = msk(iv);
-    }
+
+    // For each particle, find whether it is in the large buffer, by looking up the mask
+    const Array4<const int>& msk = (*bmasks)[pti].array();
+    auto& aos = pti.GetArrayOfStructs();
+    ParticleType * AMREX_RESTRICT pstructs = &(aos[0]);
+    int * inexflag_ptr = inexflag.dataPtr();
+    amrex::ParallelFor( pti.numParticles(),
+         [=] AMREX_GPU_DEVICE (long i) {
+             const IntVect& iv = Index(pstructs[i], lev);
+             inexflag_ptr[i] = msk(iv);
+         }
+     );
 
     Vector<long> pid;
     pid.resize(np);
-- 
cgit v1.2.3


From feac8d56e5cc84f63f6e7be7e1668113558c9e47 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Wed, 25 Sep 2019 14:53:21 -0700
Subject: Implement `iota` on GPU

---
 Source/Particles/Sorting/Make.package   |  1 +
 Source/Particles/Sorting/Partition.cpp  | 10 ++++++----
 Source/Particles/Sorting/SortingUtils.H | 18 ++++++++++++++++++
 3 files changed, 25 insertions(+), 4 deletions(-)
 create mode 100644 Source/Particles/Sorting/SortingUtils.H

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Make.package b/Source/Particles/Sorting/Make.package
index 750d2607e..f9c708e5b 100644
--- a/Source/Particles/Sorting/Make.package
+++ b/Source/Particles/Sorting/Make.package
@@ -1,3 +1,4 @@
+CEXE_headers += SortingUtils.H
 CEXE_sources += Partition.cpp
 INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Sorting
 VPATH_LOCATIONS   += $(WARPX_HOME)/Source/Particles/Sorting
diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 29e187821..49d4d25a6 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -1,5 +1,6 @@
-#include <WarpX.H>
+#include <SortingUtils.H>
 #include <PhysicalParticleContainer.H>
+#include <WarpX.H>
 #include <AMReX_Particles.H>
 
 using namespace amrex;
@@ -43,7 +44,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     Gpu::ManagedDeviceVector<int> inexflag;
     inexflag.resize(np);
 
-    // We need to partition the large buffer first
+    // Select the largest buffer first
     iMultiFab const* bmasks =
         (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ?
         gather_masks : current_masks;
@@ -60,9 +61,10 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
          }
      );
 
-    Vector<long> pid;
+    // Partition the particles according whether they are in the large buffer or not
+    Gpu::ManagedDeviceVector<long> pid;
     pid.resize(np);
-    std::iota(pid.begin(), pid.end(), 0L);
+    fillWithConsecutiveIntegers( pid );
     auto sep = std::stable_partition(pid.begin(), pid.end(),
                                      [&inexflag](long id) { return inexflag[id]; });
 
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
new file mode 100644
index 000000000..d072ebd2a
--- /dev/null
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -0,0 +1,18 @@
+#ifndef WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
+#define WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
+
+#include <AMReX_Gpu.H>
+#include <AMReX_CudaContainers.H>
+
+// TODO: Add documentation
+void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v ) {
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    thrust::sequence( v.begin(), v.end() );
+#else
+    // On CPU: Use std library
+    std::iota( v.begin(), v.end(), 0L );
+#endif
+}
+
+#endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
-- 
cgit v1.2.3


From 69b512e9fb315bd987314ec71ff463c6a90afde0 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Wed, 25 Sep 2019 13:12:23 -0700
Subject: Replaced Index function

---
 Source/Particles/Sorting/Partition.cpp  | 15 +++++++++++++--
 Source/Particles/Sorting/SortingUtils.H | 22 ++++++++++++++++++++++
 2 files changed, 35 insertions(+), 2 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 49d4d25a6..6874e2df6 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -54,9 +54,20 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     auto& aos = pti.GetArrayOfStructs();
     ParticleType * AMREX_RESTRICT pstructs = &(aos[0]);
     int * inexflag_ptr = inexflag.dataPtr();
-    amrex::ParallelFor( pti.numParticles(),
+    const Geometry& geom = Geom(lev);
+    const Real prob_lo_x = geom.ProbLo(0);
+    const Real prob_lo_y = geom.ProbLo(1);
+    const Real prob_lo_z = geom.ProbLo(2);
+    const Real inv_dx = geom.InvCellSize(0);
+    const Real inv_dy = geom.InvCellSize(1);
+    const Real inv_dz = geom.InvCellSize(2);
+    const IntVect domain_small_end = geom.Domain().smallEnd();
+    amrex::ParallelFor( np,
          [=] AMREX_GPU_DEVICE (long i) {
-             const IntVect& iv = Index(pstructs[i], lev);
+
+             const IntVect& iv = findParticleIndex(pstructs[i],
+                 prob_lo_x, prob_lo_y, prob_lo_z,
+                 inv_dx, inv_dy, inv_dz, domain_small_end );
              inexflag_ptr[i] = msk(iv);
          }
      );
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index d072ebd2a..7dec21446 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -15,4 +15,26 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v ) {
 #endif
 }
 
+// TODO: Add documentation
+template< typename PType >
+AMREX_GPU_HOST_DEVICE AMREX_INLINE
+amrex::IntVect findParticleIndex ( PType p,
+                    const amrex::Real prob_lo_x,
+                    const amrex::Real prob_lo_y,
+                    const amrex::Real prob_lo_z,
+                    const amrex::Real inv_dx,
+                    const amrex::Real inv_dy,
+                    const amrex::Real inv_dz,
+                    const amrex::IntVect domain_small_end )
+{
+    amrex::IntVect iv;
+    AMREX_D_TERM(iv[0]=static_cast<int>(floor((p.m_rdata.pos[0]-prob_lo_x)*inv_dx));,
+                 iv[1]=static_cast<int>(floor((p.m_rdata.pos[1]-prob_lo_y)*inv_dy));,
+                 iv[2]=static_cast<int>(floor((p.m_rdata.pos[2]-prob_lo_z)*inv_dz)););
+
+    iv += domain_small_end;
+
+    return iv;
+};
+
 #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
-- 
cgit v1.2.3


From afde2a0f3cca52fde99ff59b72a6630cbea4c391 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Wed, 25 Sep 2019 14:11:16 -0700
Subject: Rewrite as functor

---
 Source/Particles/Sorting/Partition.cpp  | 35 +++++---------------
 Source/Particles/Sorting/SortingUtils.H | 58 +++++++++++++++++++++++----------
 2 files changed, 50 insertions(+), 43 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 6874e2df6..7e247bd8c 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -41,40 +41,23 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     RealVector& uxp, RealVector& uyp, RealVector& uzp, RealVector& wp)
 {
     BL_PROFILE("PPC::Evolve::partition");
+
+    auto& aos = pti.GetArrayOfStructs();
     Gpu::ManagedDeviceVector<int> inexflag;
     inexflag.resize(np);
+    Gpu::ManagedDeviceVector<long> pid;
+    pid.resize(np);
 
     // Select the largest buffer first
     iMultiFab const* bmasks =
         (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ?
         gather_masks : current_masks;
 
-    // For each particle, find whether it is in the large buffer, by looking up the mask
-    const Array4<const int>& msk = (*bmasks)[pti].array();
-    auto& aos = pti.GetArrayOfStructs();
-    ParticleType * AMREX_RESTRICT pstructs = &(aos[0]);
-    int * inexflag_ptr = inexflag.dataPtr();
-    const Geometry& geom = Geom(lev);
-    const Real prob_lo_x = geom.ProbLo(0);
-    const Real prob_lo_y = geom.ProbLo(1);
-    const Real prob_lo_z = geom.ProbLo(2);
-    const Real inv_dx = geom.InvCellSize(0);
-    const Real inv_dy = geom.InvCellSize(1);
-    const Real inv_dz = geom.InvCellSize(2);
-    const IntVect domain_small_end = geom.Domain().smallEnd();
-    amrex::ParallelFor( np,
-         [=] AMREX_GPU_DEVICE (long i) {
-
-             const IntVect& iv = findParticleIndex(pstructs[i],
-                 prob_lo_x, prob_lo_y, prob_lo_z,
-                 inv_dx, inv_dy, inv_dz, domain_small_end );
-             inexflag_ptr[i] = msk(iv);
-         }
-     );
-
-    // Partition the particles according whether they are in the large buffer or not
-    Gpu::ManagedDeviceVector<long> pid;
-    pid.resize(np);
+    // For each particle, find whether it is in the large buffer,
+    // by looking up the mask. Store the answer in `inexflag`
+    amrex::ParallelFor( np, fillBufferFlag( pti, bmasks, inexflag, Geom(lev) ) );
+
+    // Partition the particles according to whether they are in the large buffer or not
     fillWithConsecutiveIntegers( pid );
     auto sep = std::stable_partition(pid.begin(), pid.end(),
                                      [&inexflag](long id) { return inexflag[id]; });
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index 7dec21446..1133ccab5 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -1,8 +1,9 @@
 #ifndef WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
 #define WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
 
-#include <AMReX_Gpu.H>
+#include <WarpXParticleContainer.H>
 #include <AMReX_CudaContainers.H>
+#include <AMReX_Gpu.H>
 
 // TODO: Add documentation
 void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v ) {
@@ -16,25 +17,48 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v ) {
 }
 
 // TODO: Add documentation
-template< typename PType >
-AMREX_GPU_HOST_DEVICE AMREX_INLINE
-amrex::IntVect findParticleIndex ( PType p,
-                    const amrex::Real prob_lo_x,
-                    const amrex::Real prob_lo_y,
-                    const amrex::Real prob_lo_z,
-                    const amrex::Real inv_dx,
-                    const amrex::Real inv_dy,
-                    const amrex::Real inv_dz,
-                    const amrex::IntVect domain_small_end )
+class fillBufferFlag
 {
-    amrex::IntVect iv;
-    AMREX_D_TERM(iv[0]=static_cast<int>(floor((p.m_rdata.pos[0]-prob_lo_x)*inv_dx));,
-                 iv[1]=static_cast<int>(floor((p.m_rdata.pos[1]-prob_lo_y)*inv_dy));,
-                 iv[2]=static_cast<int>(floor((p.m_rdata.pos[2]-prob_lo_z)*inv_dz)););
+    public:
+        fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks,
+                        amrex::Gpu::ManagedDeviceVector<int>& inexflag,
+                        const amrex::Geometry& geom ) {
+
+            // Extract simple structure that can be used directly on the GPU
+            m_particles = &(pti.GetArrayOfStructs()[0]);
+            m_buffer_mask = (*bmasks)[pti].array();
+            m_inexflag_ptr = inexflag.dataPtr();
+            m_domain_small_end = geom.Domain().smallEnd();
+            for (int idim=0; idim<AMREX_SPACEDIM; idim++) {
+                m_prob_lo[idim] = geom.ProbLo(idim);
+                m_inv_cell_size[idim] = geom.InvCellSize(idim);
+            }
+        };
+
 
-    iv += domain_small_end;
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long i ) const {
+            // Select a particle
+            auto& p = m_particles[i];
+            // Find the index of the cell where this particle is located
+            amrex::IntVect iv;
+            for (int idim=0; idim<AMREX_SPACEDIM; idim++) {
+                iv[idim] = static_cast<int>(floor(
+                    (p.m_rdata.pos[idim]-m_prob_lo[idim])*m_inv_cell_size[idim]
+                ));
+            }
+            // Find the value of the buffer flag in this cell and
+            // store it at the corresponding particle position in the array `inexflag`
+            m_inexflag_ptr[i] = m_buffer_mask(iv);
+        };
 
-    return iv;
+    private:
+        amrex::Real m_prob_lo[AMREX_SPACEDIM];
+        amrex::Real m_inv_cell_size[AMREX_SPACEDIM];
+        amrex::IntVect m_domain_small_end;
+        int* m_inexflag_ptr;
+        WarpXParticleContainer::ParticleType* m_particles;
+        amrex::Array4<const int> m_buffer_mask;
 };
 
 #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
-- 
cgit v1.2.3


From 8575478f370f3db2525a8d5444e6949776e9da29 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 26 Sep 2019 08:51:00 -0700
Subject: Implemented stable partition

---
 Source/Particles/Sorting/Partition.cpp  | 24 +++++++++++++++---------
 Source/Particles/Sorting/SortingUtils.H | 28 +++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 10 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 7e247bd8c..3ee49cafe 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -43,24 +43,30 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     BL_PROFILE("PPC::Evolve::partition");
 
     auto& aos = pti.GetArrayOfStructs();
+
+    // Initialize temporary arrays
     Gpu::ManagedDeviceVector<int> inexflag;
     inexflag.resize(np);
     Gpu::ManagedDeviceVector<long> pid;
     pid.resize(np);
 
-    // Select the largest buffer first
+    // First, partition particles in the larger buffer
+
+    // - Select the larger buffer
     iMultiFab const* bmasks =
         (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ?
         gather_masks : current_masks;
-
-    // For each particle, find whether it is in the large buffer,
-    // by looking up the mask. Store the answer in `inexflag`
-    amrex::ParallelFor( np, fillBufferFlag( pti, bmasks, inexflag, Geom(lev) ) );
-
-    // Partition the particles according to whether they are in the large buffer or not
+    // - For each particle, find whether it is in the larger buffer,
+    //   by looking up the mask. Store the answer in `inexflag`.
+    amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev)) );
+    // - Find the indices that reorder particles so that the last particles
+    //   are in the larger buffer
     fillWithConsecutiveIntegers( pid );
-    auto sep = std::stable_partition(pid.begin(), pid.end(),
-                                     [&inexflag](long id) { return inexflag[id]; });
+    auto sep = stablePartition( pid.begin(), pid.end(), inexflag );
+    // At the end of this step, `pid` contains the indices that should be used to
+    // reorder the particles, and `sep` is the position in the array that
+    // separates the particles that deposit/gather on the fine patch (first part)
+    // and the particles that deposit/gather in the buffers (last part)
 
     if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) {
         nfine_current = nfine_gather = std::distance(pid.begin(), sep);
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index 1133ccab5..ede59b53b 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -6,7 +6,8 @@
 #include <AMReX_Gpu.H>
 
 // TODO: Add documentation
-void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v ) {
+void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v )
+{
 #ifdef AMREX_USE_GPU
     // On GPU: Use thrust
     thrust::sequence( v.begin(), v.end() );
@@ -16,6 +17,31 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v ) {
 #endif
 }
 
+// TODO: Add documentation
+template< typename ForwardIterator >
+ForwardIterator stablePartition(ForwardIterator index_begin,
+                               ForwardIterator index_end,
+                               amrex::Gpu::ManagedDeviceVector<int>& predicate)
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
+    ForwardIterator sep = thrust::stable_partition(
+                    thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
+                    index_begin, index_end,
+                    AMREX_GPU_HOST_DEVICE
+                    [predicate_ptr](long i) { return predicate_ptr[i]; }
+                );
+#else
+    // On CPU: Use std library
+    ForwardIterator sep = std::stable_partition(
+                    index_begin, index_end,
+                    [&predicate](long i) { return predicate[i]; }
+                );
+#endif
+    return sep;
+}
+
 // TODO: Add documentation
 class fillBufferFlag
 {
-- 
cgit v1.2.3


From fac016f1173aacbf4500a28ef63ed683365e4d3f Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 26 Sep 2019 09:00:22 -0700
Subject: Implemented iterator distance

---
 Source/Particles/Sorting/Partition.cpp  | 16 +++++++++++-----
 Source/Particles/Sorting/SortingUtils.H | 13 +++++++++++++
 2 files changed, 24 insertions(+), 5 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 3ee49cafe..d64b9fa25 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -50,7 +50,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     Gpu::ManagedDeviceVector<long> pid;
     pid.resize(np);
 
-    // First, partition particles in the larger buffer
+    // First, partition particles into the larger buffer
 
     // - Select the larger buffer
     iMultiFab const* bmasks =
@@ -68,16 +68,22 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     // separates the particles that deposit/gather on the fine patch (first part)
     // and the particles that deposit/gather in the buffers (last part)
 
+    // Second, among particles that are in the larger buffer, partition
+    // particles into the smaller buffer
     if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) {
-        nfine_current = nfine_gather = std::distance(pid.begin(), sep);
-    } else if (sep != pid.end()) {
+        // No need to do anything if the buffers have the same size
+        nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep);
+    } else if (sep == pid.end()) {
+        // No need to do anything if there are no particles in the larger buffer
+        nfine_current = nfine_gather = np;
+    } else {
         int n_buf;
         if (bmasks == gather_masks) {
-            nfine_gather = std::distance(pid.begin(), sep);
+            nfine_gather = iteratorDistance(pid.begin(), sep);
             bmasks = current_masks;
             n_buf = WarpX::n_current_deposition_buffer;
         } else {
-            nfine_current = std::distance(pid.begin(), sep);
+            nfine_current = iteratorDistance(pid.begin(), sep);
             bmasks = gather_masks;
             n_buf = WarpX::n_field_gather_buffer;
         }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index ede59b53b..7d53a352e 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -42,6 +42,19 @@ ForwardIterator stablePartition(ForwardIterator index_begin,
     return sep;
 }
 
+// TODO: Add documentation
+template< typename ForwardIterator >
+int iteratorDistance(ForwardIterator first,
+                     ForwardIterator last)
+{
+#ifdef AMREX_USE_GPU
+    // On GPU: Use thrust
+    return thrust::distance( first, last );
+#else
+    return std::distance( first, last );
+#endif
+}
+
 // TODO: Add documentation
 class fillBufferFlag
 {
-- 
cgit v1.2.3


From a95c8f5445b7e8b1fc8808ea5bc30ba3c3d02351 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 26 Sep 2019 09:53:51 -0700
Subject: Perform partition in smaller buffer

---
 Source/Particles/Sorting/Partition.cpp  | 25 +++++++++++--------------
 Source/Particles/Sorting/SortingUtils.H |  8 +++++---
 2 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index d64b9fa25..138f28105 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -58,7 +58,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
         gather_masks : current_masks;
     // - For each particle, find whether it is in the larger buffer,
     //   by looking up the mask. Store the answer in `inexflag`.
-    amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev)) );
+    amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev), 0) );
     // - Find the indices that reorder particles so that the last particles
     //   are in the larger buffer
     fillWithConsecutiveIntegers( pid );
@@ -67,6 +67,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     // reorder the particles, and `sep` is the position in the array that
     // separates the particles that deposit/gather on the fine patch (first part)
     // and the particles that deposit/gather in the buffers (last part)
+    long n_fine = iteratorDistance(pid.begin(), sep); // Number of particles on fine patch
 
     // Second, among particles that are in the larger buffer, partition
     // particles into the smaller buffer
@@ -79,29 +80,25 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     } else {
         int n_buf;
         if (bmasks == gather_masks) {
-            nfine_gather = iteratorDistance(pid.begin(), sep);
+            nfine_gather = n_fine;
             bmasks = current_masks;
             n_buf = WarpX::n_current_deposition_buffer;
         } else {
-            nfine_current = iteratorDistance(pid.begin(), sep);
+            nfine_current = n_fine;
             bmasks = gather_masks;
             n_buf = WarpX::n_field_gather_buffer;
         }
         if (n_buf > 0)
         {
-            const auto& msk2 = (*bmasks)[pti];
-            for (auto it = sep; it != pid.end(); ++it) {
-                const long id = *it;
-                const IntVect& iv = Index(aos[id], lev);
-                inexflag[id] = msk2(iv);
-            }
-
-            auto sep2 = std::stable_partition(sep, pid.end(),
-                                              [&inexflag](long id) {return inexflag[id];});
+            // - For each particle in the large buffer, find whether it is in
+            // the smaller buffer, by looking up the mask. Store the answer in `inexflag`.
+            amrex::ParallelFor( np - n_fine,
+               fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) );
+            auto sep2 = stablePartition( sep, pid.end(), inexflag );
             if (bmasks == gather_masks) {
-                nfine_gather = std::distance(pid.begin(), sep2);
+                nfine_gather = iteratorDistance(pid.begin(), sep2);
             } else {
-                nfine_current = std::distance(pid.begin(), sep2);
+                nfine_current = iteratorDistance(pid.begin(), sep2);
             }
         }
     }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index 7d53a352e..a17fe85db 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -61,7 +61,7 @@ class fillBufferFlag
     public:
         fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks,
                         amrex::Gpu::ManagedDeviceVector<int>& inexflag,
-                        const amrex::Geometry& geom ) {
+                        const amrex::Geometry& geom, long start_index=0 ) {
 
             // Extract simple structure that can be used directly on the GPU
             m_particles = &(pti.GetArrayOfStructs()[0]);
@@ -72,13 +72,14 @@ class fillBufferFlag
                 m_prob_lo[idim] = geom.ProbLo(idim);
                 m_inv_cell_size[idim] = geom.InvCellSize(idim);
             }
+            m_start_index = start_index;
         };
 
 
         AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
         void operator()( const long i ) const {
             // Select a particle
-            auto& p = m_particles[i];
+            auto& p = m_particles[i+m_start_index];
             // Find the index of the cell where this particle is located
             amrex::IntVect iv;
             for (int idim=0; idim<AMREX_SPACEDIM; idim++) {
@@ -88,7 +89,7 @@ class fillBufferFlag
             }
             // Find the value of the buffer flag in this cell and
             // store it at the corresponding particle position in the array `inexflag`
-            m_inexflag_ptr[i] = m_buffer_mask(iv);
+            m_inexflag_ptr[i+m_start_index] = m_buffer_mask(iv);
         };
 
     private:
@@ -98,6 +99,7 @@ class fillBufferFlag
         int* m_inexflag_ptr;
         WarpXParticleContainer::ParticleType* m_particles;
         amrex::Array4<const int> m_buffer_mask;
+        long m_start_index;
 };
 
 #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
-- 
cgit v1.2.3


From 2b08e3082ba46361b943b7ce67f8d21eb835ece9 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 26 Sep 2019 10:35:48 -0700
Subject: Implement particle copy

---
 Source/Particles/Sorting/Partition.cpp  | 47 +++++++++++++++++----------------
 Source/Particles/Sorting/SortingUtils.H | 41 ++++++++++++++++++++++------
 2 files changed, 57 insertions(+), 31 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 138f28105..b4f96476e 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -42,8 +42,6 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
 {
     BL_PROFILE("PPC::Evolve::partition");
 
-    auto& aos = pti.GetArrayOfStructs();
-
     // Initialize temporary arrays
     Gpu::ManagedDeviceVector<int> inexflag;
     inexflag.resize(np);
@@ -67,10 +65,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     // reorder the particles, and `sep` is the position in the array that
     // separates the particles that deposit/gather on the fine patch (first part)
     // and the particles that deposit/gather in the buffers (last part)
-    long n_fine = iteratorDistance(pid.begin(), sep); // Number of particles on fine patch
+    long n_fine = iteratorDistance(pid.begin(), sep);
+    // Number of particles on fine patch, i.e. outside of the larger buffer
 
     // Second, among particles that are in the larger buffer, partition
     // particles into the smaller buffer
+
     if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) {
         // No need to do anything if the buffers have the same size
         nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep);
@@ -95,6 +95,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
             amrex::ParallelFor( np - n_fine,
                fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) );
             auto sep2 = stablePartition( sep, pid.end(), inexflag );
+
             if (bmasks == gather_masks) {
                 nfine_gather = iteratorDistance(pid.begin(), sep2);
             } else {
@@ -111,39 +112,39 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
         nfine_gather = 0;
     }
 
+    // Reorder the actual particle array, using the `pid` indices
     if (nfine_current != np || nfine_gather != np)
     {
+        // Extract simple pointer
+        long* AMREX_RESTRICT pid_ptr = pid.dataPtr();
 
+        // Temporary array for particle AoS
         ParticleVector particle_tmp;
         particle_tmp.resize(np);
-
-        for (long ip = 0; ip < np; ++ip) {
-            particle_tmp[ip] = aos[pid[ip]];
-        }
+        ParticleType* AMREX_RESTRICT particle_tmp_ptr = particle_tmp.dataPtr();
+
+        // Copy the particle AoS
+        auto& aos = pti.GetArrayOfStructs();
+        ParticleType* AMREX_RESTRICT aos_ptr = &(aos()[0]);
+        amrex::ParallelFor( np,
+            [=] AMREX_GPU_DEVICE (long ip) {
+                particle_tmp_ptr[ip] = aos_ptr[pid_ptr[ip]];
+            }
+        );
         std::swap(aos(), particle_tmp);
 
+        // Temporary array for particle individual attributes
         RealVector tmp;
         tmp.resize(np);
 
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = wp[pid[ip]];
-        }
+        // Copy individual attributes
+        amrex::ParallelFor( np, copyAndReorder<Real>( wp, tmp, pid ) );
         std::swap(wp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uxp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uxp, tmp, pid ) );
         std::swap(uxp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uyp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uyp, tmp, pid ) );
         std::swap(uyp, tmp);
-
-        for (long ip = 0; ip < np; ++ip) {
-            tmp[ip] = uzp[pid[ip]];
-        }
+        amrex::ParallelFor( np, copyAndReorder<Real>( uzp, tmp, pid ) );
         std::swap(uzp, tmp);
     }
-
 }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index a17fe85db..80fc4aab6 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -27,17 +27,16 @@ ForwardIterator stablePartition(ForwardIterator index_begin,
     // On GPU: Use thrust
     int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
     ForwardIterator sep = thrust::stable_partition(
-                    thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
-                    index_begin, index_end,
-                    AMREX_GPU_HOST_DEVICE
-                    [predicate_ptr](long i) { return predicate_ptr[i]; }
-                );
+        thrust::cuda::par(Cuda::The_ThrustCachedAllocator()),
+        index_begin, index_end,
+        [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; }
+    );
 #else
     // On CPU: Use std library
     ForwardIterator sep = std::stable_partition(
-                    index_begin, index_end,
-                    [&predicate](long i) { return predicate[i]; }
-                );
+        index_begin, index_end,
+        [&predicate](long i) { return predicate[i]; }
+    );
 #endif
     return sep;
 }
@@ -102,4 +101,30 @@ class fillBufferFlag
         long m_start_index;
 };
 
+// TODO: Add documentation
+template <typename T>
+class copyAndReorder
+{
+    public:
+        copyAndReorder(
+            amrex::Gpu::ManagedDeviceVector<T>& src,
+            amrex::Gpu::ManagedDeviceVector<T>& dst,
+            amrex::Gpu::ManagedDeviceVector<long>& indices ) {
+            // Extract simple structure that can be used directly on the GPU
+            m_src_ptr = src.dataPtr();
+            m_dst_ptr = dst.dataPtr();
+            m_indices_ptr = indices.dataPtr();
+        };
+
+        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        void operator()( const long ip ) const {
+            m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ];
+        };
+
+    private:
+        T* m_src_ptr;
+        T* m_dst_ptr;
+        long* m_indices_ptr;
+};
+
 #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
-- 
cgit v1.2.3


From c1823aacc5a957181cc3ad075c8718c3666d8406 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Thu, 26 Sep 2019 10:48:43 -0700
Subject: Use templated functor for copy of particles

---
 Source/Particles/Sorting/Partition.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index b4f96476e..2b5d661e4 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -115,22 +115,14 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     // Reorder the actual particle array, using the `pid` indices
     if (nfine_current != np || nfine_gather != np)
     {
-        // Extract simple pointer
-        long* AMREX_RESTRICT pid_ptr = pid.dataPtr();
-
         // Temporary array for particle AoS
         ParticleVector particle_tmp;
         particle_tmp.resize(np);
-        ParticleType* AMREX_RESTRICT particle_tmp_ptr = particle_tmp.dataPtr();
 
-        // Copy the particle AoS
+        // Copy particle AoS
         auto& aos = pti.GetArrayOfStructs();
-        ParticleType* AMREX_RESTRICT aos_ptr = &(aos()[0]);
         amrex::ParallelFor( np,
-            [=] AMREX_GPU_DEVICE (long ip) {
-                particle_tmp_ptr[ip] = aos_ptr[pid_ptr[ip]];
-            }
-        );
+            copyAndReorder<ParticleType>( aos(), particle_tmp, pid ) );
         std::swap(aos(), particle_tmp);
 
         // Temporary array for particle individual attributes
-- 
cgit v1.2.3


From bc7f60f78ecc82a4af1d94d4687e5c7a1b8a9ab8 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Mon, 30 Sep 2019 09:11:29 -0700
Subject: More optimization for GPU

---
 Source/Particles/Sorting/Partition.cpp  | 11 +++++++++--
 Source/Particles/Sorting/SortingUtils.H | 10 +++++-----
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index 2b5d661e4..badd35f02 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -43,9 +43,9 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     BL_PROFILE("PPC::Evolve::partition");
 
     // Initialize temporary arrays
-    Gpu::ManagedDeviceVector<int> inexflag;
+    Gpu::DeviceVector<int> inexflag;
     inexflag.resize(np);
-    Gpu::ManagedDeviceVector<long> pid;
+    Gpu::DeviceVector<long> pid;
     pid.resize(np);
 
     // First, partition particles into the larger buffer
@@ -138,5 +138,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
         std::swap(uyp, tmp);
         amrex::ParallelFor( np, copyAndReorder<Real>( uzp, tmp, pid ) );
         std::swap(uzp, tmp);
+
+        // Make sure that the temporary arrays are not destroyed before
+        // the GPU kernels finish running
+        Gpu::streamSynchronize();
     }
+    // Make sure that the temporary arrays are not destroyed before
+    // the GPU kernels finish running
+    Gpu::streamSynchronize();
 }
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index 76c143b09..1aaaf73a8 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -6,7 +6,7 @@
 #include <AMReX_Gpu.H>
 
 // TODO: Add documentation
-void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v )
+void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector<long>& v )
 {
 #ifdef AMREX_USE_GPU
     // On GPU: Use thrust
@@ -21,7 +21,7 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector<long>& v )
 template< typename ForwardIterator >
 ForwardIterator stablePartition(ForwardIterator index_begin,
                                ForwardIterator index_end,
-                               amrex::Gpu::ManagedDeviceVector<int>& predicate)
+                               amrex::Gpu::DeviceVector<int>& predicate)
 {
 #ifdef AMREX_USE_GPU
     // On GPU: Use thrust
@@ -59,7 +59,7 @@ class fillBufferFlag
 {
     public:
         fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks,
-                        amrex::Gpu::ManagedDeviceVector<int>& inexflag,
+                        amrex::Gpu::DeviceVector<int>& inexflag,
                         const amrex::Geometry& geom, long start_index=0 ) {
 
             // Extract simple structure that can be used directly on the GPU
@@ -109,14 +109,14 @@ class copyAndReorder
         copyAndReorder(
             amrex::Gpu::ManagedDeviceVector<T>& src,
             amrex::Gpu::ManagedDeviceVector<T>& dst,
-            amrex::Gpu::ManagedDeviceVector<long>& indices ) {
+            amrex::Gpu::DeviceVector<long>& indices ) {
             // Extract simple structure that can be used directly on the GPU
             m_src_ptr = src.dataPtr();
             m_dst_ptr = dst.dataPtr();
             m_indices_ptr = indices.dataPtr();
         };
 
-        AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE
+        AMREX_GPU_DEVICE AMREX_FORCE_INLINE
         void operator()( const long ip ) const {
             m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ];
         };
-- 
cgit v1.2.3


From 0f32e809c513ea122361e018a04374a599f1e571 Mon Sep 17 00:00:00 2001
From: Remi Lehe <remi.lehe@normalesup.org>
Date: Tue, 1 Oct 2019 20:21:34 -0700
Subject: Edit const correctness

---
 Source/Particles/Sorting/Partition.cpp  |  6 ++--
 Source/Particles/Sorting/SortingUtils.H | 50 ++++++++++++++++++---------------
 2 files changed, 30 insertions(+), 26 deletions(-)

(limited to 'Source/Particles/Sorting/Partition.cpp')

diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp
index badd35f02..d1455249a 100644
--- a/Source/Particles/Sorting/Partition.cpp
+++ b/Source/Particles/Sorting/Partition.cpp
@@ -60,12 +60,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
     // - Find the indices that reorder particles so that the last particles
     //   are in the larger buffer
     fillWithConsecutiveIntegers( pid );
-    auto sep = stablePartition( pid.begin(), pid.end(), inexflag );
+    auto const sep = stablePartition( pid.begin(), pid.end(), inexflag );
     // At the end of this step, `pid` contains the indices that should be used to
     // reorder the particles, and `sep` is the position in the array that
     // separates the particles that deposit/gather on the fine patch (first part)
     // and the particles that deposit/gather in the buffers (last part)
-    long n_fine = iteratorDistance(pid.begin(), sep);
+    long const n_fine = iteratorDistance(pid.begin(), sep);
     // Number of particles on fine patch, i.e. outside of the larger buffer
 
     // Second, among particles that are in the larger buffer, partition
@@ -94,7 +94,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers(
             // the smaller buffer, by looking up the mask. Store the answer in `inexflag`.
             amrex::ParallelFor( np - n_fine,
                fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) );
-            auto sep2 = stablePartition( sep, pid.end(), inexflag );
+            auto const sep2 = stablePartition( sep, pid.end(), inexflag );
 
             if (bmasks == gather_masks) {
                 nfine_gather = iteratorDistance(pid.begin(), sep2);
diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H
index 37788668e..36a33c1cd 100644
--- a/Source/Particles/Sorting/SortingUtils.H
+++ b/Source/Particles/Sorting/SortingUtils.H
@@ -4,6 +4,10 @@
 #include <WarpXParticleContainer.H>
 #include <AMReX_CudaContainers.H>
 #include <AMReX_Gpu.H>
+#ifdef AMREX_USE_GPU
+    #include <thrust/partition.h>
+    #include <thrust/distance.h>
+#endif
 
 /* \brief Fill the elements of the input vector with consecutive integer,
  *        starting from 0
@@ -24,28 +28,28 @@ void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector<long>& v )
 /* \brief Find the indices that would reorder the elements of `predicate`
  * so that the elements with non-zero value precede the other elements
  *
- * \param[in] index_begin Point to the beginning of the vector which is
+ * \param[in, out] index_begin Point to the beginning of the vector which is
  *            to be filled with these indices
- * \param[in] index_begin Point to the end of the vector which is
+ * \param[in, out] index_begin Point to the end of the vector which is
  *            to be filled with these indices
  * \param[in] Vector that indicates the elements that need to be reordered first
  */
 template< typename ForwardIterator >
-ForwardIterator stablePartition(ForwardIterator index_begin,
-                               ForwardIterator index_end,
-                               amrex::Gpu::DeviceVector<int>& predicate)
+ForwardIterator stablePartition(ForwardIterator const index_begin,
+                               ForwardIterator const index_end,
+                               amrex::Gpu::DeviceVector<int> const& predicate)
 {
 #ifdef AMREX_USE_GPU
     // On GPU: Use thrust
-    int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
-    ForwardIterator sep = thrust::stable_partition(
+    int const* AMREX_RESTRICT predicate_ptr = predicate.dataPtr();
+    ForwardIterator const sep = thrust::stable_partition(
         thrust::cuda::par(amrex::Cuda::The_ThrustCachedAllocator()),
         index_begin, index_end,
         [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; }
     );
 #else
     // On CPU: Use std library
-    ForwardIterator sep = std::stable_partition(
+    ForwardIterator const sep = std::stable_partition(
         index_begin, index_end,
         [&predicate](long i) { return predicate[i]; }
     );
@@ -60,8 +64,8 @@ ForwardIterator stablePartition(ForwardIterator index_begin,
  * \return The number of elements between `first` and `last`
  */
 template< typename ForwardIterator >
-int iteratorDistance(ForwardIterator first,
-                     ForwardIterator last)
+int iteratorDistance(ForwardIterator const first,
+                     ForwardIterator const last)
 {
 #ifdef AMREX_USE_GPU
     // On GPU: Use thrust
@@ -86,9 +90,10 @@ int iteratorDistance(ForwardIterator first,
 class fillBufferFlag
 {
     public:
-        fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks,
+        fillBufferFlag( WarpXParIter const& pti, amrex::iMultiFab const* bmasks,
                         amrex::Gpu::DeviceVector<int>& inexflag,
-                        const amrex::Geometry& geom, long start_index=0 ) {
+                        amrex::Geometry const& geom, long const start_index=0 ) :
+                        m_start_index(start_index) {
 
             // Extract simple structure that can be used directly on the GPU
             m_particles = &(pti.GetArrayOfStructs()[0]);
@@ -99,7 +104,6 @@ class fillBufferFlag
                 m_prob_lo[idim] = geom.ProbLo(idim);
                 m_inv_cell_size[idim] = geom.InvCellSize(idim);
             }
-            m_start_index = start_index;
         };
 
 
@@ -124,26 +128,26 @@ class fillBufferFlag
         amrex::Real m_inv_cell_size[AMREX_SPACEDIM];
         amrex::IntVect m_domain_small_end;
         int* m_inexflag_ptr;
-        WarpXParticleContainer::ParticleType* m_particles;
-        amrex::Array4<const int> m_buffer_mask;
-        long m_start_index;
+        WarpXParticleContainer::ParticleType const* m_particles;
+        amrex::Array4<int const> m_buffer_mask;
+        long const m_start_index;
 };
 
 /* \brief Functor that copies the elements of `src` into `dst`,
  *       while reordering them according to `indices`
  *
- * \param src Source vector
- * \param dst Destination vector
- * \param indices Array of indices that indicate how to reorder elements
+ * \param[in] src Source vector
+ * \param[out] dst Destination vector
+ * \param[in] indices Array of indices that indicate how to reorder elements
  */
 template <typename T>
 class copyAndReorder
 {
     public:
         copyAndReorder(
-            amrex::Gpu::ManagedDeviceVector<T>& src,
+            amrex::Gpu::ManagedDeviceVector<T> const& src,
             amrex::Gpu::ManagedDeviceVector<T>& dst,
-            amrex::Gpu::DeviceVector<long>& indices ) {
+            amrex::Gpu::DeviceVector<long> const& indices ) {
             // Extract simple structure that can be used directly on the GPU
             m_src_ptr = src.dataPtr();
             m_dst_ptr = dst.dataPtr();
@@ -156,9 +160,9 @@ class copyAndReorder
         };
 
     private:
-        T* m_src_ptr;
+        T const* m_src_ptr;
         T* m_dst_ptr;
-        long* m_indices_ptr;
+        long const* m_indices_ptr;
 };
 
 #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_
-- 
cgit v1.2.3