From a7d4ebb9cb2b7528b7b1891eccb794fb6b70cf09 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Wed, 25 Sep 2019 00:50:41 -0700 Subject: Start GPU conversion --- Source/Particles/Sorting/Partition.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 683dbbd04..29e187821 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -40,22 +40,25 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( RealVector& uxp, RealVector& uyp, RealVector& uzp, RealVector& wp) { BL_PROFILE("PPC::Evolve::partition"); - - std::vector inexflag; + Gpu::ManagedDeviceVector inexflag; inexflag.resize(np); - auto& aos = pti.GetArrayOfStructs(); - // We need to partition the large buffer first iMultiFab const* bmasks = (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ? gather_masks : current_masks; - int i = 0; - const auto& msk = (*bmasks)[pti]; - for (const auto& p : aos) { - const IntVect& iv = Index(p, lev); - inexflag[i++] = msk(iv); - } + + // For each particle, find whether it is in the large buffer, by looking up the mask + const Array4& msk = (*bmasks)[pti].array(); + auto& aos = pti.GetArrayOfStructs(); + ParticleType * AMREX_RESTRICT pstructs = &(aos[0]); + int * inexflag_ptr = inexflag.dataPtr(); + amrex::ParallelFor( pti.numParticles(), + [=] AMREX_GPU_DEVICE (long i) { + const IntVect& iv = Index(pstructs[i], lev); + inexflag_ptr[i] = msk(iv); + } + ); Vector pid; pid.resize(np); -- cgit v1.2.3 From feac8d56e5cc84f63f6e7be7e1668113558c9e47 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Wed, 25 Sep 2019 14:53:21 -0700 Subject: Implement `iota` on GPU --- Source/Particles/Sorting/Make.package | 1 + Source/Particles/Sorting/Partition.cpp | 10 ++++++---- Source/Particles/Sorting/SortingUtils.H | 18 ++++++++++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 Source/Particles/Sorting/SortingUtils.H (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Make.package b/Source/Particles/Sorting/Make.package index 750d2607e..f9c708e5b 100644 --- a/Source/Particles/Sorting/Make.package +++ b/Source/Particles/Sorting/Make.package @@ -1,3 +1,4 @@ +CEXE_headers += SortingUtils.H CEXE_sources += Partition.cpp INCLUDE_LOCATIONS += $(WARPX_HOME)/Source/Particles/Sorting VPATH_LOCATIONS += $(WARPX_HOME)/Source/Particles/Sorting diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 29e187821..49d4d25a6 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -1,5 +1,6 @@ -#include +#include #include +#include #include using namespace amrex; @@ -43,7 +44,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( Gpu::ManagedDeviceVector inexflag; inexflag.resize(np); - // We need to partition the large buffer first + // Select the largest buffer first iMultiFab const* bmasks = (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ? gather_masks : current_masks; @@ -60,9 +61,10 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( } ); - Vector pid; + // Partition the particles according whether they are in the large buffer or not + Gpu::ManagedDeviceVector pid; pid.resize(np); - std::iota(pid.begin(), pid.end(), 0L); + fillWithConsecutiveIntegers( pid ); auto sep = std::stable_partition(pid.begin(), pid.end(), [&inexflag](long id) { return inexflag[id]; }); diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H new file mode 100644 index 000000000..d072ebd2a --- /dev/null +++ b/Source/Particles/Sorting/SortingUtils.H @@ -0,0 +1,18 @@ +#ifndef WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ +#define WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ + +#include +#include + +// TODO: Add documentation +void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) { +#ifdef AMREX_USE_GPU + // On GPU: Use thrust + thrust::sequence( v.begin(), v.end() ); +#else + // On CPU: Use std library + std::iota( v.begin(), v.end(), 0L ); +#endif +} + +#endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -- cgit v1.2.3 From 69b512e9fb315bd987314ec71ff463c6a90afde0 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Wed, 25 Sep 2019 13:12:23 -0700 Subject: Replaced Index function --- Source/Particles/Sorting/Partition.cpp | 15 +++++++++++++-- Source/Particles/Sorting/SortingUtils.H | 22 ++++++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 49d4d25a6..6874e2df6 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -54,9 +54,20 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( auto& aos = pti.GetArrayOfStructs(); ParticleType * AMREX_RESTRICT pstructs = &(aos[0]); int * inexflag_ptr = inexflag.dataPtr(); - amrex::ParallelFor( pti.numParticles(), + const Geometry& geom = Geom(lev); + const Real prob_lo_x = geom.ProbLo(0); + const Real prob_lo_y = geom.ProbLo(1); + const Real prob_lo_z = geom.ProbLo(2); + const Real inv_dx = geom.InvCellSize(0); + const Real inv_dy = geom.InvCellSize(1); + const Real inv_dz = geom.InvCellSize(2); + const IntVect domain_small_end = geom.Domain().smallEnd(); + amrex::ParallelFor( np, [=] AMREX_GPU_DEVICE (long i) { - const IntVect& iv = Index(pstructs[i], lev); + + const IntVect& iv = findParticleIndex(pstructs[i], + prob_lo_x, prob_lo_y, prob_lo_z, + inv_dx, inv_dy, inv_dz, domain_small_end ); inexflag_ptr[i] = msk(iv); } ); diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index d072ebd2a..7dec21446 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -15,4 +15,26 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) { #endif } +// TODO: Add documentation +template< typename PType > +AMREX_GPU_HOST_DEVICE AMREX_INLINE +amrex::IntVect findParticleIndex ( PType p, + const amrex::Real prob_lo_x, + const amrex::Real prob_lo_y, + const amrex::Real prob_lo_z, + const amrex::Real inv_dx, + const amrex::Real inv_dy, + const amrex::Real inv_dz, + const amrex::IntVect domain_small_end ) +{ + amrex::IntVect iv; + AMREX_D_TERM(iv[0]=static_cast(floor((p.m_rdata.pos[0]-prob_lo_x)*inv_dx));, + iv[1]=static_cast(floor((p.m_rdata.pos[1]-prob_lo_y)*inv_dy));, + iv[2]=static_cast(floor((p.m_rdata.pos[2]-prob_lo_z)*inv_dz));); + + iv += domain_small_end; + + return iv; +}; + #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -- cgit v1.2.3 From afde2a0f3cca52fde99ff59b72a6630cbea4c391 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Wed, 25 Sep 2019 14:11:16 -0700 Subject: Rewrite as functor --- Source/Particles/Sorting/Partition.cpp | 35 +++++--------------- Source/Particles/Sorting/SortingUtils.H | 58 +++++++++++++++++++++++---------- 2 files changed, 50 insertions(+), 43 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 6874e2df6..7e247bd8c 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -41,40 +41,23 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( RealVector& uxp, RealVector& uyp, RealVector& uzp, RealVector& wp) { BL_PROFILE("PPC::Evolve::partition"); + + auto& aos = pti.GetArrayOfStructs(); Gpu::ManagedDeviceVector inexflag; inexflag.resize(np); + Gpu::ManagedDeviceVector pid; + pid.resize(np); // Select the largest buffer first iMultiFab const* bmasks = (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ? gather_masks : current_masks; - // For each particle, find whether it is in the large buffer, by looking up the mask - const Array4& msk = (*bmasks)[pti].array(); - auto& aos = pti.GetArrayOfStructs(); - ParticleType * AMREX_RESTRICT pstructs = &(aos[0]); - int * inexflag_ptr = inexflag.dataPtr(); - const Geometry& geom = Geom(lev); - const Real prob_lo_x = geom.ProbLo(0); - const Real prob_lo_y = geom.ProbLo(1); - const Real prob_lo_z = geom.ProbLo(2); - const Real inv_dx = geom.InvCellSize(0); - const Real inv_dy = geom.InvCellSize(1); - const Real inv_dz = geom.InvCellSize(2); - const IntVect domain_small_end = geom.Domain().smallEnd(); - amrex::ParallelFor( np, - [=] AMREX_GPU_DEVICE (long i) { - - const IntVect& iv = findParticleIndex(pstructs[i], - prob_lo_x, prob_lo_y, prob_lo_z, - inv_dx, inv_dy, inv_dz, domain_small_end ); - inexflag_ptr[i] = msk(iv); - } - ); - - // Partition the particles according whether they are in the large buffer or not - Gpu::ManagedDeviceVector pid; - pid.resize(np); + // For each particle, find whether it is in the large buffer, + // by looking up the mask. Store the answer in `inexflag` + amrex::ParallelFor( np, fillBufferFlag( pti, bmasks, inexflag, Geom(lev) ) ); + + // Partition the particles according to whether they are in the large buffer or not fillWithConsecutiveIntegers( pid ); auto sep = std::stable_partition(pid.begin(), pid.end(), [&inexflag](long id) { return inexflag[id]; }); diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index 7dec21446..1133ccab5 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -1,8 +1,9 @@ #ifndef WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ #define WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -#include +#include #include +#include // TODO: Add documentation void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) { @@ -16,25 +17,48 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) { } // TODO: Add documentation -template< typename PType > -AMREX_GPU_HOST_DEVICE AMREX_INLINE -amrex::IntVect findParticleIndex ( PType p, - const amrex::Real prob_lo_x, - const amrex::Real prob_lo_y, - const amrex::Real prob_lo_z, - const amrex::Real inv_dx, - const amrex::Real inv_dy, - const amrex::Real inv_dz, - const amrex::IntVect domain_small_end ) +class fillBufferFlag { - amrex::IntVect iv; - AMREX_D_TERM(iv[0]=static_cast(floor((p.m_rdata.pos[0]-prob_lo_x)*inv_dx));, - iv[1]=static_cast(floor((p.m_rdata.pos[1]-prob_lo_y)*inv_dy));, - iv[2]=static_cast(floor((p.m_rdata.pos[2]-prob_lo_z)*inv_dz));); + public: + fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks, + amrex::Gpu::ManagedDeviceVector& inexflag, + const amrex::Geometry& geom ) { + + // Extract simple structure that can be used directly on the GPU + m_particles = &(pti.GetArrayOfStructs()[0]); + m_buffer_mask = (*bmasks)[pti].array(); + m_inexflag_ptr = inexflag.dataPtr(); + m_domain_small_end = geom.Domain().smallEnd(); + for (int idim=0; idim(floor( + (p.m_rdata.pos[idim]-m_prob_lo[idim])*m_inv_cell_size[idim] + )); + } + // Find the value of the buffer flag in this cell and + // store it at the corresponding particle position in the array `inexflag` + m_inexflag_ptr[i] = m_buffer_mask(iv); + }; - return iv; + private: + amrex::Real m_prob_lo[AMREX_SPACEDIM]; + amrex::Real m_inv_cell_size[AMREX_SPACEDIM]; + amrex::IntVect m_domain_small_end; + int* m_inexflag_ptr; + WarpXParticleContainer::ParticleType* m_particles; + amrex::Array4 m_buffer_mask; }; #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -- cgit v1.2.3 From 8575478f370f3db2525a8d5444e6949776e9da29 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 26 Sep 2019 08:51:00 -0700 Subject: Implemented stable partition --- Source/Particles/Sorting/Partition.cpp | 24 +++++++++++++++--------- Source/Particles/Sorting/SortingUtils.H | 28 +++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 10 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 7e247bd8c..3ee49cafe 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -43,24 +43,30 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( BL_PROFILE("PPC::Evolve::partition"); auto& aos = pti.GetArrayOfStructs(); + + // Initialize temporary arrays Gpu::ManagedDeviceVector inexflag; inexflag.resize(np); Gpu::ManagedDeviceVector pid; pid.resize(np); - // Select the largest buffer first + // First, partition particles in the larger buffer + + // - Select the larger buffer iMultiFab const* bmasks = (WarpX::n_field_gather_buffer >= WarpX::n_current_deposition_buffer) ? gather_masks : current_masks; - - // For each particle, find whether it is in the large buffer, - // by looking up the mask. Store the answer in `inexflag` - amrex::ParallelFor( np, fillBufferFlag( pti, bmasks, inexflag, Geom(lev) ) ); - - // Partition the particles according to whether they are in the large buffer or not + // - For each particle, find whether it is in the larger buffer, + // by looking up the mask. Store the answer in `inexflag`. + amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev)) ); + // - Find the indices that reorder particles so that the last particles + // are in the larger buffer fillWithConsecutiveIntegers( pid ); - auto sep = std::stable_partition(pid.begin(), pid.end(), - [&inexflag](long id) { return inexflag[id]; }); + auto sep = stablePartition( pid.begin(), pid.end(), inexflag ); + // At the end of this step, `pid` contains the indices that should be used to + // reorder the particles, and `sep` is the position in the array that + // separates the particles that deposit/gather on the fine patch (first part) + // and the particles that deposit/gather in the buffers (last part) if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) { nfine_current = nfine_gather = std::distance(pid.begin(), sep); diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index 1133ccab5..ede59b53b 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -6,7 +6,8 @@ #include // TODO: Add documentation -void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) { +void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) +{ #ifdef AMREX_USE_GPU // On GPU: Use thrust thrust::sequence( v.begin(), v.end() ); @@ -16,6 +17,31 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) { #endif } +// TODO: Add documentation +template< typename ForwardIterator > +ForwardIterator stablePartition(ForwardIterator index_begin, + ForwardIterator index_end, + amrex::Gpu::ManagedDeviceVector& predicate) +{ +#ifdef AMREX_USE_GPU + // On GPU: Use thrust + int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr(); + ForwardIterator sep = thrust::stable_partition( + thrust::cuda::par(Cuda::The_ThrustCachedAllocator()), + index_begin, index_end, + AMREX_GPU_HOST_DEVICE + [predicate_ptr](long i) { return predicate_ptr[i]; } + ); +#else + // On CPU: Use std library + ForwardIterator sep = std::stable_partition( + index_begin, index_end, + [&predicate](long i) { return predicate[i]; } + ); +#endif + return sep; +} + // TODO: Add documentation class fillBufferFlag { -- cgit v1.2.3 From fac016f1173aacbf4500a28ef63ed683365e4d3f Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 26 Sep 2019 09:00:22 -0700 Subject: Implemented iterator distance --- Source/Particles/Sorting/Partition.cpp | 16 +++++++++++----- Source/Particles/Sorting/SortingUtils.H | 13 +++++++++++++ 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 3ee49cafe..d64b9fa25 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -50,7 +50,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( Gpu::ManagedDeviceVector pid; pid.resize(np); - // First, partition particles in the larger buffer + // First, partition particles into the larger buffer // - Select the larger buffer iMultiFab const* bmasks = @@ -68,16 +68,22 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // separates the particles that deposit/gather on the fine patch (first part) // and the particles that deposit/gather in the buffers (last part) + // Second, among particles that are in the larger buffer, partition + // particles into the smaller buffer if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) { - nfine_current = nfine_gather = std::distance(pid.begin(), sep); - } else if (sep != pid.end()) { + // No need to do anything if the buffers have the same size + nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep); + } else if (sep == pid.end()) { + // No need to do anything if there are no particles in the larger buffer + nfine_current = nfine_gather = np; + } else { int n_buf; if (bmasks == gather_masks) { - nfine_gather = std::distance(pid.begin(), sep); + nfine_gather = iteratorDistance(pid.begin(), sep); bmasks = current_masks; n_buf = WarpX::n_current_deposition_buffer; } else { - nfine_current = std::distance(pid.begin(), sep); + nfine_current = iteratorDistance(pid.begin(), sep); bmasks = gather_masks; n_buf = WarpX::n_field_gather_buffer; } diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index ede59b53b..7d53a352e 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -42,6 +42,19 @@ ForwardIterator stablePartition(ForwardIterator index_begin, return sep; } +// TODO: Add documentation +template< typename ForwardIterator > +int iteratorDistance(ForwardIterator first, + ForwardIterator last) +{ +#ifdef AMREX_USE_GPU + // On GPU: Use thrust + return thrust::distance( first, last ); +#else + return std::distance( first, last ); +#endif +} + // TODO: Add documentation class fillBufferFlag { -- cgit v1.2.3 From a95c8f5445b7e8b1fc8808ea5bc30ba3c3d02351 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 26 Sep 2019 09:53:51 -0700 Subject: Perform partition in smaller buffer --- Source/Particles/Sorting/Partition.cpp | 25 +++++++++++-------------- Source/Particles/Sorting/SortingUtils.H | 8 +++++--- 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index d64b9fa25..138f28105 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -58,7 +58,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( gather_masks : current_masks; // - For each particle, find whether it is in the larger buffer, // by looking up the mask. Store the answer in `inexflag`. - amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev)) ); + amrex::ParallelFor( np, fillBufferFlag(pti, bmasks, inexflag, Geom(lev), 0) ); // - Find the indices that reorder particles so that the last particles // are in the larger buffer fillWithConsecutiveIntegers( pid ); @@ -67,6 +67,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // reorder the particles, and `sep` is the position in the array that // separates the particles that deposit/gather on the fine patch (first part) // and the particles that deposit/gather in the buffers (last part) + long n_fine = iteratorDistance(pid.begin(), sep); // Number of particles on fine patch // Second, among particles that are in the larger buffer, partition // particles into the smaller buffer @@ -79,29 +80,25 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( } else { int n_buf; if (bmasks == gather_masks) { - nfine_gather = iteratorDistance(pid.begin(), sep); + nfine_gather = n_fine; bmasks = current_masks; n_buf = WarpX::n_current_deposition_buffer; } else { - nfine_current = iteratorDistance(pid.begin(), sep); + nfine_current = n_fine; bmasks = gather_masks; n_buf = WarpX::n_field_gather_buffer; } if (n_buf > 0) { - const auto& msk2 = (*bmasks)[pti]; - for (auto it = sep; it != pid.end(); ++it) { - const long id = *it; - const IntVect& iv = Index(aos[id], lev); - inexflag[id] = msk2(iv); - } - - auto sep2 = std::stable_partition(sep, pid.end(), - [&inexflag](long id) {return inexflag[id];}); + // - For each particle in the large buffer, find whether it is in + // the smaller buffer, by looking up the mask. Store the answer in `inexflag`. + amrex::ParallelFor( np - n_fine, + fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) ); + auto sep2 = stablePartition( sep, pid.end(), inexflag ); if (bmasks == gather_masks) { - nfine_gather = std::distance(pid.begin(), sep2); + nfine_gather = iteratorDistance(pid.begin(), sep2); } else { - nfine_current = std::distance(pid.begin(), sep2); + nfine_current = iteratorDistance(pid.begin(), sep2); } } } diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index 7d53a352e..a17fe85db 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -61,7 +61,7 @@ class fillBufferFlag public: fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks, amrex::Gpu::ManagedDeviceVector& inexflag, - const amrex::Geometry& geom ) { + const amrex::Geometry& geom, long start_index=0 ) { // Extract simple structure that can be used directly on the GPU m_particles = &(pti.GetArrayOfStructs()[0]); @@ -72,13 +72,14 @@ class fillBufferFlag m_prob_lo[idim] = geom.ProbLo(idim); m_inv_cell_size[idim] = geom.InvCellSize(idim); } + m_start_index = start_index; }; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void operator()( const long i ) const { // Select a particle - auto& p = m_particles[i]; + auto& p = m_particles[i+m_start_index]; // Find the index of the cell where this particle is located amrex::IntVect iv; for (int idim=0; idim m_buffer_mask; + long m_start_index; }; #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -- cgit v1.2.3 From 2b08e3082ba46361b943b7ce67f8d21eb835ece9 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 26 Sep 2019 10:35:48 -0700 Subject: Implement particle copy --- Source/Particles/Sorting/Partition.cpp | 47 +++++++++++++++++---------------- Source/Particles/Sorting/SortingUtils.H | 41 ++++++++++++++++++++++------ 2 files changed, 57 insertions(+), 31 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 138f28105..b4f96476e 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -42,8 +42,6 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( { BL_PROFILE("PPC::Evolve::partition"); - auto& aos = pti.GetArrayOfStructs(); - // Initialize temporary arrays Gpu::ManagedDeviceVector inexflag; inexflag.resize(np); @@ -67,10 +65,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // reorder the particles, and `sep` is the position in the array that // separates the particles that deposit/gather on the fine patch (first part) // and the particles that deposit/gather in the buffers (last part) - long n_fine = iteratorDistance(pid.begin(), sep); // Number of particles on fine patch + long n_fine = iteratorDistance(pid.begin(), sep); + // Number of particles on fine patch, i.e. outside of the larger buffer // Second, among particles that are in the larger buffer, partition // particles into the smaller buffer + if (WarpX::n_current_deposition_buffer == WarpX::n_field_gather_buffer) { // No need to do anything if the buffers have the same size nfine_current = nfine_gather = iteratorDistance(pid.begin(), sep); @@ -95,6 +95,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( amrex::ParallelFor( np - n_fine, fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) ); auto sep2 = stablePartition( sep, pid.end(), inexflag ); + if (bmasks == gather_masks) { nfine_gather = iteratorDistance(pid.begin(), sep2); } else { @@ -111,39 +112,39 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( nfine_gather = 0; } + // Reorder the actual particle array, using the `pid` indices if (nfine_current != np || nfine_gather != np) { + // Extract simple pointer + long* AMREX_RESTRICT pid_ptr = pid.dataPtr(); + // Temporary array for particle AoS ParticleVector particle_tmp; particle_tmp.resize(np); - - for (long ip = 0; ip < np; ++ip) { - particle_tmp[ip] = aos[pid[ip]]; - } + ParticleType* AMREX_RESTRICT particle_tmp_ptr = particle_tmp.dataPtr(); + + // Copy the particle AoS + auto& aos = pti.GetArrayOfStructs(); + ParticleType* AMREX_RESTRICT aos_ptr = &(aos()[0]); + amrex::ParallelFor( np, + [=] AMREX_GPU_DEVICE (long ip) { + particle_tmp_ptr[ip] = aos_ptr[pid_ptr[ip]]; + } + ); std::swap(aos(), particle_tmp); + // Temporary array for particle individual attributes RealVector tmp; tmp.resize(np); - for (long ip = 0; ip < np; ++ip) { - tmp[ip] = wp[pid[ip]]; - } + // Copy individual attributes + amrex::ParallelFor( np, copyAndReorder( wp, tmp, pid ) ); std::swap(wp, tmp); - - for (long ip = 0; ip < np; ++ip) { - tmp[ip] = uxp[pid[ip]]; - } + amrex::ParallelFor( np, copyAndReorder( uxp, tmp, pid ) ); std::swap(uxp, tmp); - - for (long ip = 0; ip < np; ++ip) { - tmp[ip] = uyp[pid[ip]]; - } + amrex::ParallelFor( np, copyAndReorder( uyp, tmp, pid ) ); std::swap(uyp, tmp); - - for (long ip = 0; ip < np; ++ip) { - tmp[ip] = uzp[pid[ip]]; - } + amrex::ParallelFor( np, copyAndReorder( uzp, tmp, pid ) ); std::swap(uzp, tmp); } - } diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index a17fe85db..80fc4aab6 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -27,17 +27,16 @@ ForwardIterator stablePartition(ForwardIterator index_begin, // On GPU: Use thrust int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr(); ForwardIterator sep = thrust::stable_partition( - thrust::cuda::par(Cuda::The_ThrustCachedAllocator()), - index_begin, index_end, - AMREX_GPU_HOST_DEVICE - [predicate_ptr](long i) { return predicate_ptr[i]; } - ); + thrust::cuda::par(Cuda::The_ThrustCachedAllocator()), + index_begin, index_end, + [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; } + ); #else // On CPU: Use std library ForwardIterator sep = std::stable_partition( - index_begin, index_end, - [&predicate](long i) { return predicate[i]; } - ); + index_begin, index_end, + [&predicate](long i) { return predicate[i]; } + ); #endif return sep; } @@ -102,4 +101,30 @@ class fillBufferFlag long m_start_index; }; +// TODO: Add documentation +template +class copyAndReorder +{ + public: + copyAndReorder( + amrex::Gpu::ManagedDeviceVector& src, + amrex::Gpu::ManagedDeviceVector& dst, + amrex::Gpu::ManagedDeviceVector& indices ) { + // Extract simple structure that can be used directly on the GPU + m_src_ptr = src.dataPtr(); + m_dst_ptr = dst.dataPtr(); + m_indices_ptr = indices.dataPtr(); + }; + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void operator()( const long ip ) const { + m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ]; + }; + + private: + T* m_src_ptr; + T* m_dst_ptr; + long* m_indices_ptr; +}; + #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -- cgit v1.2.3 From c1823aacc5a957181cc3ad075c8718c3666d8406 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Thu, 26 Sep 2019 10:48:43 -0700 Subject: Use templated functor for copy of particles --- Source/Particles/Sorting/Partition.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index b4f96476e..2b5d661e4 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -115,22 +115,14 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // Reorder the actual particle array, using the `pid` indices if (nfine_current != np || nfine_gather != np) { - // Extract simple pointer - long* AMREX_RESTRICT pid_ptr = pid.dataPtr(); - // Temporary array for particle AoS ParticleVector particle_tmp; particle_tmp.resize(np); - ParticleType* AMREX_RESTRICT particle_tmp_ptr = particle_tmp.dataPtr(); - // Copy the particle AoS + // Copy particle AoS auto& aos = pti.GetArrayOfStructs(); - ParticleType* AMREX_RESTRICT aos_ptr = &(aos()[0]); amrex::ParallelFor( np, - [=] AMREX_GPU_DEVICE (long ip) { - particle_tmp_ptr[ip] = aos_ptr[pid_ptr[ip]]; - } - ); + copyAndReorder( aos(), particle_tmp, pid ) ); std::swap(aos(), particle_tmp); // Temporary array for particle individual attributes -- cgit v1.2.3 From bc7f60f78ecc82a4af1d94d4687e5c7a1b8a9ab8 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Mon, 30 Sep 2019 09:11:29 -0700 Subject: More optimization for GPU --- Source/Particles/Sorting/Partition.cpp | 11 +++++++++-- Source/Particles/Sorting/SortingUtils.H | 10 +++++----- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index 2b5d661e4..badd35f02 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -43,9 +43,9 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( BL_PROFILE("PPC::Evolve::partition"); // Initialize temporary arrays - Gpu::ManagedDeviceVector inexflag; + Gpu::DeviceVector inexflag; inexflag.resize(np); - Gpu::ManagedDeviceVector pid; + Gpu::DeviceVector pid; pid.resize(np); // First, partition particles into the larger buffer @@ -138,5 +138,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( std::swap(uyp, tmp); amrex::ParallelFor( np, copyAndReorder( uzp, tmp, pid ) ); std::swap(uzp, tmp); + + // Make sure that the temporary arrays are not destroyed before + // the GPU kernels finish running + Gpu::streamSynchronize(); } + // Make sure that the temporary arrays are not destroyed before + // the GPU kernels finish running + Gpu::streamSynchronize(); } diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index 76c143b09..1aaaf73a8 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -6,7 +6,7 @@ #include // TODO: Add documentation -void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) +void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector& v ) { #ifdef AMREX_USE_GPU // On GPU: Use thrust @@ -21,7 +21,7 @@ void fillWithConsecutiveIntegers( amrex::Gpu::ManagedDeviceVector& v ) template< typename ForwardIterator > ForwardIterator stablePartition(ForwardIterator index_begin, ForwardIterator index_end, - amrex::Gpu::ManagedDeviceVector& predicate) + amrex::Gpu::DeviceVector& predicate) { #ifdef AMREX_USE_GPU // On GPU: Use thrust @@ -59,7 +59,7 @@ class fillBufferFlag { public: fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks, - amrex::Gpu::ManagedDeviceVector& inexflag, + amrex::Gpu::DeviceVector& inexflag, const amrex::Geometry& geom, long start_index=0 ) { // Extract simple structure that can be used directly on the GPU @@ -109,14 +109,14 @@ class copyAndReorder copyAndReorder( amrex::Gpu::ManagedDeviceVector& src, amrex::Gpu::ManagedDeviceVector& dst, - amrex::Gpu::ManagedDeviceVector& indices ) { + amrex::Gpu::DeviceVector& indices ) { // Extract simple structure that can be used directly on the GPU m_src_ptr = src.dataPtr(); m_dst_ptr = dst.dataPtr(); m_indices_ptr = indices.dataPtr(); }; - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE void operator()( const long ip ) const { m_dst_ptr[ip] = m_src_ptr[ m_indices_ptr[ip] ]; }; -- cgit v1.2.3 From 0f32e809c513ea122361e018a04374a599f1e571 Mon Sep 17 00:00:00 2001 From: Remi Lehe Date: Tue, 1 Oct 2019 20:21:34 -0700 Subject: Edit const correctness --- Source/Particles/Sorting/Partition.cpp | 6 ++-- Source/Particles/Sorting/SortingUtils.H | 50 ++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 26 deletions(-) (limited to 'Source/Particles/Sorting/Partition.cpp') diff --git a/Source/Particles/Sorting/Partition.cpp b/Source/Particles/Sorting/Partition.cpp index badd35f02..d1455249a 100644 --- a/Source/Particles/Sorting/Partition.cpp +++ b/Source/Particles/Sorting/Partition.cpp @@ -60,12 +60,12 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // - Find the indices that reorder particles so that the last particles // are in the larger buffer fillWithConsecutiveIntegers( pid ); - auto sep = stablePartition( pid.begin(), pid.end(), inexflag ); + auto const sep = stablePartition( pid.begin(), pid.end(), inexflag ); // At the end of this step, `pid` contains the indices that should be used to // reorder the particles, and `sep` is the position in the array that // separates the particles that deposit/gather on the fine patch (first part) // and the particles that deposit/gather in the buffers (last part) - long n_fine = iteratorDistance(pid.begin(), sep); + long const n_fine = iteratorDistance(pid.begin(), sep); // Number of particles on fine patch, i.e. outside of the larger buffer // Second, among particles that are in the larger buffer, partition @@ -94,7 +94,7 @@ PhysicalParticleContainer::PartitionParticlesInBuffers( // the smaller buffer, by looking up the mask. Store the answer in `inexflag`. amrex::ParallelFor( np - n_fine, fillBufferFlag(pti, bmasks, inexflag, Geom(lev), n_fine) ); - auto sep2 = stablePartition( sep, pid.end(), inexflag ); + auto const sep2 = stablePartition( sep, pid.end(), inexflag ); if (bmasks == gather_masks) { nfine_gather = iteratorDistance(pid.begin(), sep2); diff --git a/Source/Particles/Sorting/SortingUtils.H b/Source/Particles/Sorting/SortingUtils.H index 37788668e..36a33c1cd 100644 --- a/Source/Particles/Sorting/SortingUtils.H +++ b/Source/Particles/Sorting/SortingUtils.H @@ -4,6 +4,10 @@ #include #include #include +#ifdef AMREX_USE_GPU + #include + #include +#endif /* \brief Fill the elements of the input vector with consecutive integer, * starting from 0 @@ -24,28 +28,28 @@ void fillWithConsecutiveIntegers( amrex::Gpu::DeviceVector& v ) /* \brief Find the indices that would reorder the elements of `predicate` * so that the elements with non-zero value precede the other elements * - * \param[in] index_begin Point to the beginning of the vector which is + * \param[in, out] index_begin Point to the beginning of the vector which is * to be filled with these indices - * \param[in] index_begin Point to the end of the vector which is + * \param[in, out] index_begin Point to the end of the vector which is * to be filled with these indices * \param[in] Vector that indicates the elements that need to be reordered first */ template< typename ForwardIterator > -ForwardIterator stablePartition(ForwardIterator index_begin, - ForwardIterator index_end, - amrex::Gpu::DeviceVector& predicate) +ForwardIterator stablePartition(ForwardIterator const index_begin, + ForwardIterator const index_end, + amrex::Gpu::DeviceVector const& predicate) { #ifdef AMREX_USE_GPU // On GPU: Use thrust - int* AMREX_RESTRICT predicate_ptr = predicate.dataPtr(); - ForwardIterator sep = thrust::stable_partition( + int const* AMREX_RESTRICT predicate_ptr = predicate.dataPtr(); + ForwardIterator const sep = thrust::stable_partition( thrust::cuda::par(amrex::Cuda::The_ThrustCachedAllocator()), index_begin, index_end, [predicate_ptr] AMREX_GPU_DEVICE (long i) { return predicate_ptr[i]; } ); #else // On CPU: Use std library - ForwardIterator sep = std::stable_partition( + ForwardIterator const sep = std::stable_partition( index_begin, index_end, [&predicate](long i) { return predicate[i]; } ); @@ -60,8 +64,8 @@ ForwardIterator stablePartition(ForwardIterator index_begin, * \return The number of elements between `first` and `last` */ template< typename ForwardIterator > -int iteratorDistance(ForwardIterator first, - ForwardIterator last) +int iteratorDistance(ForwardIterator const first, + ForwardIterator const last) { #ifdef AMREX_USE_GPU // On GPU: Use thrust @@ -86,9 +90,10 @@ int iteratorDistance(ForwardIterator first, class fillBufferFlag { public: - fillBufferFlag( WarpXParIter& pti, const amrex::iMultiFab* bmasks, + fillBufferFlag( WarpXParIter const& pti, amrex::iMultiFab const* bmasks, amrex::Gpu::DeviceVector& inexflag, - const amrex::Geometry& geom, long start_index=0 ) { + amrex::Geometry const& geom, long const start_index=0 ) : + m_start_index(start_index) { // Extract simple structure that can be used directly on the GPU m_particles = &(pti.GetArrayOfStructs()[0]); @@ -99,7 +104,6 @@ class fillBufferFlag m_prob_lo[idim] = geom.ProbLo(idim); m_inv_cell_size[idim] = geom.InvCellSize(idim); } - m_start_index = start_index; }; @@ -124,26 +128,26 @@ class fillBufferFlag amrex::Real m_inv_cell_size[AMREX_SPACEDIM]; amrex::IntVect m_domain_small_end; int* m_inexflag_ptr; - WarpXParticleContainer::ParticleType* m_particles; - amrex::Array4 m_buffer_mask; - long m_start_index; + WarpXParticleContainer::ParticleType const* m_particles; + amrex::Array4 m_buffer_mask; + long const m_start_index; }; /* \brief Functor that copies the elements of `src` into `dst`, * while reordering them according to `indices` * - * \param src Source vector - * \param dst Destination vector - * \param indices Array of indices that indicate how to reorder elements + * \param[in] src Source vector + * \param[out] dst Destination vector + * \param[in] indices Array of indices that indicate how to reorder elements */ template class copyAndReorder { public: copyAndReorder( - amrex::Gpu::ManagedDeviceVector& src, + amrex::Gpu::ManagedDeviceVector const& src, amrex::Gpu::ManagedDeviceVector& dst, - amrex::Gpu::DeviceVector& indices ) { + amrex::Gpu::DeviceVector const& indices ) { // Extract simple structure that can be used directly on the GPU m_src_ptr = src.dataPtr(); m_dst_ptr = dst.dataPtr(); @@ -156,9 +160,9 @@ class copyAndReorder }; private: - T* m_src_ptr; + T const* m_src_ptr; T* m_dst_ptr; - long* m_indices_ptr; + long const* m_indices_ptr; }; #endif // WARPX_PARTICLES_SORTING_SORTINGUTILS_H_ -- cgit v1.2.3