Use blaspp::gemm on GPU for Hankel transform (#3383)

* Use gemm on GPU for Hankel transform * Add stream synchronization * Add `amrex` * blas::gemm call: add `queue` with device id * CMake: BLAS++ Missing Deps * Update installation instructions for Summit * CMake: BLAS++ should not need curand * Add paths to blaspp/lapackpp * Move Queue Constructor to Constructor * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Correct name of profiled area * Use gemm for inverse Hankel transform * Add missing comma * Clean up the code so that it compiles for CPU * Clean up code ; update documentation * Update Comment Co-authored-by: Remi Lehe <remi.lehe@normalesup.org> * Update Tools/machines/summit-olcf/summit_warpx.profile.example Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja> * Add stream synchronization * Switch to streamsynchronize * Update comments Co-authored-by: Axel Huebl <axel.huebl@plasma.ninja> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
author: Remi Lehe <remi.lehe@normalesup.org> 2022-09-15 15:15:09 -0700
committer: GitHub <noreply@github.com> 2022-09-15 15:15:09 -0700
commit: ac2521aa4c23e999715931331a817c41e416ddd2 (patch)
tree: 059810af6a77e9e3e2a65b670e58368748b36e81 /Source/FieldSolver/SpectralSolver/SpectralHankelTransform
parent: 04b6f67caab8ee95428a569c529110a12b2527f3 (diff)
download: WarpX-ac2521aa4c23e999715931331a817c41e416ddd2.tar.gz
WarpX-ac2521aa4c23e999715931331a817c41e416ddd2.tar.zst
WarpX-ac2521aa4c23e999715931331a817c41e416ddd2.zip
2 files changed, 49 insertions, 48 deletions
diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H
index 5a87031c4..3f74f5d82 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H
+++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.H
@@ -8,6 +8,14 @@
 #define WARPX_HANKEL_TRANSFORM_H_
 
 #include <AMReX_FArrayBox.H>
+#include <AMReX_REAL.H>
+#include <AMReX_GpuContainers.H>
+
+#ifdef AMREX_USE_GPU
+#   include <blas.hh>
+#endif
+
+#include <memory>
 
 /* \brief This defines the class that performs the Hankel transform.
  * Original authors: Remi Lehe, Manuel Kirchen
@@ -45,6 +53,10 @@ class HankelTransform
 
         RealVector m_invM;
         RealVector m_M;
+
+#ifdef AMREX_USE_GPU
+        std::unique_ptr<blas::Queue> m_queue;
+#endif
 };
 
 #endif
diff --git a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
index 43b26f2ee..ddd07acad 100644
--- a/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
+++ b/Source/FieldSolver/SpectralSolver/SpectralHankelTransform/HankelTransform.cpp
@@ -11,6 +11,8 @@
 #include "Utils/WarpXConst.H"
 #include "WarpX.H"
 
+#include "Utils/WarpXProfilerWrapper.H"
+
 #include <blas.hh>
 #include <lapack.hh>
 
@@ -23,10 +25,20 @@ HankelTransform::HankelTransform (int const hankel_order,
 : m_nr(nr), m_nk(nr)
 {
 
+    WARPX_PROFILE("HankelTransform::HankelTransform");
+
     // Check that azimuthal_mode has a valid value
     WARPX_ALWAYS_ASSERT_WITH_MESSAGE(hankel_order-1 <= azimuthal_mode && azimuthal_mode <= hankel_order+1,
                                      "azimuthal_mode must be either hankel_order-1, hankel_order or hankel_order+1");
 
+#ifdef AMREX_USE_GPU
+    // BLAS setup
+    //   SYCL note: we need to double check AMReX device ID conventions and
+    //   BLAS++ device ID conventions are the same
+    int const device_id = amrex::Gpu::Device::deviceId();
+    m_queue = std::make_unique<blas::Queue>( device_id, 0 );
+#endif
+
     amrex::Vector<amrex::Real> alphas;
     amrex::Vector<int> alpha_errors;
 
@@ -186,6 +198,8 @@ void
 HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_icomp,
                                          amrex::FArrayBox      & G, int const G_icomp)
 {
+    WARPX_PROFILE("HankelTransform::HankelForwardTransform");
+
     amrex::Box const& F_box = F.box();
     amrex::Box const& G_box = G.box();
 
@@ -198,37 +212,24 @@ HankelTransform::HankelForwardTransform (amrex::FArrayBox const& F, int const F_
     AMREX_ALWAYS_ASSERT(ngr >= 0);
     AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
 
-#ifndef AMREX_USE_GPU
-    // On CPU, the blas::gemm is significantly faster
+    // We perform stream synchronization since `gemm` may be running
+    // on a different stream.
+    amrex::Gpu::streamSynchronize();
 
     // Note that M is flagged to be transposed since it has dimensions (m_nr, m_nk)
     blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
                m_nk, nz, m_nr, 1._rt,
                m_M.dataPtr(), m_nk,
                F.dataPtr(F_icomp)+ngr, nrF, 0._rt,
-               G.dataPtr(G_icomp), m_nk);
-
-#else
-    // On GPU, the explicit loop is significantly faster
-    // It is not clear if the GPU gemm wasn't build properly, it is cycling data out and back
-    // in to the device, or if it is because gemm is launching its own threads.
-
-    amrex::Real const * M_arr = m_M.dataPtr();
-    amrex::Array4<const amrex::Real> const & F_arr = F.array();
-    amrex::Array4<      amrex::Real> const & G_arr = G.array();
-
-    int const nr = m_nr;
-
-    amrex::ParallelFor(G_box,
-    [=] AMREX_GPU_DEVICE(int ik, int iz, int k3d) noexcept {
-        G_arr(ik,iz,k3d,G_icomp) = 0.;
-        for (int ir=0 ; ir < nr ; ir++) {
-            int const ii = ir + ik*nr;
-            G_arr(ik,iz,k3d,G_icomp) += M_arr[ii]*F_arr(ir,iz,k3d,F_icomp);
-        }
-    });
-
+               G.dataPtr(G_icomp), m_nk
+#ifdef AMREX_USE_GPU
+               , *m_queue // Calls the GPU version of blas::gemm
 #endif
+           );
+
+    // We perform stream synchronization since `gemm` may be running
+    // on a different stream.
+    amrex::Gpu::streamSynchronize();
 
 }
 
@@ -236,6 +237,8 @@ void
 HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_icomp,
                                          amrex::FArrayBox      & F, int const F_icomp)
 {
+    WARPX_PROFILE("HankelTransform::HankelInverseTransform");
+
     amrex::Box const& G_box = G.box();
     amrex::Box const& F_box = F.box();
 
@@ -248,36 +251,22 @@ HankelTransform::HankelInverseTransform (amrex::FArrayBox const& G, int const G_
     AMREX_ALWAYS_ASSERT(ngr >= 0);
     AMREX_ALWAYS_ASSERT(F_box.bigEnd(0)+1 >= m_nr);
 
-#ifndef AMREX_USE_GPU
-    // On CPU, the blas::gemm is significantly faster
+    // We perform stream synchronization since `gemm` may be running
+    // on a different stream.
+    amrex::Gpu::streamSynchronize();
 
     // Note that m_invM is flagged to be transposed since it has dimensions (m_nk, m_nr)
     blas::gemm(blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
                m_nr, nz, m_nk, 1._rt,
                m_invM.dataPtr(), m_nr,
                G.dataPtr(G_icomp), m_nk, 0._rt,
-               F.dataPtr(F_icomp)+ngr, nrF);
-
-#else
-    // On GPU, the explicit loop is significantly faster
-    // It is not clear if the GPU gemm wasn't build properly, it is cycling data out and back
-    // in to the device, or if it is because gemm is launching its own threads.
-
-    amrex::Real const * invM_arr = m_invM.dataPtr();
-    amrex::Array4<const amrex::Real> const & G_arr = G.array();
-    amrex::Array4<      amrex::Real> const & F_arr = F.array();
-
-    int const nk = m_nk;
-
-    amrex::ParallelFor(G_box,
-    [=] AMREX_GPU_DEVICE(int ir, int iz, int k3d) noexcept {
-        F_arr(ir,iz,k3d,F_icomp) = 0.;
-        for (int ik=0 ; ik < nk ; ik++) {
-            int const ii = ik + ir*nk;
-            F_arr(ir,iz,k3d,F_icomp) += invM_arr[ii]*G_arr(ik,iz,k3d,G_icomp);
-        }
-    });
-
+               F.dataPtr(F_icomp)+ngr, nrF
+#ifdef AMREX_USE_GPU
+               , *m_queue // Calls the GPU version of blas::gemm
 #endif
+           );
 
+    // We perform stream synchronization since `gemm` may be running
+    // on a different stream.
+    amrex::Gpu::streamSynchronize();
 }
author	Remi Lehe <remi.lehe@normalesup.org>	2022-09-15 15:15:09 -0700
committer	GitHub <noreply@github.com>	2022-09-15 15:15:09 -0700
commit	ac2521aa4c23e999715931331a817c41e416ddd2 (patch)
tree	059810af6a77e9e3e2a65b670e58368748b36e81 /Source/FieldSolver/SpectralSolver/SpectralHankelTransform
parent	04b6f67caab8ee95428a569c529110a12b2527f3 (diff)
download	WarpX-ac2521aa4c23e999715931331a817c41e416ddd2.tar.gz WarpX-ac2521aa4c23e999715931331a817c41e416ddd2.tar.zst WarpX-ac2521aa4c23e999715931331a817c41e416ddd2.zip