aboutsummaryrefslogtreecommitdiff
path: root/Source/ablastr/parallelization/MPIInitHelpers.cpp
diff options
context:
space:
mode:
authorGravatar Axel Huebl <axel.huebl@plasma.ninja> 2023-08-28 10:25:25 -0700
committerGravatar GitHub <noreply@github.com> 2023-08-28 10:25:25 -0700
commit f02ad26b531fb087b18db929ee328e074fb1b1ee (patch)
tree 0b15489e1e1bf973e18afc0ff5ba6839027d774a /Source/ablastr/parallelization/MPIInitHelpers.cpp
parent bacabae8ed0fdcc82d9f3ea9d82b7dc53691e3b8 (diff)
downloadWarpX-f02ad26b531fb087b18db929ee328e074fb1b1ee.tar.gz
WarpX-f02ad26b531fb087b18db929ee328e074fb1b1ee.tar.zst
WarpX-f02ad26b531fb087b18db929ee328e074fb1b1ee.zip
Work-Around: Segfault in MPI_Init with HIP (#4237)
* Work-Around: Segfault in MPI_Init with HIP
  See: https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#olcfdev-1655-occasional-seg-fault-during-mpi-init
* Move to ABLASTR
  All that counts is that HIP is initialized before GPU-aware MPI.
* Add Exception
Diffstat (limited to 'Source/ablastr/parallelization/MPIInitHelpers.cpp')
-rw-r--r-- Source/ablastr/parallelization/MPIInitHelpers.cpp | 24
1 files changed, 21 insertions, 3 deletions
diff --git a/Source/ablastr/parallelization/MPIInitHelpers.cpp b/Source/ablastr/parallelization/MPIInitHelpers.cpp
index 65e7525c0..633c004c9 100644
--- a/Source/ablastr/parallelization/MPIInitHelpers.cpp
+++ b/Source/ablastr/parallelization/MPIInitHelpers.cpp
@@ -1,7 +1,6 @@
-/* Copyright 2020 Axel Huebl
- *
- * This file is part of ABLASTR.
+/* This file is part of ABLASTR.
*
+ * Authors: Axel Huebl
* License: BSD-3-Clause-LBNL
*/
#include "MPIInitHelpers.H"
@@ -15,10 +14,19 @@
# include <mpi.h>
#endif
+// OLCFDEV-1655: Segfault during MPI_Init & in PMI_Allgather
+// https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#olcfdev-1655-occasional-seg-fault-during-mpi-init
+#if defined(AMREX_USE_HIP)
+#include <hip/hip_runtime.h>
+#endif
+
+#include <iostream>
#include <string>
#include <utility>
+#include <stdexcept>
#include <sstream>
+
namespace ablastr::parallelization
{
int
@@ -40,6 +48,16 @@ namespace ablastr::parallelization
std::pair< int, int >
mpi_init (int argc, char* argv[])
{
+ // OLCFDEV-1655: Segfault during MPI_Init & in PMI_Allgather
+ // https://docs.olcf.ornl.gov/systems/crusher_quick_start_guide.html#olcfdev-1655-occasional-seg-fault-during-mpi-init
+#if defined(AMREX_USE_HIP) && defined(AMREX_USE_MPI)
+ hipError_t hip_ok = hipInit(0);
+ if (hip_ok != hipSuccess) {
+ std::cerr << "hipInit failed with error code " << hip_ok << "! Aborting now.\n";
+ throw std::runtime_error("hipInit failed. Did not proceeding with MPI_Init_thread.");
+ }
+#endif
+
const int thread_required = mpi_thread_required();
#ifdef AMREX_USE_MPI
int thread_provided = -1;