aboutsummaryrefslogtreecommitdiff
path: root/Source/ablastr/utils/SignalHandling.cpp
diff options
context:
space:
mode:
authorGravatar Axel Huebl <axel.huebl@plasma.ninja> 2022-04-07 06:54:31 -0700
committerGravatar GitHub <noreply@github.com> 2022-04-07 06:54:31 -0700
commit85180bebc3fc82256c197102b5ec8e2da8070e5b (patch)
tree7c4b072dc06f1f795146df465276e40b23b46969 /Source/ablastr/utils/SignalHandling.cpp
parenta9311e50aaa6cb49e4aaf02edc7504f7778316aa (diff)
downloadWarpX-85180bebc3fc82256c197102b5ec8e2da8070e5b.tar.gz
WarpX-85180bebc3fc82256c197102b5ec8e2da8070e5b.tar.zst
WarpX-85180bebc3fc82256c197102b5ec8e2da8070e5b.zip
Fix MPI Signal Handling on Perlmutter (#3029)
* Fix: `MPI_CXX_BOOL` -> `MPI_BYTE` C99 types were added in MPI-2.2, while Cray's MPICH fork in version 8.1.13 defines `MPI_CXX_BOOL` to `MPI_DATATYPE_NULL` on Perlmutter. We could use `MPI_C_BOOL`, which is technically a `_Bool` from [<cstdbool>](https://en.cppreference.com/w/cpp/header/cstdbool) (deprecated: C++17; removed: C++20) - or we simply do a static assert on `sizeof(bool)` and communicate as a `MPI_BYTE` or `MPI_CHAR`. * Signals: Do no MPI Comms if none is configured * Docs: Link and Explain Allowed Signal Values * Review Comments & Style
Diffstat (limited to 'Source/ablastr/utils/SignalHandling.cpp')
-rw-r--r--Source/ablastr/utils/SignalHandling.cpp34
1 files changed, 27 insertions, 7 deletions
diff --git a/Source/ablastr/utils/SignalHandling.cpp b/Source/ablastr/utils/SignalHandling.cpp
index cdec9b653..e56faeb52 100644
--- a/Source/ablastr/utils/SignalHandling.cpp
+++ b/Source/ablastr/utils/SignalHandling.cpp
@@ -21,6 +21,7 @@
namespace ablastr::utils {
+bool SignalHandling::m_any_signal_action_active = false;
std::atomic<bool> SignalHandling::signal_received_flags[NUM_SIGNALS];
bool SignalHandling::signal_conf_requests[SIGNAL_REQUESTS_SIZE][NUM_SIGNALS];
bool SignalHandling::signal_actions_requested[SIGNAL_REQUESTS_SIZE];
@@ -29,7 +30,7 @@ MPI_Request SignalHandling::signal_mpi_ibcast_request;
#endif
int
-SignalHandling::parseSignalNameToNumber(const std::string &str)
+SignalHandling::parseSignalNameToNumber (const std::string &str)
{
amrex::IParser signals_parser(str);
@@ -111,7 +112,7 @@ SignalHandling::parseSignalNameToNumber(const std::string &str)
}
void
-SignalHandling::InitSignalHandling()
+SignalHandling::InitSignalHandling ()
{
#if defined(__linux__) || defined(__APPLE__)
struct sigaction sa;
@@ -134,12 +135,23 @@ SignalHandling::InitSignalHandling()
"Failed to install signal handler for a configured signal");
}
}
+
+ for (int signal_number = 0; signal_number < NUM_SIGNALS; ++signal_number) {
+ for (int signal_request = 0; signal_request < SIGNAL_REQUESTS_SIZE; ++signal_request) {
+ m_any_signal_action_active |= signal_conf_requests[signal_request][signal_number];
+ }
+ }
#endif
}
void
-SignalHandling::CheckSignals()
+SignalHandling::CheckSignals ()
{
+ // Is any signal handling action configured?
+ // If not, we can skip all handling and the MPI communication as well.
+ if (!m_any_signal_action_active)
+ return;
+
// We assume that signals will definitely be delivered to rank 0,
// and may be delivered to other ranks as well. For coordination,
// we process them according to when they're received by rank 0.
@@ -162,21 +174,29 @@ SignalHandling::CheckSignals()
#if defined(AMREX_USE_MPI)
auto comm = amrex::ParallelDescriptor::Communicator();
+ // Due to a bug in Cray's MPICH 8.1.13 implementation (CUDA builds on Perlmutter@NERSC in 2022),
+ // we cannot use the MPI_CXX_BOOL C++ datatype here. See WarpX PR #3029 and NERSC INC0183281
+ static_assert(sizeof(bool) == 1, "We communicate bools as 1 byte-sized type in MPI");
BL_MPI_REQUIRE(MPI_Ibcast(signal_actions_requested, SIGNAL_REQUESTS_SIZE,
- MPI_CXX_BOOL, 0, comm,&signal_mpi_ibcast_request));
+ MPI_BYTE, 0, comm,&signal_mpi_ibcast_request));
#endif
}
void
-SignalHandling::WaitSignals()
+SignalHandling::WaitSignals ()
{
+ // Is any signal handling action configured?
+ // If not, we can skip all handling and the MPI communication as well.
+ if (!m_any_signal_action_active)
+ return;
+
#if defined(AMREX_USE_MPI)
BL_MPI_REQUIRE(MPI_Wait(&signal_mpi_ibcast_request, MPI_STATUS_IGNORE));
#endif
}
bool
-SignalHandling::TestAndResetActionRequestFlag(int action_to_test)
+SignalHandling::TestAndResetActionRequestFlag (int action_to_test)
{
bool retval = signal_actions_requested[action_to_test];
signal_actions_requested[action_to_test] = false;
@@ -184,7 +204,7 @@ SignalHandling::TestAndResetActionRequestFlag(int action_to_test)
}
void
-SignalHandling::SignalSetFlag(int signal_number)
+SignalHandling::SignalSetFlag (int signal_number)
{
signal_received_flags[signal_number] = true;
}