diff options
author | 2022-04-07 06:54:31 -0700 | |
---|---|---|
committer | 2022-04-07 06:54:31 -0700 | |
commit | 85180bebc3fc82256c197102b5ec8e2da8070e5b (patch) | |
tree | 7c4b072dc06f1f795146df465276e40b23b46969 /Source/ablastr/utils/SignalHandling.cpp | |
parent | a9311e50aaa6cb49e4aaf02edc7504f7778316aa (diff) | |
download | WarpX-85180bebc3fc82256c197102b5ec8e2da8070e5b.tar.gz WarpX-85180bebc3fc82256c197102b5ec8e2da8070e5b.tar.zst WarpX-85180bebc3fc82256c197102b5ec8e2da8070e5b.zip |
Fix MPI Signal Handling on Perlmutter (#3029)
* Fix: `MPI_CXX_BOOL` -> `MPI_BYTE`
C99 types were added in MPI-2.2, while Cray's MPICH fork in version
8.1.13 defines `MPI_CXX_BOOL` to `MPI_DATATYPE_NULL` on Perlmutter.
We could use `MPI_C_BOOL`, which is technically a `_Bool` from
[<cstdbool>](https://en.cppreference.com/w/cpp/header/cstdbool)
(deprecated: C++17; removed: C++20) - or we simply do a static
assert on `sizeof(bool)` and communicate as a `MPI_BYTE` or
`MPI_CHAR`.
* Signals: Do no MPI Comms if none is configured
* Docs: Link and Explain Allowed Signal Values
* Review Comments & Style
Diffstat (limited to 'Source/ablastr/utils/SignalHandling.cpp')
-rw-r--r-- | Source/ablastr/utils/SignalHandling.cpp | 34 |
1 files changed, 27 insertions, 7 deletions
diff --git a/Source/ablastr/utils/SignalHandling.cpp b/Source/ablastr/utils/SignalHandling.cpp index cdec9b653..e56faeb52 100644 --- a/Source/ablastr/utils/SignalHandling.cpp +++ b/Source/ablastr/utils/SignalHandling.cpp @@ -21,6 +21,7 @@ namespace ablastr::utils { +bool SignalHandling::m_any_signal_action_active = false; std::atomic<bool> SignalHandling::signal_received_flags[NUM_SIGNALS]; bool SignalHandling::signal_conf_requests[SIGNAL_REQUESTS_SIZE][NUM_SIGNALS]; bool SignalHandling::signal_actions_requested[SIGNAL_REQUESTS_SIZE]; @@ -29,7 +30,7 @@ MPI_Request SignalHandling::signal_mpi_ibcast_request; #endif int -SignalHandling::parseSignalNameToNumber(const std::string &str) +SignalHandling::parseSignalNameToNumber (const std::string &str) { amrex::IParser signals_parser(str); @@ -111,7 +112,7 @@ SignalHandling::parseSignalNameToNumber(const std::string &str) } void -SignalHandling::InitSignalHandling() +SignalHandling::InitSignalHandling () { #if defined(__linux__) || defined(__APPLE__) struct sigaction sa; @@ -134,12 +135,23 @@ SignalHandling::InitSignalHandling() "Failed to install signal handler for a configured signal"); } } + + for (int signal_number = 0; signal_number < NUM_SIGNALS; ++signal_number) { + for (int signal_request = 0; signal_request < SIGNAL_REQUESTS_SIZE; ++signal_request) { + m_any_signal_action_active |= signal_conf_requests[signal_request][signal_number]; + } + } #endif } void -SignalHandling::CheckSignals() +SignalHandling::CheckSignals () { + // Is any signal handling action configured? + // If not, we can skip all handling and the MPI communication as well. + if (!m_any_signal_action_active) + return; + // We assume that signals will definitely be delivered to rank 0, // and may be delivered to other ranks as well. For coordination, // we process them according to when they're received by rank 0. 
@@ -162,21 +174,29 @@ SignalHandling::CheckSignals() #if defined(AMREX_USE_MPI) auto comm = amrex::ParallelDescriptor::Communicator(); + // Due to a bug in Cray's MPICH 8.1.13 implementation (CUDA builds on Perlmutter@NERSC in 2022), + // we cannot use the MPI_CXX_BOOL C++ datatype here. See WarpX PR #3029 and NERSC INC0183281 + static_assert(sizeof(bool) == 1, "We communicate bools as 1 byte-sized type in MPI"); BL_MPI_REQUIRE(MPI_Ibcast(signal_actions_requested, SIGNAL_REQUESTS_SIZE, - MPI_CXX_BOOL, 0, comm,&signal_mpi_ibcast_request)); + MPI_BYTE, 0, comm,&signal_mpi_ibcast_request)); #endif } void -SignalHandling::WaitSignals() +SignalHandling::WaitSignals () { + // Is any signal handling action configured? + // If not, we can skip all handling and the MPI communication as well. + if (!m_any_signal_action_active) + return; + #if defined(AMREX_USE_MPI) BL_MPI_REQUIRE(MPI_Wait(&signal_mpi_ibcast_request, MPI_STATUS_IGNORE)); #endif } bool -SignalHandling::TestAndResetActionRequestFlag(int action_to_test) +SignalHandling::TestAndResetActionRequestFlag (int action_to_test) { bool retval = signal_actions_requested[action_to_test]; signal_actions_requested[action_to_test] = false; @@ -184,7 +204,7 @@ SignalHandling::TestAndResetActionRequestFlag(int action_to_test) } void -SignalHandling::SignalSetFlag(int signal_number) +SignalHandling::SignalSetFlag (int signal_number) { signal_received_flags[signal_number] = true; } |