#!/bin/bash -l

# Copyright 2021 Axel Huebl
# This file is part of WarpX.
# License: BSD-3-Clause-LBNL
#
# Ref:
# - https://docs-dev.nersc.gov/cgpu/hardware/
# - https://docs-dev.nersc.gov/cgpu/access/
# - https://docs-dev.nersc.gov/cgpu/usage/#controlling-task-and-gpu-binding

# Just increase this number if you need more nodes.
#SBATCH -N 2
#SBATCH -t 03:00:00
#SBATCH -J <job name>
#SBATCH -A m1759
#SBATCH -q regular
#SBATCH -C gpu
# 8 V100 GPUs (16 GB) per node
#SBATCH --gres=gpu:8
#SBATCH --exclusive
# one MPI rank per GPU (a quarter-socket)
#SBATCH --tasks-per-node=8
# request all logical (virtual) cores per quarter-socket
#SBATCH --cpus-per-task=10
#SBATCH -e WarpX.e%j
#SBATCH -o WarpX.o%j

# each Cori GPU node has 2 sockets of Intel Xeon Gold 6148 ('Skylake') @ 2.40 GHz
export WARPX_NMPI_PER_NODE=8

# each half-socket has 10 physical cores
#   or 20 logical (virtual) cores
# we split each half-socket again by 2 to get one MPI rank per GPU
# over-subscribing each physical core with 2x
#   hyperthreading often leads to a slight speedup on Intel
# the settings below make sure threads stay close to the
#   controlling MPI rank (process) per half-socket and
#   distribute equally over close-by physical cores and,
#   for N>20, also equally over close-by logical cores
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
export OMP_NUM_THREADS=10

# for async_io support: (optional)
export MPICH_MAX_THREAD_SAFETY=multiple

# path to the WarpX executable (and its inputs file) to run
EXE="<path/to/executable>"

# one MPI rank per GPU, pinned to its quarter-socket and its GPU;
# the total rank count is nodes x ranks-per-node (with -N 2 above: 2 x 8 = 16)
srun --cpu_bind=cores --gpus-per-task=1 --gpu-bind=map_gpu:0,1,2,3,4,5,6,7 \
  -n $(( ${SLURM_JOB_NUM_NODES} * ${WARPX_NMPI_PER_NODE} )) \
  ${EXE} \
  > output.txt
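
# Example submission from a Cori login node (a sketch; the file name
# "cori_gpu.sbatch" is only a placeholder for wherever this script is saved):
#
#   sbatch cori_gpu.sbatch
#   squeue -u $USER     # monitor the job
#   tail -f output.txt  # follow the redirected simulation output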