#!/bin/bash -l
#
# Slurm batch script that launches a hybrid MPI + OpenMP WarpX run on Quartz.
#
# Before submitting with sbatch:
#   * replace ALLOCATION_ID on the "-A" directive with your allocation/account
#   * set EXE (near the bottom) to the executable to run, e.g. ./warpx
#
# Keep every "#SBATCH" directive above the first executable command:
# sbatch stops scanning for directives once it reaches one.

# Just increase this number if you need more nodes.
#SBATCH -N 2
#SBATCH -t 24:00:00
#SBATCH -A ALLOCATION_ID
#SBATCH -J WarpX
# pbatch is selected as the partition with -p; in sbatch, -q is the short
# form of --qos and would simply be overridden by the --qos line below
#SBATCH -p pbatch
#SBATCH --qos=normal
#SBATCH --license=lustre1,lustre2
#SBATCH --export=ALL
#SBATCH -e error.txt
#SBATCH -o output.txt
# one MPI rank per half-socket (see below)
#SBATCH --tasks-per-node=2
# request all logical (virtual) cores per half-socket
#SBATCH --cpus-per-task=18

# each Quartz node has 1 socket of Intel Xeon E5-2695 v4
# each Xeon CPU is divided into 2 bus rings that each have direct L3 access
# NOTE(review): the layout below relies on the one-CPU-per-node statement
#   above; confirm it against the current Quartz hardware documentation.
# Keep this value in sync with --tasks-per-node.
export WARPX_NMPI_PER_NODE=2

# each MPI rank per half-socket has 9 physical cores
#   or 18 logical (virtual) cores
# over-subscribing each physical core with 2x
#   hyperthreading led to a slight (3.5%) speedup on Cori's Intel Xeon
#   E5-2698 v3, so we do the same here
# the settings below make sure threads are close to the
#   controlling MPI rank (process) per half socket and
#   distribute equally over close-by physical cores and,
#   for N>9, also equally over close-by logical cores
# Keep OMP_NUM_THREADS in sync with --cpus-per-task.
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
export OMP_NUM_THREADS=18

# Executable to launch; must be filled in before submitting.
EXE=""  # e.g. ./warpx

# Total MPI ranks = allocated nodes x ranks per node.
# SLURM_JOB_NUM_NODES is provided by Slurm inside the job; bare names are
# used inside $(( )) so an unset variable evaluates to 0 (and srun rejects
# the rank count) instead of producing a bash arithmetic syntax error.
total_ranks=$(( SLURM_JOB_NUM_NODES * WARPX_NMPI_PER_NODE ))

# EXE is intentionally left unquoted so that it may carry trailing
# arguments (word splitting applies).
# srun is the last command, so its exit status becomes the job's status.
# shellcheck disable=SC2086
srun --cpu_bind=cores -n "${total_ranks}" ${EXE}