Diffstat (limited to 'Tools/BatchScripts')
-rw-r--r--  Tools/BatchScripts/batch_lassen.sh  | 22
-rw-r--r--  Tools/BatchScripts/batch_quartz.sh  | 40
2 files changed, 62 insertions, 0 deletions
diff --git a/Tools/BatchScripts/batch_lassen.sh b/Tools/BatchScripts/batch_lassen.sh
new file mode 100644
index 000000000..0fd2500c5
--- /dev/null
+++ b/Tools/BatchScripts/batch_lassen.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Copyright 2020 Axel Huebl
+#
+# This file is part of WarpX.
+#
+# License: BSD-3-Clause-LBNL
+#
+# Refs.:
+# https://jsrunvisualizer.olcf.ornl.gov/?s4f0o11n6c7g1r11d1b1l0=
+# https://hpc.llnl.gov/training/tutorials/using-lcs-sierra-system#quick16
+
+#BSUB -G <allocation ID>
+#BSUB -W 00:10
+#BSUB -nnodes 2
+#BSUB -alloc_flags smt4
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+
+export OMP_NUM_THREADS=1
+jsrun -r 4 -a 1 -g 1 -c 7 -l GPU-CPU -d packed -b rs -M "-gpu" <path/to/executable> <input file> > output.txt
diff --git a/Tools/BatchScripts/batch_quartz.sh b/Tools/BatchScripts/batch_quartz.sh
new file mode 100644
index 000000000..4c1a82ff8
--- /dev/null
+++ b/Tools/BatchScripts/batch_quartz.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -l
+
+# Just increase this number if you need more nodes.
+#SBATCH -N 2
+#SBATCH -t 24:00:00
+#SBATCH -A <allocation ID>
+
+#SBATCH -J WarpX
+#SBATCH -q pbatch
+#SBATCH --qos=normal
+#SBATCH --license=lustre1,lustre2
+#SBATCH --export=ALL
+#SBATCH -e error.txt
+#SBATCH -o output.txt
+# one MPI rank per half-socket (see below)
+#SBATCH --tasks-per-node=2
+# request all logical (virtual) cores per half-socket
+#SBATCH --cpus-per-task=18
+
+
+# each Quartz node has 1 socket of Intel Xeon E5-2695 v4
+# each Xeon CPU is divided into 2 bus rings that each have direct L3 access
+export WARPX_NMPI_PER_NODE=2
+
+# each MPI rank per half-socket has 9 physical cores
+# or 18 logical (virtual) cores
+# over-subscribing each physical core with 2x
+# hyperthreading led to a slight (3.5%) speedup on Cori's Intel Xeon E5-2698 v3,
+# so we do the same here
+# the settings below make sure threads are close to the
+# controlling MPI rank (process) per half socket and
+# distribute equally over close-by physical cores and,
+# for N>9, also equally over close-by logical cores
+export OMP_PROC_BIND=spread
+export OMP_PLACES=threads
+export OMP_NUM_THREADS=18
+
+EXE="<path/to/executable>" # e.g. ./warpx
+
+srun --cpu_bind=cores -n $(( ${SLURM_JOB_NUM_NODES} * ${WARPX_NMPI_PER_NODE} )) ${EXE} <input file>
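
For context, a minimal submission sketch for the Lassen script, assuming the <allocation ID>, <path/to/executable>, and <input file> placeholders have been filled in first. The jsrun line requests 4 resource sets per node (1 MPI rank, 1 GPU, 7 cores each), i.e. 8 GPUs across the 2 requested nodes, and bsub picks up the #BSUB directives from the script itself.

# submit the LSF job script on Lassen (directives are read from the script)
bsub < batch_lassen.sh
# check the queue; the LSF job log lands in WarpXo.<jobID> / WarpXe.<jobID>,
# while the simulation stdout is redirected to output.txt by the jsrun line
bjobs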
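
Likewise, a minimal submission sketch for the Quartz script, again assuming the placeholders have been edited. With -N 2 and WARPX_NMPI_PER_NODE=2, the srun line computes 2 * 2 = 4 MPI ranks in total, each of which spawns 18 OpenMP threads.

# submit the Slurm job script on Quartz
sbatch batch_quartz.sh
# monitor the job; stdout/stderr are written to output.txt / error.txt
squeue -u $USER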