Diffstat (limited to 'Tools/batchScripts')
-rw-r--r-- | Tools/batchScripts/batch_cori.sh | 33
-rw-r--r-- | Tools/batchScripts/batch_summit.sh | 16
-rwxr-xr-x | Tools/batchScripts/script_profiling_summit.sh | 48
3 files changed, 97 insertions, 0 deletions
diff --git a/Tools/batchScripts/batch_cori.sh b/Tools/batchScripts/batch_cori.sh
new file mode 100644
index 000000000..e6cd5e1ef
--- /dev/null
+++ b/Tools/batchScripts/batch_cori.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -l
+
+#SBATCH -N 2
+#SBATCH -t 01:00:00
+#SBATCH -q regular
+#SBATCH -C knl
+#SBATCH -S 4
+#SBATCH -J <job name>
+#SBATCH -A <allocation ID>
+#SBATCH -e error.txt
+#SBATCH -o output.txt
+
+export OMP_PLACES=threads
+export OMP_PROC_BIND=spread
+
+# KNLs have 4 hyperthreads max
+export CORI_MAX_HYPERTHREAD_LEVEL=4
+# We use 64 cores out of the 68 available on Cori KNL,
+# and leave 4 to the system (see "#SBATCH -S 4" above).
+export CORI_NCORES_PER_NODE=64
+
+# Typically use 8 MPI ranks per node without hyperthreading,
+# i.e., OMP_NUM_THREADS=8
+export WARPX_NMPI_PER_NODE=8
+export WARPX_HYPERTHREAD_LEVEL=1
+
+# Compute OMP_NUM_THREADS and the thread count (-c option)
+export CORI_NHYPERTHREADS_MAX=$(( ${CORI_MAX_HYPERTHREAD_LEVEL} * ${CORI_NCORES_PER_NODE} ))
+export WARPX_NTHREADS_PER_NODE=$(( ${WARPX_HYPERTHREAD_LEVEL} * ${CORI_NCORES_PER_NODE} ))
+export OMP_NUM_THREADS=$(( ${WARPX_NTHREADS_PER_NODE} / ${WARPX_NMPI_PER_NODE} ))
+export WARPX_THREAD_COUNT=$(( ${CORI_NHYPERTHREADS_MAX} / ${WARPX_NMPI_PER_NODE} ))
+
+srun --cpu_bind=cores -n $(( ${SLURM_JOB_NUM_NODES} * ${WARPX_NMPI_PER_NODE} )) -c ${WARPX_THREAD_COUNT} <path/to/executable> <input file>
diff --git a/Tools/batchScripts/batch_summit.sh b/Tools/batchScripts/batch_summit.sh
new file mode 100644
index 000000000..002660b91
--- /dev/null
+++ b/Tools/batchScripts/batch_summit.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#BSUB -P <allocation ID>
+#BSUB -W 00:10
+#BSUB -nnodes 2
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+
+module load pgi
+module load cuda
+
+omp=1
+export OMP_NUM_THREADS=${omp}
+
+num_nodes=$(( $(printf '%s\n' ${LSB_HOSTS} | sort -u | wc -l) - 1 ))
+jsrun -n ${num_nodes} -a 6 -g 6 -c 6 --bind=packed:${omp} <path/to/executable> <input file> > output.txt
diff --git a/Tools/batchScripts/script_profiling_summit.sh b/Tools/batchScripts/script_profiling_summit.sh
new file mode 100755
index 000000000..cd6b0eadd
--- /dev/null
+++ b/Tools/batchScripts/script_profiling_summit.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#BSUB -P GEN109
+#BSUB -W 0:10
+#BSUB -nnodes 1
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+
+module load pgi
+module load cuda/9.1.85
+module list
+set -x
+
+omp=1
+export OMP_NUM_THREADS=${omp}
+#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex"
+EXE="../main3d.pgi.TPROF.MPI.ACC.CUDA.ex"
+#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} "
+#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} "
+JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} "
+
+rundir="${LSB_JOBNAME}-${LSB_JOBID}"
+mkdir $rundir
+cp $0 $rundir
+cp inputs $rundir
+cd $rundir
+
+# 1. Run normally
+${JSRUN} --smpiargs="-gpu" ${EXE} inputs
+
+# 2. Run under cuda-memcheck
+# ${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt
+
+# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt
+
+# 4. Run under nvprof and store performance data in an nvvp file
+# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs
+
+# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set nsteps=2 in the inputs file
+# 5. Run under nvprof and collect metrics for a subset of kernels
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs
+
+# 6. Run under nvprof and collect metrics for all kernels -- much slower!
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs
+
+cp ../WarpX*.${LSB_JOBID} .
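For reference, the thread-count arithmetic in batch_cori.sh works out as follows with the values set above (64 usable cores per KNL node, a hyperthread level of 1 out of a maximum of 4, and 8 MPI ranks per node). This is a sketch for checking the numbers only, not part of the committed scripts:

#!/bin/bash
# Reproduce the thread-count arithmetic of batch_cori.sh with the values used above.
CORI_MAX_HYPERTHREAD_LEVEL=4   # KNL supports 4 hyperthreads per core
CORI_NCORES_PER_NODE=64        # 64 of the 68 cores are used; 4 are left to the system
WARPX_NMPI_PER_NODE=8          # MPI ranks per node
WARPX_HYPERTHREAD_LEVEL=1      # no hyperthreading

CORI_NHYPERTHREADS_MAX=$(( CORI_MAX_HYPERTHREAD_LEVEL * CORI_NCORES_PER_NODE ))  # 256
WARPX_NTHREADS_PER_NODE=$(( WARPX_HYPERTHREAD_LEVEL * CORI_NCORES_PER_NODE ))    # 64
OMP_NUM_THREADS=$(( WARPX_NTHREADS_PER_NODE / WARPX_NMPI_PER_NODE ))             # 8 OpenMP threads per rank
WARPX_THREAD_COUNT=$(( CORI_NHYPERTHREADS_MAX / WARPX_NMPI_PER_NODE ))           # 32, passed to "srun -c"

echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}, srun -c ${WARPX_THREAD_COUNT}"
# With "#SBATCH -N 2", srun starts 2 * 8 = 16 MPI ranks in total.

The Slurm script is submitted with sbatch and the LSF scripts with bsub. On Summit, the jsrun line in batch_summit.sh requests one resource set per compute node, each with 6 MPI tasks, 6 GPUs, and 6 cores, matching the 6 GPUs of a Summit node.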