Diffstat (limited to 'Tools/batchScripts')
-rw-r--r-- | Tools/batchScripts/batch_cori.sh | 33
-rw-r--r-- | Tools/batchScripts/batch_summit.sh | 16
-rwxr-xr-x | Tools/batchScripts/script_profiling_summit.sh | 48
3 files changed, 97 insertions, 0 deletions
diff --git a/Tools/batchScripts/batch_cori.sh b/Tools/batchScripts/batch_cori.sh
new file mode 100644
index 000000000..e6cd5e1ef
--- /dev/null
+++ b/Tools/batchScripts/batch_cori.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -l
+
+#SBATCH -N 2
+#SBATCH -t 01:00:00
+#SBATCH -q regular
+#SBATCH -C knl
+#SBATCH -S 4
+#SBATCH -J <job name>
+#SBATCH -A <allocation ID>
+#SBATCH -e error.txt
+#SBATCH -o output.txt
+
+export OMP_PLACES=threads
+export OMP_PROC_BIND=spread
+
+# KNLs have 4 hyperthreads max
+export CORI_MAX_HYPERTHREAD_LEVEL=4
+# We use 64 cores out of the 68 available on Cori KNL,
+# and leave 4 to the system (see "#SBATCH -S 4" above).
+export CORI_NCORES_PER_NODE=64
+
+# Typically use 8 MPI ranks per node without hyperthreading,
+# i.e., OMP_NUM_THREADS=8
+export WARPX_NMPI_PER_NODE=8
+export WARPX_HYPERTHREAD_LEVEL=1
+
+# Compute OMP_NUM_THREADS and the thread count (-c option)
+export CORI_NHYPERTHREADS_MAX=$(( ${CORI_MAX_HYPERTHREAD_LEVEL} * ${CORI_NCORES_PER_NODE} ))
+export WARPX_NTHREADS_PER_NODE=$(( ${WARPX_HYPERTHREAD_LEVEL} * ${CORI_NCORES_PER_NODE} ))
+export OMP_NUM_THREADS=$(( ${WARPX_NTHREADS_PER_NODE} / ${WARPX_NMPI_PER_NODE} ))
+export WARPX_THREAD_COUNT=$(( ${CORI_NHYPERTHREADS_MAX} / ${WARPX_NMPI_PER_NODE} ))
+
+srun --cpu_bind=cores -n $(( ${SLURM_JOB_NUM_NODES} * ${WARPX_NMPI_PER_NODE} )) -c ${WARPX_THREAD_COUNT} <path/to/executable> <input file>
diff --git a/Tools/batchScripts/batch_summit.sh b/Tools/batchScripts/batch_summit.sh
new file mode 100644
index 000000000..002660b91
--- /dev/null
+++ b/Tools/batchScripts/batch_summit.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#BSUB -P <allocation ID>
+#BSUB -W 00:10
+#BSUB -nnodes 2
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+
+module load pgi
+module load cuda
+
+omp=1
+export OMP_NUM_THREADS=${omp}
+
+num_nodes=$(( $(printf '%s\n' ${LSB_HOSTS} | sort -u | wc -l) - 1 ))
+jsrun -n ${num_nodes} -a 6 -g 6 -c 6 --bind=packed:${omp} <path/to/executable> <input file> > output.txt
diff --git a/Tools/batchScripts/script_profiling_summit.sh b/Tools/batchScripts/script_profiling_summit.sh
new file mode 100755
index 000000000..cd6b0eadd
--- /dev/null
+++ b/Tools/batchScripts/script_profiling_summit.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#BSUB -P GEN109
+#BSUB -W 0:10
+#BSUB -nnodes 1
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+
+module load pgi
+module load cuda/9.1.85
+module list
+set -x
+
+omp=1
+export OMP_NUM_THREADS=${omp}
+#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex"
+EXE="../main3d.pgi.TPROF.MPI.ACC.CUDA.ex"
+#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} "
+#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} "
+JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} "
+
+rundir="${LSB_JOBNAME}-${LSB_JOBID}"
+mkdir $rundir
+cp $0 $rundir
+cp inputs $rundir
+cd $rundir
+
+# 1. Run normally
+${JSRUN} --smpiargs="-gpu" ${EXE} inputs
+
+# 2. Run under cuda-memcheck
+# ${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt
+
+# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt
+
+# 4. Run under nvprof and store performance data in an nvvp file
+# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs
+
+# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set nsteps=2 in the inputs file
+# 5. Run under nvprof and collect metrics for a subset of kernels
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs
+
+# 6. Run under nvprof and collect metrics for all kernels -- much slower!
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs
+
+cp ../WarpX*.${LSB_JOBID} .
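For reference, the thread-count arithmetic in batch_cori.sh works out as follows with the values set above (64 usable cores per KNL node, a hyperthread level of 1 out of a maximum of 4, and 8 MPI ranks per node). This is a sketch for checking the numbers only, not part of the committed scripts:

#!/bin/bash
# Reproduce the thread-count arithmetic of batch_cori.sh with the values used above.
CORI_MAX_HYPERTHREAD_LEVEL=4   # KNL supports 4 hyperthreads per core
CORI_NCORES_PER_NODE=64        # 64 of the 68 cores are used; 4 are left to the system
WARPX_NMPI_PER_NODE=8          # MPI ranks per node
WARPX_HYPERTHREAD_LEVEL=1      # no hyperthreading

CORI_NHYPERTHREADS_MAX=$(( CORI_MAX_HYPERTHREAD_LEVEL * CORI_NCORES_PER_NODE ))  # 256
WARPX_NTHREADS_PER_NODE=$(( WARPX_HYPERTHREAD_LEVEL * CORI_NCORES_PER_NODE ))    # 64
OMP_NUM_THREADS=$(( WARPX_NTHREADS_PER_NODE / WARPX_NMPI_PER_NODE ))             # 8 OpenMP threads per rank
WARPX_THREAD_COUNT=$(( CORI_NHYPERTHREADS_MAX / WARPX_NMPI_PER_NODE ))           # 32, passed to "srun -c"

echo "OMP_NUM_THREADS=${OMP_NUM_THREADS}, srun -c ${WARPX_THREAD_COUNT}"
# With "#SBATCH -N 2", srun starts 2 * 8 = 16 MPI ranks in total.

The Slurm script is submitted with sbatch and the LSF scripts with bsub. On Summit, the jsrun line in batch_summit.sh requests one resource set per compute node, each with 6 MPI tasks, 6 GPUs, and 6 cores, matching the 6 GPUs of a Summit node.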