#!/bin/bash
#BSUB -P GEN109
#BSUB -W 0:10
#BSUB -nnodes 1
#BSUB -J WarpX
#BSUB -o WarpXo.%J
#BSUB -e WarpXe.%J

# Toolchain setup: load the PGI compiler module and the pinned CUDA 9.1.85
# module, one `module load` call per module.
for mod in pgi cuda/9.1.85; do
  module load "${mod}"
done
# Record the resulting module set in the job output.
module list
# From here on, trace every command as it is executed.
set -o xtrace

# Run configuration.
# omp is the OpenMP thread count; it is exported as OMP_NUM_THREADS and also
# substituted into the --bind=packed:<n> option of the jsrun launch line.
omp=1
OMP_NUM_THREADS="${omp}"
export OMP_NUM_THREADS

# Executable to launch. Exactly one EXE assignment should be active; the
# commented-out line selects the DEBUG build instead.
#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex"
EXE='../main3d.pgi.TPROF.MPI.ACC.CUDA.ex'

# Launcher prefix, kept as a single string that is word-split where it is
# used. Exactly one JSRUN assignment should be active; the commented-out lines
# are alternative layouts (different -n/-a/-g/-c values).
#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} "
#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} "
JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} "

# Per-job run directory, named <job name>-<job id> from the LSF-provided
# LSB_JOBNAME / LSB_JOBID environment variables. Everything below runs inside
# it, so a failure to create or enter it must stop the job instead of letting
# the run proceed in the submission directory.
rundir="${LSB_JOBNAME}-${LSB_JOBID}"
mkdir -- "${rundir}" || {
  printf 'ERROR: cannot create run directory %s\n' "${rundir}" >&2
  exit 1
}
# Keep a copy of this script next to the results (best effort: a failure here
# does not prevent the run).
cp -- "$0" "${rundir}"
# The executable is started with "inputs" as its argument from inside the run
# directory, so the run cannot work without this copy.
cp -- inputs "${rundir}" || {
  printf 'ERROR: cannot copy inputs into %s\n' "${rundir}" >&2
  exit 1
}
cd -- "${rundir}" || {
  printf 'ERROR: cannot change into run directory %s\n' "${rundir}" >&2
  exit 1
}

# 1. Run normally
# ${JSRUN} is intentionally left unquoted: it holds the launcher name and its
# options in one string and relies on word splitting to become separate
# arguments. ${EXE} is a single path and is quoted so it is always passed as
# exactly one argument.
# shellcheck disable=SC2086
${JSRUN} --smpiargs="-gpu" "${EXE}" inputs

# 2. Run under cuda-memcheck
# ${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt

# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt

# 4. Run under nvprof and store performance data in a nvvp file
# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs

# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set nsteps=2 in the inputs file
# 5. Run under nvprof and collect metrics for a subset of kernels
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs

# 6. Run under nvprof and collect metrics for all kernels -- much slower!
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs

# Copy this job's WarpX*.<job id> files (the names given by the #BSUB -o/-e
# directives) from the parent directory into the current directory. Only the
# wildcard is left unquoted; the job id is quoted so it is never split or
# globbed, and "--" stops option parsing before the file names.
cp -- ../WarpX*."${LSB_JOBID}" .