#!/bin/bash
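# LSF batch directives: -P project, -W walltime (hh:mm), -nnodes node count,
# -J job name, -o/-e stdout/stderr files (%J expands to the job ID).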
#BSUB -P GEN109
#BSUB -W 0:10
#BSUB -nnodes 1
#BSUB -J WarpX
#BSUB -o WarpXo.%J
#BSUB -e WarpXe.%J
module load pgi
module load cuda/9.1.85
module list
set -x                         # echo commands to the job log as they run
omp=1
export OMP_NUM_THREADS=${omp}  # one OpenMP thread per MPI rank
#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex"
EXE="../main3d.pgi.TPROF.MPI.ACC.CUDA.ex"
#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} "
#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} "
JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} "
rundir="${LSB_JOBNAME}-${LSB_JOBID}"
mkdir -p "${rundir}"
cp "$0" "${rundir}"       # keep a copy of this script with the run
cp inputs "${rundir}"     # ... and of the inputs file it used
cd "${rundir}" || exit 1
# 1. Run normally
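#    --smpiargs="-gpu" enables CUDA-aware (GPU direct) support in Spectrum MPI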
${JSRUN} --smpiargs="-gpu" ${EXE} inputs
# 2. Run under cuda-memcheck
#${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt
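#    cuda-memcheck flags invalid device memory accesses; expect a large slowdown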
# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt
# 4. Run under nvprof and store performance data in a nvvp file
# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp
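#    e.g. (the PID in the filename is run-specific, 12345 is a placeholder):
#    nvprof -i nvprof-timeline-12345.nvvp > nvprof-timeline.txt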
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs
# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set nsteps=2 in the inputs file.
# 5. Run under nvprof and collect metrics for a subset of kernels
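#    --kernels takes a filter expression; this one selects the current-deposition,
#    field-gather, and Boris particle-push kernels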
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs
# 6. Run under nvprof and collect metrics for all kernels -- much slower!
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs
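# Copy the job's stdout/stderr logs (WarpXo.<jobid>, WarpXe.<jobid>) from the
# submission directory into the run directory; they may still be incomplete
# while the job is running.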
cp ../WarpX*."${LSB_JOBID}" .