aboutsummaryrefslogtreecommitdiff
path: root/Tools/BatchScripts/script_profiling_summit.sh
blob: 167ae9d313e609f653bca3f1208a7fd3193d3323 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/bin/bash

# Copyright 2019 Andrew Myers, Maxence Thevenet
#
# This file is part of WarpX.
#
# License: BSD-3-Clause-LBNL

#BSUB -P GEN109
#BSUB -W 0:10
#BSUB -nnodes 1
#BSUB -J WarpX
#BSUB -o WarpXo.%J
#BSUB -e WarpXe.%J

# Default permissions for everything the job creates: full access for the
# owner, read (and directory search) for the group, nothing for others.
umask u=rwx,g=rx,o=

# Workaround for collective problems seen since the RHEL8 update
# (ticket OLCFHELP-3545): skip the IBM-optimized barriers so that HCOLL or
# OMPI's own barrier implementations are used instead.
OMPI_MCA_coll_ibm_skip_barrier=true
export OMPI_MCA_coll_ibm_skip_barrier

# Optional toolchain setup, kept for reference:
#module load pgi
#module load cuda/9.1.85
#module list

# From here on, echo every command into the job's stderr log.
set -x

# Thread count handed to OpenMP; also reused in the launcher binding below.
omp=1
export OMP_NUM_THREADS="${omp}"

# Executable to run, relative to the run directory the script changes into.
# Debug build alternative:
#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex"
EXE='../main3d.pgi.TPROF.MPI.ACC.CUDA.ex'

# Launcher command line. It is stored as a single string (with a trailing
# space) and expanded unquoted at the call site, so it must stay
# whitespace-separated. Alternative layouts:
#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} "
#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} "
JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} "

# Create a per-job run directory named "<job name>-<job id>" (both taken from
# the LSB_JOBNAME / LSB_JOBID environment variables), archive this script and
# the inputs file in it, and make it the working directory.
rundir="${LSB_JOBNAME}-${LSB_JOBID}"

# -p keeps an already existing directory usable; any other failure is fatal
# because everything below would otherwise land in the submission directory.
mkdir -p -- "${rundir}" || {
  echo "error: cannot create run directory '${rundir}'" >&2
  exit 1
}

# Keeping a copy of the job script is best-effort: a failure here is reported
# by cp itself but does not stop the run.
cp -- "$0" "${rundir}"

# The run reads 'inputs' from the run directory, so this copy must succeed.
cp -- inputs "${rundir}" || {
  echo "error: cannot copy 'inputs' into '${rundir}'" >&2
  exit 1
}

# Abort instead of running in the wrong directory: the relative '../' paths
# used afterwards are only valid from inside the run directory.
cd -- "${rundir}" || {
  echo "error: cannot change into run directory '${rundir}'" >&2
  exit 1
}

# Exactly one of the numbered run variants below is meant to be active.
# ${JSRUN} is deliberately left unquoted: it holds the launcher and its
# options as one whitespace-separated string that must be word-split.

# 1. Run normally
# shellcheck disable=SC2086
${JSRUN} --smpiargs="-gpu" "${EXE}" inputs

# 2. Run under cuda-memcheck
# ${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt

# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt

# 4. Run under nvprof and store performance data in a nvvp file
# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs

# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set nsteps=2 in the inputs files
# 5. Run under nvprof and collect metrics for a subset of kernels
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs

# 6. Run under nvprof and collect metrics for all kernels -- much slower!
#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs

# Remember the status of the run variant executed above; it must stay directly
# after the run commands so that $? still refers to the launcher.
run_status=$?

# Copy the job's stdout/stderr files (WarpXo.<jobid> / WarpXe.<jobid>, as
# named by the #BSUB -o/-e directives) from the parent directory into the
# current one. This is best-effort and must not decide the job's exit status.
cp ../WarpX*."${LSB_JOBID}" .

# Report the run's own result to the scheduler instead of the status of the
# log copy, so a failed run is not recorded as a successful job.
exit "${run_status}"