2 files changed, 117 insertions, 0 deletions
diff --git a/Examples/Tests/gpu_test/inputs b/Examples/Tests/gpu_test/inputs
new file mode 100644
index 000000000..18bb80d26
--- /dev/null
+++ b/Examples/Tests/gpu_test/inputs
@@ -0,0 +1,69 @@
+# Maximum number of time steps
+max_step = 10
+
+# Number of grid cells along each direction
+amr.n_cell =  64 64 64
+
+# Maximum allowable size of each subdomain in the problem domain;
+#    this is used to decompose the domain for parallel calculations.
+amr.max_grid_size = 64
+
+# Maximum level in hierarchy (for now must be 0, i.e., one level in total)
+amr.max_level = 0
+
+# Geometry
+geometry.coord_sys   = 0                  # 0: Cartesian
+geometry.is_periodic = 1     1     1      # Is periodic?
+geometry.prob_lo     = -20.e-6   -20.e-6   -20.e-6    # physical domain
+geometry.prob_hi     =  20.e-6    20.e-6    20.e-6
+
+warpx.serialize_ics = 1
+warpx.do_pml = 0
+
+# Verbosity
+warpx.verbose = 1
+
+# Algorithms
+algo.current_deposition = 3
+algo.charge_deposition = 0
+algo.field_gathering = 1
+algo.particle_pusher = 0
+
+interpolation.nox = 1
+interpolation.noy = 1
+interpolation.noz = 1
+
+particles.do_tiling = 0
+
+# CFL
+warpx.cfl = 1.0
+
+amr.plot_int = -10
+
+particles.nspecies = 1
+particles.species_names = electrons
+
+electrons.charge = -q_e
+electrons.mass = m_e
+electrons.injection_style = "NUniformPerCell"
+electrons.num_particles_per_cell_each_dim = 2 2 2
+electrons.profile = constant
+electrons.density = 1.e25  # number of electrons per m^3
+electrons.momentum_distribution_type = "gaussian"
+electrons.ux_th  = 0.01 # Standard deviation of the (unitless) momentum along x
+electrons.uy_th  = 0.01 # Standard deviation of the (unitless) momentum along y
+electrons.uz_th  = 0.01 # Standard deviation of the (unitless) momentum along z
+electrons.uz_m   = 10.  # Mean momentum along z (unitless)
+
+# Laser
+warpx.use_laser    = 1
+laser.profile      = Gaussian
+laser.position     = 0. 0. 0.e-6 # This point is on the laser plane
+laser.direction    = 0. 0. 1.     # The plane normal direction
+laser.polarization = 1. 0. 0.    # The main polarization vector
+laser.e_max        = 16.e12        # Maximum amplitude of the laser field (in V/m)
+laser.profile_waist = 3.e-6      # The waist of the laser (in meters)
+laser.profile_duration = 15.e-15  # The duration of the laser (in seconds)
+laser.profile_t_peak = 30.e-15    # The time at which the laser reaches its peak (in seconds)
+laser.profile_focal_distance = 100.e-6  # Focal distance from the antenna (in meters)
+laser.wavelength = 0.8e-6         # The wavelength of the laser (in meters)
diff --git a/Examples/Tests/gpu_test/script.sh b/Examples/Tests/gpu_test/script.sh
new file mode 100755
index 000000000..cd6b0eadd
--- /dev/null
+++ b/Examples/Tests/gpu_test/script.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+#BSUB -P GEN109
+#BSUB -W 0:10
+#BSUB -nnodes 1
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+# LSF batch script: runs ${EXE} on the local "inputs" file through jsrun, inside a per-job run directory.
+module load pgi
+module load cuda/9.1.85
+module list
+set -x  # echo each command before it runs
+
+omp=1  # thread count: exported as OMP_NUM_THREADS and passed to --bind=packed
+export OMP_NUM_THREADS="${omp}"
+#EXE="../main3d.pgi.DEBUG.TPROF.MPI.ACC.CUDA.ex"
+EXE="../main3d.pgi.TPROF.MPI.ACC.CUDA.ex"  # relative to the run directory entered below
+#JSRUN="jsrun -n 4 -a 1 -g 1 -c 1 --bind=packed:${omp} "
+#JSRUN="jsrun -n 1 -a 4 -g 4 -c 4 --bind=packed:${omp} "
+JSRUN="jsrun -n 1 -a 1 -g 1 -c 1 --bind=packed:${omp} "
+
+rundir="${LSB_JOBNAME}-${LSB_JOBID}"  # <job name>-<job id>, both taken from the LSF environment
+mkdir "$rundir"
+cp "$0" "$rundir"  # keep a copy of this script next to the results
+cp inputs "$rundir"
+cd "$rundir" || exit 1  # EXE and the final cp use ../ paths, so never continue from the wrong directory
+
+# 1. Run normally (${JSRUN} is left unquoted on purpose so it splits into the command and its options)
+${JSRUN} --smpiargs="-gpu" "${EXE}" inputs
+
+# 2. Run under cuda-memcheck
+# ${JSRUN} --smpiargs="-gpu" cuda-memcheck ${EXE} inputs &> memcheck.txt
+
+# 3. Run under nvprof and direct all stdout and stderr to nvprof.txt
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes ${EXE} inputs &> nvprof.txt
+
+# 4. Run under nvprof and store performance data in a nvvp file
+# Can be converted to text using nvprof -i nvprof-timeline-%p.nvvp
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes -o nvprof-timeline-%p.nvvp ${EXE} inputs
+
+# COLLECT PERFORMANCE METRICS - THIS IS MUCH SLOWER. Set max_step = 2 in the inputs file
+# 5. Run under nvprof and collect metrics for a subset of kernels
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --kernels '(deposit_current|gather_\w+_field|push_\w+_boris)' --analysis-metrics -o nvprof-metrics-kernel-%p.nvvp ${EXE} inputs
+
+# 6. Run under nvprof and collect metrics for all kernels -- much slower!
+#${JSRUN} --smpiargs="-gpu" nvprof --profile-child-processes --analysis-metrics -o nvprof-metrics-%p.nvvp ${EXE} inputs
+
+cp ../WarpX*."${LSB_JOBID}" .  # gather the WarpXo/WarpXe files requested by the #BSUB -o/-e lines