author    Axel Huebl <axel.huebl@plasma.ninja>  2019-08-26 21:55:53 -0700
committer GitHub <noreply@github.com>           2019-08-26 21:55:53 -0700
commit    17eefa31d683117b365c7c272f2e3631c64c71a3 (patch)
tree      ad389f30c3b1956eb502e018499d1bcf3cefa748
parent    3d44362029fb39476f9542a10af9e0fc5eb1ef9b (diff)
parent    83d451a02b3fc493e3e48d992599e60319042860 (diff)
Merge pull request #291 from ECP-WarpX/doc_platforms
Docs: System Submission & Helper Scripts
-rw-r--r--  Docs/source/building/cori.rst                  3
-rw-r--r--  Docs/source/building/summit.rst               28
-rw-r--r--  Docs/source/running_cpp/parallelization.rst   24
-rw-r--r--  Docs/source/running_cpp/platforms.rst         69
-rw-r--r--  Docs/source/running_cpp/running_cpp.rst        1
-rwxr-xr-x  Examples/Tests/gpu_test/script_profiling.sh (renamed from Examples/Tests/gpu_test/script.sh)  0
-rw-r--r--  Examples/batchScripts/batch_cori.sh           33
-rw-r--r--  Examples/batchScripts/batch_summit.sh         16
-rw-r--r--  Tools/compute_domain.py                      114
9 files changed, 243 insertions, 45 deletions
diff --git a/Docs/source/building/cori.rst b/Docs/source/building/cori.rst
index 2330e2153..d89f3f17b 100644
--- a/Docs/source/building/cori.rst
+++ b/Docs/source/building/cori.rst
@@ -51,6 +51,9 @@ In order to compile for the **Knight's Landing (KNL) architecture**:
module swap PrgEnv-intel PrgEnv-gnu
make -j 16 COMP=gnu
+See :doc:`../running_cpp/platforms` for more information on how to run
+WarpX on Cori.
+
GPU Build
---------
diff --git a/Docs/source/building/summit.rst b/Docs/source/building/summit.rst
index 88588eb72..424cb68f5 100644
--- a/Docs/source/building/summit.rst
+++ b/Docs/source/building/summit.rst
@@ -23,29 +23,5 @@ Then, ``cd`` into the directory ``WarpX`` and use the following set of commands
module load cuda
make -j 4 USE_GPU=TRUE COMP=pgi
-In order to submit a simulation, create a file `submission_script` with
-the following text (replace bracketed variables):
-
-::
-
- #!/bin/bash
- #BSUB -J <jobName>
- #BSUB -W <requestedTime>
- #BSUB -nnodes <numberOfNodes>
- #BSUB -P <accountNumber>
-
- module load pgi
- module load cuda
-
- omp=1
- export OMP_NUM_THREADS=${omp}
- num_nodes=$(( $(printf '%s\n' ${LSB_HOSTS} | sort -u | wc -l) - 1 ))
-
- jsrun -n ${num_nodes} -a 6 -g 6 -c 6 --bind=packed:${omp} --smpiargs="-gpu" <warpxExecutable> <inputScript>
-
-
-Then run
-
-::
-
- bsub submission_script
+See :doc:`../running_cpp/platforms` for more information on how to run
+WarpX on Summit.
diff --git a/Docs/source/running_cpp/parallelization.rst b/Docs/source/running_cpp/parallelization.rst
index 440c17235..a8c89f340 100644
--- a/Docs/source/running_cpp/parallelization.rst
+++ b/Docs/source/running_cpp/parallelization.rst
@@ -61,22 +61,8 @@ and MPI decomposition and computer architecture used for the run:
* Amount of high-bandwidth memory.
-Below is a list of experience-based parameters
-that were observed to give good performance on given supercomputers.
-
-Rule of thumb for 3D runs on NERSC Cori KNL
--------------------------------------------
-
-For a 3D simulation with a few (1-4) particles per cell using FDTD Maxwell
-solver on Cori KNL for a well load-balanced problem (in our case laser
-wakefield acceleration simulation in a boosted frame in the quasi-linear
-regime), the following set of parameters provided good performance:
-
-* ``amr.max_grid_size=64`` and ``amr.blocking_factor=64`` so that the size of
- each grid is fixed to ``64**3`` (we are not using load-balancing here).
-
-* **8 MPI ranks per KNL node**, with ``OMP_NUM_THREADS=8`` (that is 64 threads
- per KNL node, i.e. 1 thread per physical core, and 4 cores left to the
- system).
-
-* **2 grids per MPI**, *i.e.*, 16 grids per KNL node.
+Because these parameters impose additional constraints on the domain size of a
+simulation, it can be cumbersome to calculate the number of cells and the
+physical size of the computational domain for a given resolution. This
+:download:`Python script<../../../Tools/compute_domain.py>` does it
+automatically.
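
The constraint described above, that the number of cells along each direction
must be a multiple of ``max_grid_size`` while the requested resolution stays
fixed, can be sketched in a few lines of Python (``round_up_ncell`` is a
hypothetical helper shown for illustration only, not part of the script)::

    import numpy as np

    def round_up_ncell(domain_size, cell_size, max_grid_size):
        # cells needed to cover the requested domain at the requested resolution
        ncell = np.ceil(domain_size / cell_size).astype(int)
        # round each direction up to the next multiple of max_grid_size
        ncell = -(-ncell // max_grid_size) * max_grid_size
        # the physical domain grows accordingly; the cell size does not change
        return ncell, ncell * cell_size

    # default box of compute_domain.py (50 x 50 x 75 microns at 1 micron cells,
    # ignoring the boosted-frame dz adjustment)
    ncell, new_size = round_up_ncell(np.array([50.e-6, 50.e-6, 75.e-6]), 1.e-6, 64)
    print(ncell)     # [ 64  64 128]
    print(new_size)  # the domain grows to 64 x 64 x 128 microns
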
diff --git a/Docs/source/running_cpp/platforms.rst b/Docs/source/running_cpp/platforms.rst
new file mode 100644
index 000000000..fc4e2b1fb
--- /dev/null
+++ b/Docs/source/running_cpp/platforms.rst
@@ -0,0 +1,69 @@
+Running on specific platforms
+=============================
+
+Running on Cori KNL at NERSC
+----------------------------
+
+The batch script below can be used to run a WarpX simulation on 2 KNL nodes on
+the supercomputer Cori at NERSC. Replace the descriptions between chevrons ``<>``
+with relevant values; for instance, ``<job name>`` could be ``laserWakefield``.
+
+.. literalinclude:: ../../../Examples/batchScripts/batch_cori.sh
+ :language: bash
+
+To run a simulation, copy the lines above to a file ``batch_cori.sh`` and
+run
+::
+
+ sbatch batch_cori.sh
+
+to submit the job.
+
+For a 3D simulation with a few (1-4) particles per cell using the FDTD Maxwell
+solver on Cori KNL for a well load-balanced problem (in our case, a laser
+wakefield acceleration simulation in a boosted frame in the quasi-linear
+regime), the following set of parameters provided good performance:
+
+* ``amr.max_grid_size=64`` and ``amr.blocking_factor=64`` so that the size of
+ each grid is fixed to ``64**3`` (we are not using load-balancing here).
+
+* **8 MPI ranks per KNL node**, with ``OMP_NUM_THREADS=8`` (that is 64 threads
+ per KNL node, i.e. 1 thread per physical core, and 4 cores left to the
+ system).
+
+* **2 grids per MPI rank**, *i.e.*, 16 grids per KNL node.
+
+Running on Summit at OLCF
+-------------------------
+
+The batch script below can be used to run a WarpX simulation on 2 nodes on
+the supercomputer Summit at OLCF. Replace the descriptions between chevrons ``<>``
+with relevant values; for instance, ``<input file>`` could be
+``plasma_mirror_inputs``. Note that the only option so far is to run with one
+MPI rank per GPU.
+
+.. literalinclude:: ../../../Examples/batchScripts/batch_summit.sh
+ :language: bash
+
+To run a simulation, copy the lines above to a file ``batch_summit.sh`` and
+run
+::
+
+ bsub batch_summit.sh
+
+to submit the job.
+
+For a 3D simulation with a few (1-4) particles per cell using the FDTD Maxwell
+solver on Summit for a well load-balanced problem (in our case, a laser
+wakefield acceleration simulation in a boosted frame in the quasi-linear
+regime), the following set of parameters provided good performance:
+
+* ``amr.max_grid_size=256`` and ``amr.blocking_factor=128``.
+
+* **One MPI rank per GPU** (e.g., 6 MPI ranks for the 6 GPUs on each Summit
+  node).
+
+* **Two ``128x128x128`` grids per GPU**, or **one ``128x128x256`` grid per GPU**.
+
+A batch script with more options regarding profiling on Summit can be found at
+:download:`Summit batch script<../../../Examples/Tests/gpu_test/script_profiling.sh>` \ No newline at end of file
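
As a rough illustration of the rules of thumb above (16 grids of ``64**3``
cells per Cori KNL node, and 6 GPUs times two ``128**3`` grids per Summit
node), the node count needed for a given total number of cells can be
estimated as below. The numbers are indicative only, since memory use also
depends on the number of particles per cell::

    import numpy as np

    cells_per_cori_node   = 16 * 64**3       # 8 MPI ranks x 2 grids of 64^3 cells
    cells_per_summit_node = 6 * 2 * 128**3   # 6 GPUs x 2 grids of 128^3 cells

    ncell = np.array([512, 512, 1024])       # an example 3D domain
    total_cells = np.prod(ncell)             # 268,435,456 cells

    print(int(np.ceil(total_cells / cells_per_cori_node)))    # 64 Cori KNL nodes
    print(int(np.ceil(total_cells / cells_per_summit_node)))  # 11 Summit nodes
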
diff --git a/Docs/source/running_cpp/running_cpp.rst b/Docs/source/running_cpp/running_cpp.rst
index 7d82e55f1..31cecb12f 100644
--- a/Docs/source/running_cpp/running_cpp.rst
+++ b/Docs/source/running_cpp/running_cpp.rst
@@ -9,3 +9,4 @@ Running WarpX as an executable
parameters
profiling
parallelization
+ platforms \ No newline at end of file
diff --git a/Examples/Tests/gpu_test/script.sh b/Examples/Tests/gpu_test/script_profiling.sh
index cd6b0eadd..cd6b0eadd 100755
--- a/Examples/Tests/gpu_test/script.sh
+++ b/Examples/Tests/gpu_test/script_profiling.sh
diff --git a/Examples/batchScripts/batch_cori.sh b/Examples/batchScripts/batch_cori.sh
new file mode 100644
index 000000000..e6cd5e1ef
--- /dev/null
+++ b/Examples/batchScripts/batch_cori.sh
@@ -0,0 +1,33 @@
+#!/bin/bash -l
+
+#SBATCH -N 2
+#SBATCH -t 01:00:00
+#SBATCH -q regular
+#SBATCH -C knl
+#SBATCH -S 4
+#SBATCH -J <job name>
+#SBATCH -A <allocation ID>
+#SBATCH -e error.txt
+#SBATCH -o output.txt
+
+export OMP_PLACES=threads
+export OMP_PROC_BIND=spread
+
+# KNLs have 4 hyperthreads max
+export CORI_MAX_HYPERTHREAD_LEVEL=4
+# We use 64 cores out of the 68 available on Cori KNL,
+# and leave 4 to the system (see "#SBATCH -S 4" above).
+export CORI_NCORES_PER_NODE=64
+
+# Typically use 8 MPI ranks per node without hyperthreading,
+# i.e., OMP_NUM_THREADS=8
+export WARPX_NMPI_PER_NODE=8
+export WARPX_HYPERTHREAD_LEVEL=1
+
+# Compute OMP_NUM_THREADS and the thread count (-c option)
+export CORI_NHYPERTHREADS_MAX=$(( ${CORI_MAX_HYPERTHREAD_LEVEL} * ${CORI_NCORES_PER_NODE} ))
+export WARPX_NTHREADS_PER_NODE=$(( ${WARPX_HYPERTHREAD_LEVEL} * ${CORI_NCORES_PER_NODE} ))
+export OMP_NUM_THREADS=$(( ${WARPX_NTHREADS_PER_NODE} / ${WARPX_NMPI_PER_NODE} ))
+export WARPX_THREAD_COUNT=$(( ${CORI_NHYPERTHREADS_MAX} / ${WARPX_NMPI_PER_NODE} ))
+
+srun --cpu_bind=cores -n $(( ${SLURM_JOB_NUM_NODES} * ${WARPX_NMPI_PER_NODE} )) -c ${WARPX_THREAD_COUNT} <path/to/executable> <input file>
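
With the values set in this script (64 cores used per node, hyperthreading
level 1, 8 MPI ranks per node), the arithmetic in the last four ``export``
lines works out as follows; the snippet simply restates it in Python for
clarity and is not part of the script::

    cores_per_node    = 64        # CORI_NCORES_PER_NODE
    max_hyperthreads  = 4 * 64    # CORI_NHYPERTHREADS_MAX: 4 hyperthreads on each core used
    hyperthread_level = 1         # WARPX_HYPERTHREAD_LEVEL
    mpi_per_node      = 8         # WARPX_NMPI_PER_NODE

    threads_per_node = hyperthread_level * cores_per_node  # 64
    omp_num_threads  = threads_per_node // mpi_per_node    # 8,  exported as OMP_NUM_THREADS
    thread_count     = max_hyperthreads // mpi_per_node    # 32, passed to srun as -c

    # With "#SBATCH -N 2", srun therefore launches 2 * 8 = 16 MPI ranks,
    # each running 8 OpenMP threads on its own set of cores.
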
diff --git a/Examples/batchScripts/batch_summit.sh b/Examples/batchScripts/batch_summit.sh
new file mode 100644
index 000000000..002660b91
--- /dev/null
+++ b/Examples/batchScripts/batch_summit.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+#BSUB -P <allocation ID>
+#BSUB -W 00:10
+#BSUB -nnodes 2
+#BSUB -J WarpX
+#BSUB -o WarpXo.%J
+#BSUB -e WarpXe.%J
+
+module load pgi
+module load cuda
+
+omp=1
+export OMP_NUM_THREADS=${omp}
+
+num_nodes=$(( $(printf '%s\n' ${LSB_HOSTS} | sort -u | wc -l) - 1 ))
+jsrun -n ${num_nodes} -a 6 -g 6 -c 6 --bind=packed:${omp} <path/to/executable> <input file> > output.txt
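
The resource-set arithmetic in this script follows the one-MPI-rank-per-GPU
layout described in the documentation above: ``num_nodes`` counts the unique
hosts in ``LSB_HOSTS`` minus one (the list also contains the launch node), and
``jsrun`` creates one resource set per compute node with 6 ranks, 6 GPUs and
6 cores each. For the 2 nodes requested here that gives::

    nodes          = 2   # "#BSUB -nnodes 2"
    ranks_per_node = 6   # jsrun -a 6: 6 MPI tasks per resource set, one set per node
    gpus_per_node  = 6   # jsrun -g 6: the 6 GPUs of a Summit node

    total_ranks = nodes * ranks_per_node  # 12 MPI ranks
    total_gpus  = nodes * gpus_per_node   # 12 GPUs, i.e. one rank per GPU
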
diff --git a/Tools/compute_domain.py b/Tools/compute_domain.py
new file mode 100644
index 000000000..822d776e8
--- /dev/null
+++ b/Tools/compute_domain.py
@@ -0,0 +1,114 @@
+import os, shutil, re
+import numpy as np
+import scipy.constants as scc
+import time, copy
+
+'''
+This Python script helps a user to parallelize a WarpX simulation.
+
+The user specifies the minimal size of the physical domain and the resolution
+in each dimension, and the script computes:
+- the number of cells and physical domain to satisfy the user-specified domain
+ size and resolution AND make sure that the number of cells along each
+ direction is a multiple of max_grid_size.
+- a starting point on how to parallelize on Cori KNL (number of nodes, etc.).
+
+When running in a boosted frame, the script also has the option to
+automatically compute the number of cells in z to satisfy dx>dz in the boosted
+frame.
+
+Note that the script has no notion of blocking_factor. It is assumed that
+blocking_factor = max_grid_size, and that all boxes have the same size.
+'''
+
+# Update the lines below for your simulation
+# ------------------------------------------
+# 2 elements for 2D, 3 elements for 3D
+# Lower corner of the box
+box_lo0 = np.array([-25.e-6, -25.e-6, -15.e-6])
+# Upper corner of the box
+box_hi0 = np.array([ 25.e-6, 25.e-6, 60.e-6])
+# Cell size
+dx = 1.e-6
+dz = dx
+cell_size = np.array([dx, dx, dz])
+# Use this for simulations in a boosted frame if you
+# want to enforce dz < dx / dx_over_dz_boosted_frame
+compute_dz_boosted_frame = True
+gamma_boost = 30.
+dx_over_dz_boosted_frame = 1.1 # >1. is usually more stable
+# ------------------------------------------
+
+# similar to numpy.ceil, except the output data type is int
+def intceil(num):
+ return np.ceil(num).astype(int)
+
+# Enlarge simulation boundaries to satisfy three conditions:
+# - The resolution must be exactly the one provided by the user
+# - The physical domain must cover the domain specified by box_lo0, box_hi0
+# - The number of cells must be a multiple of mgs (max_grid_size).
+def adjust_bounds(box_lo0, box_hi0, box_ncell0, mgs):
+ cell_size = (box_hi0-box_lo0) / box_ncell0
+ box_ncell = intceil(box_ncell0/mgs)*mgs
+ box_lo = box_ncell * cell_size * box_lo0 / (box_hi0 - box_lo0)
+ box_hi = box_ncell * cell_size * box_hi0 / (box_hi0 - box_lo0)
+ return box_lo, box_hi, box_ncell
+
+# Calculate parallelization for the simulation, given numerical parameters
+# (number of cells, max_grid_size, number of threads per node etc.)
+def nb_nodes_mpi(box_ncell,mgs,threadspernode,ompnumthreads,ngridpernode, ndim):
+ nmpipernode = threadspernode/ompnumthreads
+ ngridpermpi = ngridpernode/nmpipernode
+ box_ngrids = box_ncell/mgs
+ if ndim == 2:
+ ngrids = box_ngrids[0] * box_ngrids[1]
+ elif ndim == 3:
+ ngrids = np.prod(box_ngrids)
+ n_mpi = intceil( ngrids/ngridpermpi )
+ n_node = intceil( n_mpi/nmpipernode )
+ return n_node, n_mpi
+
+# Get number of dimensions (2 or 3)
+ndim = box_lo0.size
+if compute_dz_boosted_frame:
+ # Adjust dz so that dx/dz = dx_over_dz_boosted_frame in simulation frame
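+    # With the defaults above (cell_size[0] = 1.e-6, dx_over_dz_boosted_frame = 1.1,
+    # gamma_boost = 30), this gives dz = 1.e-6 / (1.1 * 2. * 30.) ~ 1.5e-8 m.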
+ cell_size[-1] = cell_size[0] / dx_over_dz_boosted_frame / 2. / gamma_boost
+# Given the resolution, compute number of cells a priori
+box_ncell0 = ( box_hi0 - box_lo0 ) / cell_size
+
+if ndim == 2:
+ # Set of parameters suitable for a 2D simulation on Cori KNL
+ ngridpernode = 16.
+ ompnumthreads = 8.
+ mgs = 1024.
+ threadspernode = 64. # HyperThreading level = 1: no hyperthreading
+ distance_between_threads = int(68*4/threadspernode)
+ c_option = int( ompnumthreads*distance_between_threads )
+elif ndim == 3:
+ # Set of parameters suitable for a 3D simulation on Cori KNL
+ ngridpernode = 8.
+ ompnumthreads = 8.
+ mgs = 64.
+ threadspernode = 64. # HyperThreading level = 1: no hyperthreading
+ distance_between_threads = int(68*4/threadspernode)
+ c_option = int( ompnumthreads*distance_between_threads )
+
+# Adjust simulation bounds
+box_lo, box_hi, box_ncell = adjust_bounds(box_lo0, box_hi0, box_ncell0, mgs)
+
+# Calculate parallelization
+n_node,n_mpi = nb_nodes_mpi(box_ncell, mgs, threadspernode, ompnumthreads, ngridpernode, ndim)
+
+# Print results
+string_output = ' ### Parameters used ### \n'
+string_output += 'ngridpernode = ' + str(ngridpernode) + '\n'
+string_output += 'ompnumthreads = ' + str(ompnumthreads) + '\n'
+string_output += 'mgs (max_grid_size) = ' + str(mgs) + '\n'
+string_output += 'threadspernode ( = # MPI ranks per node * OMP_NUM_THREADS) = ' + str(threadspernode) + '\n'
+string_output += 'ndim = ' + str(ndim) + '\n\n'
+string_output += 'box_lo = ' + str(box_lo) + '\n'
+string_output += 'box_hi = ' + str(box_hi) + '\n'
+string_output += 'box_ncell = ' + str(box_ncell) + '\n'
+string_output += 'n_node = ' + str(n_node) + '\n'
+string_output += 'n_mpi = ' + str(n_mpi) + '\n'
+print(string_output)
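
For reference, running the new script with its default box but with
``compute_dz_boosted_frame = False`` (so that all cell sizes stay at 1 micron)
adjusts the 50 x 50 x 75 micron box as sketched below. The snippet simply
repeats the ``adjust_bounds`` arithmetic on those numbers, using the 3D value
``mgs = 64``, and is not part of the script::

    import numpy as np

    box_lo0 = np.array([-25.e-6, -25.e-6, -15.e-6])
    box_hi0 = np.array([ 25.e-6,  25.e-6,  60.e-6])
    box_ncell0 = (box_hi0 - box_lo0) / 1.e-6              # 50 x 50 x 75 cells requested
    mgs = 64.                                             # max_grid_size for 3D runs

    cell_size = (box_hi0 - box_lo0) / box_ncell0          # stays at 1 micron
    box_ncell = np.ceil(box_ncell0 / mgs).astype(int) * mgs
    box_lo = box_ncell * cell_size * box_lo0 / (box_hi0 - box_lo0)
    box_hi = box_ncell * cell_size * box_hi0 / (box_hi0 - box_lo0)

    print(box_ncell)  # 64 x 64 x 128 cells
    print(box_lo)     # about -32, -32 and -25.6 microns
    print(box_hi)     # about  32,  32 and 102.4 microns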