10 files changed, 459 insertions, 225 deletions
diff --git a/Tools/performance_tests/GNUmakefile_perftest b/Tools/performance_tests/GNUmakefile_perftest
deleted file mode 100644
index 38275332d..000000000
--- a/Tools/performance_tests/GNUmakefile_perftest
+++ /dev/null
@@ -1,16 +0,0 @@
-WARPX_HOME := ../..
-AMREX_HOME  ?= $(WARPX_HOME)/../amrex
-PICSAR_HOME ?= $(WARPX_HOME)/../picsar
-OPENBC_HOME ?= $(WARPX_HOME)/../openbc_poisson
-DEBUG	= FALSE
-DIM	= 3
-COMP=intel
-TINY_PROFILE = TRUE
-USE_OMP   = TRUE
-USE_CUDA  = FALSE
-USE_ACC   = FALSE
-USE_SENSEI_INSITU = FALSE
-EBASE     = perf_tests
-USE_PYTHON_MAIN = FALSE
-WarpxBinDir = Bin
-include $(WARPX_HOME)/Source/Make.WarpX
diff --git a/Tools/performance_tests/automated_test_1_uniform_rest_32ppc b/Tools/performance_tests/automated_test_1_uniform_rest_32ppc
index 55c1a6061..500e10859 100644
--- a/Tools/performance_tests/automated_test_1_uniform_rest_32ppc
+++ b/Tools/performance_tests/automated_test_1_uniform_rest_32ppc
@@ -22,6 +22,7 @@ warpx.verbose = 1
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
+warpx.do_pml = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Tools/performance_tests/automated_test_2_uniform_rest_1ppc b/Tools/performance_tests/automated_test_2_uniform_rest_1ppc
index 8e17042c9..2af282db8 100644
--- a/Tools/performance_tests/automated_test_2_uniform_rest_1ppc
+++ b/Tools/performance_tests/automated_test_2_uniform_rest_1ppc
@@ -22,6 +22,7 @@ warpx.verbose = 1
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
+warpx.do_pml = 1
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Tools/performance_tests/automated_test_3_uniform_drift_4ppc b/Tools/performance_tests/automated_test_3_uniform_drift_4ppc
index 13af8aaff..93d224061 100644
--- a/Tools/performance_tests/automated_test_3_uniform_drift_4ppc
+++ b/Tools/performance_tests/automated_test_3_uniform_drift_4ppc
@@ -23,6 +23,7 @@ warpx.verbose = 1
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
+warpx.do_pml = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Tools/performance_tests/automated_test_5_loadimbalance b/Tools/performance_tests/automated_test_5_loadimbalance
index 22c9ec4b6..d2e03372b 100644
--- a/Tools/performance_tests/automated_test_5_loadimbalance
+++ b/Tools/performance_tests/automated_test_5_loadimbalance
@@ -21,6 +21,7 @@ warpx.load_balance_int = 5
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
+warpx.do_pml = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Tools/performance_tests/automated_test_6_output_2ppc b/Tools/performance_tests/automated_test_6_output_2ppc
index f4498c410..9e8a839cc 100644
--- a/Tools/performance_tests/automated_test_6_output_2ppc
+++ b/Tools/performance_tests/automated_test_6_output_2ppc
@@ -22,6 +22,7 @@ warpx.verbose = 1
 interpolation.nox = 3
 interpolation.noy = 3
 interpolation.noz = 3
+warpx.do_pml = 0
 
 # CFL
 warpx.cfl = 1.0
diff --git a/Tools/performance_tests/cori.py b/Tools/performance_tests/cori.py
new file mode 100644
index 000000000..dbe3a1e2a
--- /dev/null
+++ b/Tools/performance_tests/cori.py
@@ -0,0 +1,159 @@
+import os, copy
+
+from functions_perftest import test_element
+
+module_name = {'cpu': 'haswell.', 'knl': 'mic-knl.', 'gpu':'.'}
+
+def executable_name(compiler, architecture):
+    """Return the executable file name for the given compiler and architecture."""
+    return '.'.join(['perf_tests3d', compiler, module_name[architecture] + 'TPROF.MPI.OMP.ex'])
+
+def get_config_command(compiler, architecture):
+    """Return the ';'-terminated module commands that set up compiler and architecture.
+
+    Only 'module unload darshan;' is returned when architecture is neither 'knl' nor
+    'cpu'; a compiler other than 'intel' or 'gnu' adds no PrgEnv commands.
+    """
+    # option -> (module to unload, module to load)
+    prg_env = {'intel': ('PrgEnv-gnu', 'PrgEnv-intel'),
+               'gnu': ('PrgEnv-intel', 'PrgEnv-gnu')}
+    craype = {'knl': ('craype-haswell', 'craype-mic-knl'),
+              'cpu': ('craype-mic-knl', 'craype-haswell')}
+
+    commands = ['module unload darshan;']
+    if architecture not in craype:
+        return commands[0]
+    # The programming environment is switched first, then the target architecture
+    swaps = [prg_env[compiler]] if compiler in prg_env else []
+    swaps.append(craype[architecture])
+    for unloaded, loaded in swaps:
+        commands.append('module unload ' + unloaded + ';')
+        commands.append('module load ' + loaded + ';')
+    return ''.join(commands)
+
+def process_analysis(automated, cwd, compiler, architecture, n_node_list, start_date):
+    """Submit a batch job that runs the analysis (run_automated.py --mode=read)
+    once all jobs listed in cwd + 'log_jobids_tmp.txt' have completed successfully.
+    The other arguments are forwarded on the command line of run_automated.py."""
+    # The job id is the 4th word of each line of the submission log; the ids
+    # are joined with ':' as expected by 'sbatch --dependency afterok:'.
+    with open(cwd + 'log_jobids_tmp.txt', 'r') as f_log:
+        dependencies = ':'.join(line.split()[3] for line in f_log)
+
+    batch_string = '''#!/bin/bash
+#SBATCH --job-name=warpx_1node_read
+#SBATCH --time=00:07:00
+#SBATCH -C knl
+#SBATCH -N 1
+#SBATCH -S 4
+#SBATCH -q regular
+#SBATCH -e read_error.txt
+#SBATCH -o read_output.txt
+#SBATCH --mail-type=end
+#SBATCH --account=m2852
+module load h5py-parallel
+'''
+    batch_string += 'python run_automated.py --compiler=' + \
+        compiler + ' --architecture=' + architecture + \
+        ' --mode=read' + \
+        ' --n_node_list=' + '"' + n_node_list + '"' + \
+        ' --start_date=' + start_date
+    if automated:
+        batch_string += ' --automated'
+    batch_string += '\n'
+    batch_file = 'slurm_perfread'
+    with open(batch_file, 'w') as f_exe:
+        f_exe.write(batch_string)
+    os.system('chmod 700 ' + batch_file)
+    submit_command = 'sbatch  --dependency afterok:' + dependencies + ' ' + batch_file
+    print( 'process_analysis line:  ' + submit_command)
+    os.system(submit_command)
+
+# Job wall time in minutes: a fixed 5 min plus 5 min per simulation
+def time_min(nb_simulations):
+    return nb_simulations*5. + 5.
+
+def get_submit_job_command():
+    return ' sbatch '  # padded with spaces so it can be concatenated between other command strings
+
+def get_batch_string(test_list, job_time_min, Cname, n_node):
+    """Return the '#SBATCH' header of the batch script, named after the first test's input file."""
+    hours, minutes = int(job_time_min/60), int(job_time_min%60)
+    header_lines = [
+        '#!/bin/bash',
+        '#SBATCH --job-name=' + test_list[0].input_file,
+        '#SBATCH --time=' + str(hours) + ':' + str(minutes) + ':00',
+        '#SBATCH -C ' + Cname,
+        '#SBATCH -N ' + str(n_node),
+        '#SBATCH -q regular',
+        '#SBATCH -e error.txt',
+        '#SBATCH --account=m2852',
+    ]
+    return ''.join(line + '\n' for line in header_lines)
+
+def get_run_string(current_test, architecture, n_node, count, bin_name, runtime_param_string):
+    """Return the shell lines (OMP_NUM_THREADS export + srun command) that run one test."""
+    core_layout = {'cpu': (32, 2), 'knl': (64, 4)}
+    if architecture not in core_layout:
+        raise ValueError('architecture not supported on Cori: ' + str(architecture))
+    n_cores, factor = core_layout[architecture]
+    # number of logical cores per MPI process (follow NERSC directives)
+    cflag_value = max(1, int(n_cores/current_test.n_mpi_per_node) * factor)
+    srun_string = 'export OMP_NUM_THREADS=' + str(current_test.n_omp) + '\n'
+    output_filename = 'out_' + '_'.join([current_test.input_file, str(n_node), str(current_test.n_mpi_per_node), str(current_test.n_omp), str(count)]) + '.txt'
+    srun_string += 'srun --cpu_bind=cores '+ \
+        ' -n ' + str(n_node*current_test.n_mpi_per_node) + \
+        ' -c ' + str(cflag_value)   + \
+        ' ./'  + bin_name + \
+        ' ' + current_test.input_file + runtime_param_string + \
+        ' > ' + output_filename + '\n'
+    return srun_string
+
+def get_test_list(n_repeat):
+    """Return the list of tests to run: each test defined below, deep-copied n_repeat times."""
+    test_list_unq = []
+    # n_node is left to None and passed to functions separately, so that tests sharing n_node run in one batch job
+    test_list_unq.append( test_element(input_file='automated_test_1_uniform_rest_32ppc', 
+                                       n_mpi_per_node=8, 
+                                       n_omp=8, 
+                                       n_cell=[128, 128, 128], 
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_2_uniform_rest_1ppc', 
+                                       n_mpi_per_node=8, 
+                                       n_omp=8, 
+                                       n_cell=[256, 256, 512], 
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_3_uniform_drift_4ppc', 
+                                       n_mpi_per_node=8, 
+                                       n_omp=8, 
+                                       n_cell=[128, 128, 128], 
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_4_labdiags_2ppc', 
+                                       n_mpi_per_node=8, 
+                                       n_omp=8, 
+                                       n_cell=[64, 64, 128], 
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=50) )
+    test_list_unq.append( test_element(input_file='automated_test_5_loadimbalance', 
+                                       n_mpi_per_node=8, 
+                                       n_omp=8, 
+                                       n_cell=[128, 128, 128], 
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_6_output_2ppc', 
+                                       n_mpi_per_node=8, 
+                                       n_omp=8, 
+                                       n_cell=[128, 256, 256], 
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=0) )
+    test_list = [copy.deepcopy(item) for item in test_list_unq for _ in range(n_repeat) ]
+    return test_list
diff --git a/Tools/performance_tests/functions_perftest.py b/Tools/performance_tests/functions_perftest.py
index 5e026bf12..67622317a 100644
--- a/Tools/performance_tests/functions_perftest.py
+++ b/Tools/performance_tests/functions_perftest.py
@@ -1,7 +1,32 @@
-import os, shutil, re
+import os, shutil, re, copy
 import pandas as pd
 import numpy as np
 import git
+# import cori
+# import summit
+
+# Container for the parameters of a single performance test.
+class test_element():
+    def __init__(self, input_file=None, n_node=None, n_mpi_per_node=None,
+                 n_omp=None, n_cell=None, n_step=None, max_grid_size=None,
+                 blocking_factor=None):
+        """Store every argument as an attribute of the same name."""
+        self.input_file, self.n_node = input_file, n_node
+        self.n_mpi_per_node, self.n_omp = n_mpi_per_node, n_omp
+        self.n_cell, self.n_step = n_cell, n_step
+        self.max_grid_size = max_grid_size
+        self.blocking_factor = blocking_factor
+
+    def scale_n_cell(self, n_node=0):
+        """Double n_cell along dimensions 0, 1, 2 in turn, halving n_node until it is <= 1.
+
+        self.n_cell is replaced by a scaled deep copy, not modified in place."""
+        scaled = copy.deepcopy(self.n_cell)
+        direction = 0
+        while n_node > 1:
+            scaled[direction] *= 2
+            n_node, direction = n_node / 2, (direction + 1) % 3
+        self.n_cell = scaled
 
 def scale_n_cell(ncell, n_node):
      ncell_scaled = ncell[:]
@@ -25,56 +50,6 @@ def get_file_content(filename=None):
     file_handler.close()
     return file_content
 
-def run_batch_nnode(test_list, res_dir, bin_name, config_command, architecture='knl', Cname='knl', n_node=1, runtime_param_list=[]):
-    # Clean res_dir
-    if os.path.exists(res_dir):
-        shutil.rmtree(res_dir, ignore_errors=True)
-    os.makedirs(res_dir)
-    # Copy files to res_dir
-    cwd = os.environ['AUTOMATED_PERF_TESTS'] + '/WarpX/Tools/performance_tests/'
-    bin_dir = cwd + 'Bin/'
-    shutil.copy(bin_dir + bin_name, res_dir)
-    os.chdir(res_dir)
-    # Calculate simulation time. Take 5 min + 2 min / simulation
-    job_time_min = 5. + len(test_list)*5.
-    job_time_str = str(int(job_time_min/60)) + ':' + str(int(job_time_min%60)) + ':00'
-    batch_string = ''
-    batch_string += '#!/bin/bash\n'
-    batch_string += '#SBATCH --job-name=' + test_list[0].input_file + '\n'
-    batch_string += '#SBATCH --time=' + job_time_str + '\n'
-    batch_string += '#SBATCH -C ' + Cname + '\n'
-    batch_string += '#SBATCH -N ' + str(n_node) + '\n'
-    batch_string += '#SBATCH -q regular\n'
-    batch_string += '#SBATCH -e error.txt\n'
-    batch_string += '#SBATCH --account=m2852\n'
-
-    for count, current_test in enumerate(test_list):
-        shutil.copy(cwd + current_test.input_file, res_dir)
-        srun_string = ''
-        srun_string += 'export OMP_NUM_THREADS=' + str(current_test.n_omp) + '\n'
-        # number of logical cores per MPI process
-        if architecture == 'cpu':
-            cflag_value = max(1, int(32/current_test.n_mpi_per_node) * 2) # Follow NERSC directives
-        elif architecture == 'knl':
-            cflag_value = max(1, int(64/current_test.n_mpi_per_node) * 4) # Follow NERSC directives
-        output_filename = 'out_' + '_'.join([current_test.input_file, str(n_node), str(current_test.n_mpi_per_node), str(current_test.n_omp), str(count)]) + '.txt'
-        srun_string += 'srun --cpu_bind=cores '+ \
-                       ' -n ' + str(n_node*current_test.n_mpi_per_node) + \
-                       ' -c ' + str(cflag_value)   + \
-                       ' ./'  + bin_name + \
-                       ' ' + current_test.input_file + \
-                       runtime_param_list[ count ] + \
-                       ' > ' + output_filename + '\n'
-        batch_string += srun_string
-        batch_string += 'rm -rf plotfiles ; rm -rf lab_frame_data\n'
-    batch_file = 'slurm'
-    f_exe = open(batch_file,'w')
-    f_exe.write(batch_string)
-    f_exe.close()
-    os.system('chmod 700 ' + bin_name)
-    os.system(config_command + 'sbatch ' + batch_file + ' >> ' + cwd + 'log_jobids_tmp.txt')
-    return 0
-
 def run_batch(run_name, res_dir, bin_name, config_command, architecture='knl',\
               Cname='knl', n_node=1, n_mpi=1, n_omp=1):
     # Clean res_dir
@@ -82,7 +57,6 @@ def run_batch(run_name, res_dir, bin_name, config_command, architecture='knl',\
         shutil.rmtree(res_dir)
     os.makedirs(res_dir)
     # Copy files to res_dir
-    # Copy files to res_dir
     cwd = os.environ['WARPX'] + '/Tools/performance_tests/'
     bin_dir = cwd + 'Bin/'
     shutil.copy(bin_dir + bin_name, res_dir)
@@ -119,6 +93,27 @@ def run_batch(run_name, res_dir, bin_name, config_command, architecture='knl',\
     os.system(config_command + 'sbatch ' + batch_file + ' >> ' + cwd + 'log_jobids_tmp.txt')
     return 0
 
+def run_batch_nnode(test_list, res_dir, bin_name, config_command, batch_string, submit_job_command):
+    """Recreate res_dir, copy the executable and the input files of test_list into it, then
+    write batch_string to 'batch_script.sh' and submit it. Leaves the process in res_dir."""
+    # Clean res_dir
+    if os.path.exists(res_dir):
+        shutil.rmtree(res_dir, ignore_errors=True)
+    os.makedirs(res_dir)
+    # Copy files to res_dir
+    cwd = os.environ['AUTOMATED_PERF_TESTS'] + '/warpx/Tools/performance_tests/'
+    bin_dir = cwd + 'Bin/'
+    shutil.copy(bin_dir + bin_name, res_dir)
+    os.chdir(res_dir)
+    for current_test in test_list:
+        shutil.copy(cwd + current_test.input_file, res_dir)
+    batch_file = 'batch_script.sh'
+    with open(batch_file, 'w') as f_exe:
+        f_exe.write(batch_string)
+    os.system('chmod 700 ' + bin_name)
+    os.system(config_command + submit_job_command + batch_file +\
+                   ' >> ' + cwd + 'log_jobids_tmp.txt')
+
 # Read output file and return init time and 1-step time
 def read_run_perf(filename, n_steps):
     timing_list = []
diff --git a/Tools/performance_tests/run_automated.py b/Tools/performance_tests/run_automated.py
index 8f79750d4..fd771faac 100644
--- a/Tools/performance_tests/run_automated.py
+++ b/Tools/performance_tests/run_automated.py
@@ -1,15 +1,41 @@
-#!/usr/common/software/python/2.7-anaconda-4.4/bin/python
-
 import os, sys, shutil, datetime, git
 import argparse, re, time, copy
 import pandas as pd
 from functions_perftest import store_git_hash, get_file_content, \
-                               run_batch_nnode, extract_dataframe
+    run_batch_nnode, extract_dataframe
+
+# Get name of supercomputer and import configuration functions from
+# machine-specific file. Fail early with a clear message on unknown machines.
+machine = 'cori' if os.getenv("NERSC_HOST") == 'cori' else \
+    'summit' if os.getenv("LMOD_SYSTEM_NAME") == 'summit' else None
+if machine is None:
+    raise ValueError("unknown machine: expected NERSC_HOST=cori or LMOD_SYSTEM_NAME=summit")
+if machine == 'summit':
+    from summit import executable_name, process_analysis, get_config_command, \
+        time_min, get_submit_job_command, get_batch_string, get_run_string, get_test_list
+else:
+    from cori import executable_name, process_analysis, get_config_command, \
+        time_min, get_submit_job_command, get_batch_string, get_run_string, get_test_list
 
 # typical use: python run_automated.py --n_node_list='1,8,16,32' --automated
 # Assume warpx, picsar, amrex and perf_logs repos ar in the same directory and
 # environment variable AUTOMATED_PERF_TESTS contains the path to this directory
 
+# requirements:
+# - python packages: gitpython and pandas
+# - AUTOMATED_PERF_TESTS: environment variable containing the path to the
+#   directory where the warpx, amrex and picsar repos are ($AUTOMATED_PERF_TESTS/warpx etc.)
+# - SCRATCH: environment variable containing the path where performance results
+#   are written. This script will create folder $SCRATCH/performance_warpx/
+
+if "AUTOMATED_PERF_TESTS" not in os.environ:
+    raise ValueError("environment variable AUTOMATED_PERF_TESTS is not defined.\n"
+                     "It should contain the path to the directory where WarpX, "
+                     "AMReX and PICSAR repos are.")
+if "SCRATCH" not in os.environ:
+    raise ValueError("environment variable SCRATCH is not defined.\n"
+                     "This script will create $SCRATCH/performance_warpx/ "
+                     "to store performance results.")
 # Handle parser
 ###############
 parser = argparse.ArgumentParser( description='Run performance tests and write results in files' )
@@ -33,11 +59,11 @@ parser.add_argument('--n_node_list',
 parser.add_argument('--start_date',
                     dest='start_date' )
 parser.add_argument('--compiler',
-                    choices=['gnu', 'intel'],
+                    choices=['gnu', 'intel', 'pgi'],
                     default='intel',
                     help='which compiler to use')
 parser.add_argument('--architecture',
-                    choices=['cpu', 'knl'],
+                    choices=['cpu', 'knl', 'gpu'],
                     default='knl',
                     help='which architecture to cross-compile for NERSC machines')
 parser.add_argument('--mode',
@@ -48,6 +74,8 @@ args = parser.parse_args()
 n_node_list_string   = args.n_node_list.split(',')
 n_node_list = [int(i) for i in n_node_list_string]
 start_date = args.start_date
+compiler = args.compiler
+architecture = args.architecture
 
 # Set behavior variables
 ########################
@@ -71,70 +99,21 @@ if args.automated == True:
     push_on_perf_log_repo = False
     pull_3_repos = True
     recompile = True
-
-# Each instance of this class contains information for a single test.
-class test_element():
-    def __init__(self, input_file=None, n_node=None, n_mpi_per_node=None,
-                 n_omp=None, n_cell=None, n_step=None):
-        self.input_file = input_file
-        self.n_node = n_node
-        self.n_mpi_per_node = n_mpi_per_node
-        self.n_omp = n_omp
-        self.n_cell = n_cell
-        self.n_step = n_step
-
-    def scale_n_cell(self, n_node=0):
-        n_cell_scaled = copy.deepcopy(self.n_cell)
-        index_dim = 0
-        while n_node > 1:
-            n_cell_scaled[index_dim] *= 2
-            n_node /= 2
-            index_dim = (index_dim+1) % 3
-        self.n_cell = n_cell_scaled
+    if machine == 'summit': 
+        compiler = 'pgi'
+        architecture = 'gpu'
 
 # List of tests to perform
 # ------------------------
-test_list_unq = []
 # Each test runs n_repeat times
 n_repeat = 2
-# n_node is kept to None and passed in functions as an external argument
-# That way, several test_element_instance run with the same n_node on the same batch job
-test_list_unq.append( test_element(input_file='automated_test_1_uniform_rest_32ppc',
-                                   n_mpi_per_node=8,
-                                   n_omp=8,
-                                   n_cell=[128, 128, 128],
-                                   n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_2_uniform_rest_1ppc',
-                                   n_mpi_per_node=8,
-                                   n_omp=8,
-                                   n_cell=[256, 256, 512],
-                                   n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_3_uniform_drift_4ppc',
-                                   n_mpi_per_node=8,
-                                   n_omp=8,
-                                   n_cell=[128, 128, 128],
-                                   n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_4_labdiags_2ppc',
-                                   n_mpi_per_node=8,
-                                   n_omp=8,
-                                   n_cell=[64, 64, 128],
-                                   n_step=50) )
-test_list_unq.append( test_element(input_file='automated_test_5_loadimbalance',
-                                   n_mpi_per_node=8,
-                                   n_omp=8,
-                                   n_cell=[128, 128, 128],
-                                   n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_6_output_2ppc',
-                                   n_mpi_per_node=8,
-                                   n_omp=8,
-                                   n_cell=[128, 256, 256],
-                                   n_step=0) )
-test_list = [copy.deepcopy(item) for item in test_list_unq for _ in range(n_repeat) ]
+# test_list is machine-specific
+test_list = get_test_list(n_repeat)
 
 # Define directories
 # ------------------
 source_dir_base = os.environ['AUTOMATED_PERF_TESTS']
-warpx_dir = source_dir_base + '/WarpX/'
+warpx_dir = source_dir_base + '/warpx/'
 picsar_dir = source_dir_base + '/picsar/'
 amrex_dir = source_dir_base + '/amrex/'
 res_dir_base = os.environ['SCRATCH'] + '/performance_warpx/'
@@ -142,12 +121,13 @@ perf_logs_repo = source_dir_base + 'perf_logs/'
 
 # Define dictionaries
 # -------------------
-compiler_name = {'intel': 'intel', 'gnu': 'gcc'}
-module_name = {'cpu': 'haswell', 'knl': 'mic-knl'}
-module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache'}
+compiler_name = {'intel': 'intel', 'gnu': 'gcc', 'pgi':'pgi'}
+module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache', 'gpu':''}
+csv_file = {'cori':'cori_knl.csv', 'summit':'summit.csv'}
 cwd = os.getcwd() + '/'
 bin_dir = cwd + 'Bin/'
-bin_name = 'perf_tests3d.' + args.compiler + '.' + module_name[args.architecture] + '.TPROF.MPI.OMP.ex'
+bin_name = executable_name(compiler, architecture)
+
 log_dir  = cwd
 perf_database_file = cwd + perf_database_file
 day = time.strftime('%d')
@@ -159,27 +139,7 @@ year = time.strftime('%Y')
 if args.mode == 'run':
     start_date = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
     # Set default options for compilation and execution
-    config_command = ''
-    config_command += 'module unload darshan;'
-    config_command += 'module load craype-hugepages4M;'
-    if args.architecture == 'knl':
-        if args.compiler == 'intel':
-            config_command += 'module unload PrgEnv-gnu;'
-            config_command += 'module load PrgEnv-intel;'
-        elif args.compiler == 'gnu':
-            config_command += 'module unload PrgEnv-intel;'
-            config_command += 'module load PrgEnv-gnu;'
-        config_command += 'module unload craype-haswell;'
-        config_command += 'module load craype-mic-knl;'
-    elif args.architecture == 'cpu':
-        if args.compiler == 'intel':
-            config_command += 'module unload PrgEnv-gnu;'
-            config_command += 'module load PrgEnv-intel;'
-        elif args.compiler == 'gnu':
-            config_command += 'module unload PrgEnv-intel;'
-            config_command += 'module load PrgEnv-gnu;'
-        config_command += 'module unload craype-mic-knl;'
-        config_command += 'module load craype-haswell;'
+    config_command = get_config_command(compiler, architecture)
     # Create main result directory if does not exist
     if not os.path.exists(res_dir_base):
         os.mkdir(res_dir_base)
@@ -194,56 +154,31 @@ if args.mode == 'run':
             git_repo.pull()
             git_repo = git.cmd.Git( warpx_dir  )
             git_repo.pull()
-        with open(cwd + 'GNUmakefile_perftest') as makefile_handler:
-            makefile_text = makefile_handler.read()
-        makefile_text = re.sub('\nCOMP.*', '\nCOMP=%s' %compiler_name[args.compiler], makefile_text)
-        with open(cwd + 'GNUmakefile_perftest', 'w') as makefile_handler:
-            makefile_handler.write( makefile_text )
-        os.system(config_command + " make -f GNUmakefile_perftest realclean ; " + " rm -r tmp_build_dir *.mod; make -j 8 -f GNUmakefile_perftest")
+        
+        # Copy WarpX/GNUmakefile to current directory and recompile
+        # with specific options for automated performance tests.
+        # This way, performance test compilation does not mess with user's
+        # compilation
+        shutil.copyfile("../../GNUmakefile","./GNUmakefile")
+        make_realclean_command = " make realclean WARPX_HOME=../.. " \
+            "AMREX_HOME=../../../amrex/ PICSAR_HOME=../../../picsar/ " \
+            "EBASE=perf_tests COMP=%s" %compiler_name[compiler] + ";"
+        make_command = "make -j 16 WARPX_HOME=../.. " \
+            "AMREX_HOME=../../../amrex/ PICSAR_HOME=../../../picsar/ " \
+            "EBASE=perf_tests COMP=%s" %compiler_name[compiler]
+        if machine == 'summit':
+            make_command += ' USE_GPU=TRUE '
+        os.system(config_command + make_realclean_command + \
+                  "rm -r tmp_build_dir *.mod; " + make_command )
+
+        # Store git hashes for WarpX, AMReX and PICSAR into file, so that
+        # they can be read when running the analysis.
         if os.path.exists( cwd + 'store_git_hashes.txt' ):
             os.remove( cwd + 'store_git_hashes.txt' )
         store_git_hash(repo_path=picsar_dir, filename=cwd + 'store_git_hashes.txt', name='picsar')
         store_git_hash(repo_path=amrex_dir , filename=cwd + 'store_git_hashes.txt', name='amrex' )
         store_git_hash(repo_path=warpx_dir , filename=cwd + 'store_git_hashes.txt', name='warpx' )
 
-# This function runs a batch script with
-# dependencies to perform the analysis
-# after all performance tests are done.
-def process_analysis():
-    dependencies = ''
-    f_log = open(cwd + 'log_jobids_tmp.txt' ,'r')
-    for line in f_log.readlines():
-        dependencies += line.split()[3] + ':'
-    batch_string = ''
-    batch_string += '#!/bin/bash\n'
-    batch_string += '#SBATCH --job-name=warpx_1node_read\n'
-    batch_string += '#SBATCH --time=00:07:00\n'
-    batch_string += '#SBATCH -C knl\n'
-    batch_string += '#SBATCH -N 1\n'
-    batch_string += '#SBATCH -S 4\n'
-    batch_string += '#SBATCH -q regular\n'
-    batch_string += '#SBATCH -e read_error.txt\n'
-    batch_string += '#SBATCH -o read_output.txt\n'
-    batch_string += '#SBATCH --mail-type=end\n'
-    batch_string += '#SBATCH --account=m2852\n'
-    batch_string += 'module load h5py-parallel\n'
-    batch_string += 'python ' + __file__ + ' --compiler=' + \
-                    args.compiler + ' --architecture=' + args.architecture + \
-                    ' --mode=read' + \
-                ' --n_node_list=' + '"' + args.n_node_list + '"' + \
-                ' --start_date=' + start_date
-    if args.automated == True:
-        batch_string += ' --automated'
-    batch_string += '\n'
-    batch_file = 'slurm_perfread'
-    f_exe = open(batch_file,'w')
-    f_exe.write(batch_string)
-    f_exe.close()
-    os.system('chmod 700 ' + batch_file)
-    print( 'process_analysis line:  ' + 'sbatch  --dependency afterok:' + dependencies[0:-1] + ' ' + batch_file)
-    os.system('sbatch  --dependency afterok:' + dependencies[0:-1] + ' ' + batch_file)
-    return 0
-
 # Loop over the tests and run all simulations:
 # One batch job submitted per n_node. Several
 # tests run within the same batch job.
@@ -254,24 +189,35 @@ if args.mode == 'run':
     # loop on n_node. One batch script per n_node
     for n_node in n_node_list:
         res_dir = res_dir_base
-        res_dir += '_'.join([run_name, args.compiler, args.architecture, str(n_node)]) + '/'
+        res_dir += '_'.join([run_name, compiler, architecture, str(n_node)]) + '/'
         runtime_param_list = []
         # Deep copy as we change the attribute n_cell of
         # each instance of class test_element
         test_list_n_node = copy.deepcopy(test_list)
+        job_time_min = time_min(len(test_list))
+        batch_string = get_batch_string(test_list_n_node, job_time_min, module_Cname[architecture], n_node)
         # Loop on tests
-        for current_run in test_list_n_node:
+        for count, current_run in enumerate(test_list_n_node):
             current_run.scale_n_cell(n_node)
             runtime_param_string  = ' amr.n_cell=' + ' '.join(str(i) for i in current_run.n_cell)
+            runtime_param_string += ' amr.max_grid_size=' + str(current_run.max_grid_size)
+            runtime_param_string += ' amr.blocking_factor=' + str(current_run.blocking_factor)
             runtime_param_string += ' max_step=' + str( current_run.n_step )
-            runtime_param_list.append( runtime_param_string )
+            # runtime_param_list.append( runtime_param_string )
+            run_string = get_run_string(current_run, architecture, n_node, count, bin_name, runtime_param_string)
+            batch_string += run_string
+            batch_string += 'rm -rf plotfiles lab_frame_data diags\n'
+            
+        submit_job_command = get_submit_job_command()
         # Run the simulations.
-        run_batch_nnode(test_list_n_node, res_dir, bin_name, config_command,\
-                        architecture=args.architecture, Cname=module_Cname[args.architecture], \
-                        n_node=n_node, runtime_param_list=runtime_param_list)
+        run_batch_nnode(test_list_n_node, res_dir, bin_name, config_command, batch_string, submit_job_command)
     os.chdir(cwd)
     # submit batch for analysis
-    process_analysis()
+    if os.path.exists( 'read_error.txt' ):
+        os.remove( 'read_error.txt' )
+    if os.path.exists( 'read_output.txt' ):
+        os.remove( 'read_output.txt' )
+    process_analysis(args.automated, cwd, compiler, architecture, args.n_node_list, start_date)
 
 # read the output file from each test and store timers in
 # hdf5 file with pandas format
@@ -279,10 +225,10 @@ if args.mode == 'run':
 for n_node in n_node_list:
     print(n_node)
     if browse_output_files:
+        res_dir = res_dir_base
+        res_dir += '_'.join([run_name, compiler,\
+                             architecture, str(n_node)]) + '/'
         for count, current_run in enumerate(test_list):
-            res_dir = res_dir_base
-            res_dir += '_'.join([run_name, args.compiler,\
-                                 args.architecture, str(n_node)]) + '/'
             # Read performance data from the output file
             output_filename = 'out_' + '_'.join([current_run.input_file, str(n_node), str(current_run.n_mpi_per_node), str(current_run.n_omp), str(count)]) + '.txt'
             # Read data for all test to put in hdf5 a database
@@ -305,8 +251,8 @@ for n_node in n_node_list:
             # Load file perf_database_file if exists, and
             # append with results from this scan
             if os.path.exists(perf_database_file):
-                df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
-                # df_base = pd.read_hdf(perf_database_file, 'all_data')
+                # df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
+                df_base = pd.read_hdf(perf_database_file, 'all_data')
                 updated_df = df_base.append(df_newline, ignore_index=True)
             else:
                 updated_df = df_newline
@@ -314,19 +260,6 @@ for n_node in n_node_list:
             # (overwrite if file exists)
             updated_df.to_hdf(perf_database_file, key='all_data', mode='w')
 
-        # Rename directory with precise date+hour for archive purpose
-        if rename_archive == True:
-            loc_counter = 0
-            res_dir_arch = res_dir_base
-            res_dir_arch += '_'.join([year, month, day, run_name, args.compiler,\
-                                      args.architecture, str(n_node), str(loc_counter)]) + '/'
-            while os.path.exists( res_dir_arch ):
-                loc_counter += 1
-                res_dir_arch = res_dir_base
-                res_dir_arch += '_'.join([year, month, day, run_name, args.compiler,\
-                                          args.architecture, str(n_node), str(loc_counter)]) + '/'
-            os.rename( res_dir, res_dir_arch )
-
 # Extract sub-set of pandas data frame, write it to
 # csv file and copy this file to perf_logs repo
 # -------------------------------------------------
@@ -342,19 +275,42 @@ if write_csv:
         df_small[ df_small['input_file']=='automated_test_6_output_2ppc' ]['time_WritePlotFile']
     df_small = df_small.loc[:, ['date', 'input_file', 'git_hashes', 'n_node', 'n_mpi_per_node', 'n_omp', 'rep', 'start_date', 'time_initialization', 'step_time'] ]
     # Write to csv
-    df_small.to_csv( 'cori_knl.csv' )
+    df_small.to_csv( csv_file[machine] )
     # Errors may occur depending on the version of pandas. I had errors with v0.21.0 solved with 0.23.0
     # Second, move files to perf_logs repo
     if update_perf_log_repo:
+        # get perf_logs repo
         git_repo = git.Repo( perf_logs_repo )
         if push_on_perf_log_repo:
             git_repo.git.stash('save')
             git_repo.git.pull()
-        shutil.move( 'cori_knl.csv', perf_logs_repo + '/logs_csv/cori_knl.csv' )
+        # move csv file to perf_logs repo and commit the new version
+        shutil.move( csv_file[machine], perf_logs_repo + '/logs_csv/' + csv_file[machine] )
         os.chdir( perf_logs_repo )
         sys.path.append('./')
         import generate_index_html
         git_repo.git.add('./index.html')
-        git_repo.git.add('./logs_csv/cori_knl.csv')
+        git_repo.git.add('./logs_csv/' + csv_file[machine])
         index = git_repo.index
         index.commit("automated tests")
+
+# Rename all result directories for archiving purposes:
+# include date in the name, and a counter to avoid over-writing
+for n_node in n_node_list:
+    if browse_output_files:
+        res_dir = res_dir_base
+        res_dir += '_'.join([run_name, compiler,\
+                             architecture, str(n_node)]) + '/'
+        # Rename directory with the date and a counter for archive purposes
+        if rename_archive == True:
+            loc_counter = 0
+            res_dir_arch = res_dir_base
+            res_dir_arch += '_'.join([year, month, day, run_name, compiler,\
+                                      architecture, str(n_node), str(loc_counter)]) + '/'
+            while os.path.exists( res_dir_arch ):
+                loc_counter += 1
+                res_dir_arch = res_dir_base
+                res_dir_arch += '_'.join([year, month, day, run_name, compiler,\
+                                          architecture, str(n_node), str(loc_counter)]) + '/'
+            print("renaming " + res_dir + " -> " + res_dir_arch)
+            os.rename( res_dir, res_dir_arch )
diff --git a/Tools/performance_tests/summit.py b/Tools/performance_tests/summit.py
new file mode 100644
index 000000000..69598f1fd
--- /dev/null
+++ b/Tools/performance_tests/summit.py
@@ -0,0 +1,135 @@
+# requirements:
+# - module load python/3.7.0-anaconda3-5.3.0
+
+import os, copy
+from functions_perftest import test_element
+
+def executable_name(compiler,architecture):
+    return 'perf_tests3d.' + compiler + '.TPROF.MPI.ACC.CUDA.ex'
+
+def get_config_command(compiler, architecture):
+    config_command = ''
+    config_command += 'module load pgi;'
+    config_command += 'module load cuda;'
+    return config_command
+
+# Submit a batch script that performs the analysis.
+# The script depends on every job ID listed in log_jobids_tmp.txt,
+# so it starts only after all performance tests have ended.
+def process_analysis(automated, cwd, compiler, architecture, n_node_list, start_date):
+
+    batch_string = '''#!/bin/bash
+#BSUB -P APH114
+#BSUB -W 00:10
+#BSUB -nnodes 1
+#BSUB -J perf_test
+#BSUB -o read_output.txt
+#BSUB -e read_error.txt
+'''
+    f_log = open(cwd + 'log_jobids_tmp.txt' ,'r')
+    for line in f_log.readlines():
+        dependency = line.split()[1][1:-1]
+        batch_string += '#BSUB -w ended(' + dependency + ')\n'
+
+    batch_string += 'python run_automated.py --compiler=' + \
+        compiler + ' --architecture=' + architecture + \
+        ' --mode=read' + \
+        ' --n_node_list=' + '"' + n_node_list + '"' + \
+        ' --start_date=' + start_date
+    if automated == True:
+        batch_string += ' --automated'
+    batch_string += '\n'
+    batch_file = 'bsub_perfread'
+    f_exe = open(batch_file,'w')
+    f_exe.write(batch_string)
+    f_exe.close()
+    os.system('chmod 700 ' + batch_file)
+    
+    print( 'process_analysis line:  ' + 'bsub ' + batch_file)
+    os.system('bsub ' + batch_file)
+
+# Estimate the job wall time in minutes: 2 min + 2 min per simulation
+def time_min(nb_simulations):
+    return 2. + nb_simulations*2.
+
+def get_submit_job_command():
+    return ' bsub '
+
+def get_batch_string(test_list, job_time_min, Cname, n_node):
+
+    job_time_str = str(int(job_time_min/60)) + ':' + str(int(job_time_min%60))
+
+    batch_string = ''
+    batch_string += '#!/bin/bash\n'
+    batch_string += '#BSUB -P APH114\n'
+    batch_string += '#BSUB -W ' + job_time_str + '\n'
+    batch_string += '#BSUB -nnodes ' + str(n_node) + '\n'
+    batch_string += '#BSUB -J ' + test_list[0].input_file + '\n'
+    batch_string += '#BSUB -e error.txt\n'
+    batch_string += 'module load pgi\n' 
+    batch_string += 'module load cuda\n' 
+    return batch_string
+
+def get_run_string(current_test, architecture, n_node, count, bin_name, runtime_param_string):
+
+    output_filename = 'out_' + '_'.join([current_test.input_file, str(n_node), str(current_test.n_mpi_per_node), str(current_test.n_omp), str(count)]) + '.txt'
+
+    ngpu = str(current_test.n_mpi_per_node)
+    srun_string = ''
+    srun_string += 'jsrun '
+    srun_string += ' -n ' + str(n_node)
+    srun_string += ' -a ' + ngpu + ' -g ' + ngpu + ' -c ' + ngpu + ' --bind=packed:1 '
+    srun_string += ' ./' + bin_name + ' '
+    srun_string += current_test.input_file + ' '
+    srun_string += runtime_param_string
+    srun_string += ' > ' + output_filename + '\n'
+    return srun_string
+
+def get_test_list(n_repeat):
+    test_list_unq = []
+    # n_node is left as None here and passed to functions as an external argument.
+    # That way, several test_element instances can run with the same n_node in the same batch job.
+    test_list_unq.append( test_element(input_file='automated_test_1_uniform_rest_32ppc',
+                                       n_mpi_per_node=6,
+                                       n_omp=1,
+                                       n_cell=[128, 128, 192],
+                                       max_grid_size=256,
+                                       blocking_factor=32,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_2_uniform_rest_1ppc',
+                                       n_mpi_per_node=6,
+                                       n_omp=1,
+                                       n_cell=[256, 512, 768],
+                                       max_grid_size=512,
+                                       blocking_factor=256,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_3_uniform_drift_4ppc',
+                                       n_mpi_per_node=6,
+                                       n_omp=1,
+                                       n_cell=[128, 128, 384],
+                                       max_grid_size=256,
+                                       blocking_factor=64,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_4_labdiags_2ppc',
+                                       n_mpi_per_node=6,
+                                       n_omp=1,
+                                       n_cell=[384, 512, 512],
+                                       max_grid_size=256,
+                                       blocking_factor=128,
+                                       n_step=50) )
+    test_list_unq.append( test_element(input_file='automated_test_5_loadimbalance',
+                                       n_mpi_per_node=6,
+                                       n_omp=1,
+                                       n_cell=[64, 128, 192],
+                                       max_grid_size=64,
+                                       blocking_factor=32,
+                                       n_step=10) )
+    test_list_unq.append( test_element(input_file='automated_test_6_output_2ppc',
+                                       n_mpi_per_node=6,
+                                       n_omp=1,
+                                       n_cell=[384, 256, 512],
+                                       max_grid_size=256,
+                                       blocking_factor=64,
+                                       n_step=0) )
+    test_list = [copy.deepcopy(item) for item in test_list_unq for _ in range(n_repeat) ]
+    return test_list