Diffstat (limited to 'Tools/performance_tests/run_automated.py')
-rw-r--r--  Tools/performance_tests/run_automated.py  286
1 file changed, 121 insertions, 165 deletions
diff --git a/Tools/performance_tests/run_automated.py b/Tools/performance_tests/run_automated.py
index dca038c6c..fd771faac 100644
--- a/Tools/performance_tests/run_automated.py
+++ b/Tools/performance_tests/run_automated.py
@@ -1,15 +1,41 @@
-#!/usr/common/software/python/2.7-anaconda-4.4/bin/python
-
import os, sys, shutil, datetime, git
import argparse, re, time, copy
import pandas as pd
from functions_perftest import store_git_hash, get_file_content, \
- run_batch_nnode, extract_dataframe
+ run_batch_nnode, extract_dataframe
+
+# Get the name of the supercomputer and import configuration functions from
+# the machine-specific file
+if os.getenv("LMOD_SYSTEM_NAME") == 'summit':
+ machine = 'summit'
+ from summit import executable_name, process_analysis, \
+ get_config_command, time_min, get_submit_job_command, \
+ get_batch_string, get_run_string, get_test_list
+if os.getenv("NERSC_HOST") == 'cori':
+ machine = 'cori'
+ from cori import executable_name, process_analysis, \
+ get_config_command, time_min, get_submit_job_command, \
+ get_batch_string, get_run_string, get_test_list
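# The machine-specific module (summit.py or cori.py above, or a new file for
# another machine) is expected to provide the functions imported here. The
# signatures below are a minimal, hypothetical sketch inferred from how this
# script calls them; the actual implementations are machine-dependent.
#   executable_name(compiler, architecture): name of the compiled binary
#   get_config_command(compiler, architecture): shell string that loads the right modules
#   get_test_list(n_repeat): list of test elements, each test repeated n_repeat times
#   time_min(n_tests): requested walltime for a batch job running n_tests tests
#   get_batch_string(test_list, job_time_min, Cname, n_node): batch-script header
#   get_run_string(current_run, architecture, n_node, count, bin_name, runtime_param_string): launch line for one test
#   get_submit_job_command(): submission command (e.g. 'sbatch ' or 'bsub ')
#   process_analysis(automated, cwd, compiler, architecture, n_node_list, start_date): submits the analysis job with a dependency on the runs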
# typical use: python run_automated.py --n_node_list='1,8,16,32' --automated
# Assume the warpx, picsar, amrex and perf_logs repos are in the same directory and
# environment variable AUTOMATED_PERF_TESTS contains the path to this directory
+# requirements:
+# - python packages: gitpython and pandas
+# - AUTOMATED_PERF_TESTS: environment variable pointing to the directory
+#   where warpx, amrex and picsar are installed ($AUTOMATED_PERF_TESTS/warpx etc.)
+# - SCRATCH: environment variable pointing to the directory where performance
+#   results are written. This script creates the folder $SCRATCH/performance_warpx/
+
+if "AUTOMATED_PERF_TESTS" not in os.environ:
+ raise ValueError("environment variable AUTOMATED_PERF_TESTS is not defined.\n"
+ "It should contain the path to the directory where WarpX, "
+ "AMReX and PICSAR repos are.")
+if "SCRATCH" not in os.environ:
+ raise ValueError("environment variable SCRATCH is not defined.\n"
+ "This script will create $SCRATCH/performance_warpx/ "
+ "to store performance results.")
# Handle parser
###############
parser = argparse.ArgumentParser( description='Run performance tests and write results in files' )
@@ -24,7 +50,7 @@ parser.add_argument('--commit',
parser.add_argument('--automated',
dest='automated',
action='store_true',
- default=False,
+ default=False,
help='Use to run the automated test list')
parser.add_argument('--n_node_list',
dest='n_node_list',
@@ -33,23 +59,25 @@ parser.add_argument('--n_node_list',
parser.add_argument('--start_date',
dest='start_date' )
parser.add_argument('--compiler',
- choices=['gnu', 'intel'],
- default='intel',
+ choices=['gnu', 'intel', 'pgi'],
+ default='intel',
help='which compiler to use')
parser.add_argument('--architecture',
- choices=['cpu', 'knl'],
+ choices=['cpu', 'knl', 'gpu'],
default='knl',
help='which architecture to cross-compile for NERSC machines')
parser.add_argument('--mode',
choices=['run', 'read', 'browse_output_files', 'write_csv'],
- default='run',
+ default='run',
help='whether to run perftests or read their perf output. run calls read')
args = parser.parse_args()
n_node_list_string = args.n_node_list.split(',')
n_node_list = [int(i) for i in n_node_list_string]
start_date = args.start_date
+compiler = args.compiler
+architecture = args.architecture
-# Set behavior variables
+# Set behavior variables
########################
write_csv = False
browse_output_files = False
@@ -71,70 +99,21 @@ if args.automated == True:
push_on_perf_log_repo = False
pull_3_repos = True
recompile = True
-
-# Each instance of this class contains information for a single test.
-class test_element():
- def __init__(self, input_file=None, n_node=None, n_mpi_per_node=None,
- n_omp=None, n_cell=None, n_step=None):
- self.input_file = input_file
- self.n_node = n_node
- self.n_mpi_per_node = n_mpi_per_node
- self.n_omp = n_omp
- self.n_cell = n_cell
- self.n_step = n_step
-
- def scale_n_cell(self, n_node=0):
- n_cell_scaled = copy.deepcopy(self.n_cell)
- index_dim = 0
- while n_node > 1:
- n_cell_scaled[index_dim] *= 2
- n_node /= 2
- index_dim = (index_dim+1) % 3
- self.n_cell = n_cell_scaled
+ if machine == 'summit':
+ compiler = 'pgi'
+ architecture = 'gpu'
# List of tests to perform
# ------------------------
-test_list_unq = []
# Each test runs n_repeat times
n_repeat = 2
-# n_node is kept to None and passed in functions as an external argument
-# That way, several test_element_instance run with the same n_node on the same batch job
-test_list_unq.append( test_element(input_file='automated_test_1_uniform_rest_32ppc',
- n_mpi_per_node=8,
- n_omp=8,
- n_cell=[128, 128, 128],
- n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_2_uniform_rest_1ppc',
- n_mpi_per_node=8,
- n_omp=8,
- n_cell=[256, 256, 512],
- n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_3_uniform_drift_4ppc',
- n_mpi_per_node=8,
- n_omp=8,
- n_cell=[128, 128, 128],
- n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_4_labdiags_2ppc',
- n_mpi_per_node=8,
- n_omp=8,
- n_cell=[64, 64, 128],
- n_step=50) )
-test_list_unq.append( test_element(input_file='automated_test_5_loadimbalance',
- n_mpi_per_node=8,
- n_omp=8,
- n_cell=[128, 128, 128],
- n_step=10) )
-test_list_unq.append( test_element(input_file='automated_test_6_output_2ppc',
- n_mpi_per_node=8,
- n_omp=8,
- n_cell=[128, 256, 256],
- n_step=0) )
-test_list = [copy.deepcopy(item) for item in test_list_unq for _ in range(n_repeat) ]
+# test_list is machine-specific
+test_list = get_test_list(n_repeat)
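# Each element of test_list is expected to expose the attributes and the
# scale_n_cell() method used further down in this script. The class below is
# a minimal, hypothetical illustration of that interface (it is not used here;
# the real test elements are defined in the machine-specific files):
class example_test_element(object):
    def __init__(self, input_file=None, n_mpi_per_node=None, n_omp=None,
                 n_cell=None, max_grid_size=None, blocking_factor=None,
                 n_step=None):
        self.input_file = input_file
        self.n_mpi_per_node = n_mpi_per_node
        self.n_omp = n_omp
        self.n_cell = n_cell
        self.max_grid_size = max_grid_size
        self.blocking_factor = blocking_factor
        self.n_step = n_step
    def scale_n_cell(self, n_node=1):
        # Weak scaling: double the number of cells along successive
        # dimensions as the number of nodes increases.
        n_cell_scaled = copy.deepcopy(self.n_cell)
        index_dim = 0
        while n_node > 1:
            n_cell_scaled[index_dim] *= 2
            n_node //= 2
            index_dim = (index_dim + 1) % 3
        self.n_cell = n_cell_scaled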
# Define directories
# ------------------
source_dir_base = os.environ['AUTOMATED_PERF_TESTS']
-warpx_dir = source_dir_base + '/WarpX/'
+warpx_dir = source_dir_base + '/warpx/'
picsar_dir = source_dir_base + '/picsar/'
amrex_dir = source_dir_base + '/amrex/'
res_dir_base = os.environ['SCRATCH'] + '/performance_warpx/'
@@ -142,12 +121,13 @@ perf_logs_repo = source_dir_base + 'perf_logs/'
# Define dictionaries
# -------------------
-compiler_name = {'intel': 'intel', 'gnu': 'gcc'}
-module_name = {'cpu': 'haswell', 'knl': 'mic-knl'}
-module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache'}
+compiler_name = {'intel': 'intel', 'gnu': 'gcc', 'pgi':'pgi'}
+module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache', 'gpu':''}
+csv_file = {'cori':'cori_knl.csv', 'summit':'summit.csv'}
cwd = os.getcwd() + '/'
bin_dir = cwd + 'Bin/'
-bin_name = 'perf_tests3d.' + args.compiler + '.' + module_name[args.architecture] + '.TPROF.MPI.OMP.ex'
+bin_name = executable_name(compiler, architecture)
+
log_dir = cwd
perf_database_file = cwd + perf_database_file
day = time.strftime('%d')
@@ -159,30 +139,10 @@ year = time.strftime('%Y')
if args.mode == 'run':
start_date = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# Set default options for compilation and execution
- config_command = ''
- config_command += 'module unload darshan;'
- config_command += 'module load craype-hugepages4M;'
- if args.architecture == 'knl':
- if args.compiler == 'intel':
- config_command += 'module unload PrgEnv-gnu;'
- config_command += 'module load PrgEnv-intel;'
- elif args.compiler == 'gnu':
- config_command += 'module unload PrgEnv-intel;'
- config_command += 'module load PrgEnv-gnu;'
- config_command += 'module unload craype-haswell;'
- config_command += 'module load craype-mic-knl;'
- elif args.architecture == 'cpu':
- if args.compiler == 'intel':
- config_command += 'module unload PrgEnv-gnu;'
- config_command += 'module load PrgEnv-intel;'
- elif args.compiler == 'gnu':
- config_command += 'module unload PrgEnv-intel;'
- config_command += 'module load PrgEnv-gnu;'
- config_command += 'module unload craype-mic-knl;'
- config_command += 'module load craype-haswell;'
+ config_command = get_config_command(compiler, architecture)
# Create the main result directory if it does not exist
if not os.path.exists(res_dir_base):
- os.mkdir(res_dir_base)
+ os.mkdir(res_dir_base)
# Recompile if requested
# ----------------------
@@ -194,59 +154,34 @@ if args.mode == 'run':
git_repo.pull()
git_repo = git.cmd.Git( warpx_dir )
git_repo.pull()
- with open(cwd + 'GNUmakefile_perftest') as makefile_handler:
- makefile_text = makefile_handler.read()
- makefile_text = re.sub('\nCOMP.*', '\nCOMP=%s' %compiler_name[args.compiler], makefile_text)
- with open(cwd + 'GNUmakefile_perftest', 'w') as makefile_handler:
- makefile_handler.write( makefile_text )
- os.system(config_command + " make -f GNUmakefile_perftest realclean ; " + " rm -r tmp_build_dir *.mod; make -j 8 -f GNUmakefile_perftest")
+
+ # Copy WarpX/GNUmakefile to the current directory and recompile with
+ # options specific to the automated performance tests, so that this
+ # build does not interfere with the user's own compilation.
+ shutil.copyfile("../../GNUmakefile","./GNUmakefile")
+ make_realclean_command = " make realclean WARPX_HOME=../.. " \
+ "AMREX_HOME=../../../amrex/ PICSAR_HOME=../../../picsar/ " \
+ "EBASE=perf_tests COMP=%s" %compiler_name[compiler] + ";"
+ make_command = "make -j 16 WARPX_HOME=../.. " \
+ "AMREX_HOME=../../../amrex/ PICSAR_HOME=../../../picsar/ " \
+ "EBASE=perf_tests COMP=%s" %compiler_name[compiler]
+ if machine == 'summit':
+ make_command += ' USE_GPU=TRUE '
+ os.system(config_command + make_realclean_command + \
+ "rm -r tmp_build_dir *.mod; " + make_command )
+
+ # Store the git hashes of WarpX, AMReX and PICSAR in a file, so that
+ # they can be read when running the analysis.
if os.path.exists( cwd + 'store_git_hashes.txt' ):
os.remove( cwd + 'store_git_hashes.txt' )
store_git_hash(repo_path=picsar_dir, filename=cwd + 'store_git_hashes.txt', name='picsar')
store_git_hash(repo_path=amrex_dir , filename=cwd + 'store_git_hashes.txt', name='amrex' )
store_git_hash(repo_path=warpx_dir , filename=cwd + 'store_git_hashes.txt', name='warpx' )
-# This function runs a batch script with
-# dependencies to perform the analysis
-# after all performance tests are done.
-def process_analysis():
- dependencies = ''
- f_log = open(cwd + 'log_jobids_tmp.txt' ,'r')
- for line in f_log.readlines():
- dependencies += line.split()[3] + ':'
- batch_string = ''
- batch_string += '#!/bin/bash\n'
- batch_string += '#SBATCH --job-name=warpx_1node_read\n'
- batch_string += '#SBATCH --time=00:07:00\n'
- batch_string += '#SBATCH -C knl\n'
- batch_string += '#SBATCH -N 1\n'
- batch_string += '#SBATCH -S 4\n'
- batch_string += '#SBATCH -q regular\n'
- batch_string += '#SBATCH -e read_error.txt\n'
- batch_string += '#SBATCH -o read_output.txt\n'
- batch_string += '#SBATCH --mail-type=end\n'
- batch_string += '#SBATCH --account=m2852\n'
- batch_string += 'module load h5py-parallel\n'
- batch_string += 'python ' + __file__ + ' --compiler=' + \
- args.compiler + ' --architecture=' + args.architecture + \
- ' --mode=read' + \
- ' --n_node_list=' + '"' + args.n_node_list + '"' + \
- ' --start_date=' + start_date
- if args.automated == True:
- batch_string += ' --automated'
- batch_string += '\n'
- batch_file = 'slurm_perfread'
- f_exe = open(batch_file,'w')
- f_exe.write(batch_string)
- f_exe.close()
- os.system('chmod 700 ' + batch_file)
- print( 'process_analysis line: ' + 'sbatch --dependency afterok:' + dependencies[0:-1] + ' ' + batch_file)
- os.system('sbatch --dependency afterok:' + dependencies[0:-1] + ' ' + batch_file)
- return 0
-
# Loop over the tests and run all simulations:
# One batch job submitted per n_node. Several
-# tests run within the same batch job.
+# tests run within the same batch job.
# --------------------------------------------
if args.mode == 'run':
if os.path.exists( 'log_jobids_tmp.txt' ):
@@ -254,24 +189,35 @@ if args.mode == 'run':
# loop on n_node. One batch script per n_node
for n_node in n_node_list:
res_dir = res_dir_base
- res_dir += '_'.join([run_name, args.compiler, args.architecture, str(n_node)]) + '/'
+ res_dir += '_'.join([run_name, compiler, architecture, str(n_node)]) + '/'
runtime_param_list = []
# Deep copy as we change the attribute n_cell of
# each instance of class test_element
test_list_n_node = copy.deepcopy(test_list)
+ job_time_min = time_min(len(test_list))
+ batch_string = get_batch_string(test_list_n_node, job_time_min, module_Cname[architecture], n_node)
# Loop on tests
- for current_run in test_list_n_node:
+ for count, current_run in enumerate(test_list_n_node):
current_run.scale_n_cell(n_node)
runtime_param_string = ' amr.n_cell=' + ' '.join(str(i) for i in current_run.n_cell)
+ runtime_param_string += ' amr.max_grid_size=' + str(current_run.max_grid_size)
+ runtime_param_string += ' amr.blocking_factor=' + str(current_run.blocking_factor)
runtime_param_string += ' max_step=' + str( current_run.n_step )
- runtime_param_list.append( runtime_param_string )
+ # runtime_param_list.append( runtime_param_string )
+ run_string = get_run_string(current_run, architecture, n_node, count, bin_name, runtime_param_string)
+ batch_string += run_string
+ batch_string += 'rm -rf plotfiles lab_frame_data diags\n'
+
+ submit_job_command = get_submit_job_command()
# Run the simulations.
- run_batch_nnode(test_list_n_node, res_dir, bin_name, config_command,\
- architecture=args.architecture, Cname=module_Cname[args.architecture], \
- n_node=n_node, runtime_param_list=runtime_param_list)
+ run_batch_nnode(test_list_n_node, res_dir, bin_name, config_command, batch_string, submit_job_command)
os.chdir(cwd)
# submit batch for analysis
- process_analysis()
+ if os.path.exists( 'read_error.txt' ):
+ os.remove( 'read_error.txt' )
+ if os.path.exists( 'read_output.txt' ):
+ os.remove( 'read_output.txt' )
+ process_analysis(args.automated, cwd, compiler, architecture, args.n_node_list, start_date)
# Read the output file from each test and store the timers in
# an hdf5 file using the pandas format
@@ -279,10 +225,10 @@ if args.mode == 'run':
for n_node in n_node_list:
print(n_node)
if browse_output_files:
+ res_dir = res_dir_base
+ res_dir += '_'.join([run_name, compiler,\
+ architecture, str(n_node)]) + '/'
for count, current_run in enumerate(test_list):
- res_dir = res_dir_base
- res_dir += '_'.join([run_name, args.compiler,\
- args.architecture, str(n_node)]) + '/'
# Read performance data from the output file
output_filename = 'out_' + '_'.join([current_run.input_file, str(n_node), str(current_run.n_mpi_per_node), str(current_run.n_omp), str(count)]) + '.txt'
# Read data for all tests to put in an hdf5 database
@@ -305,33 +251,20 @@ for n_node in n_node_list:
# Load the file perf_database_file if it exists, and
# append results from this scan
if os.path.exists(perf_database_file):
- df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
- # df_base = pd.read_hdf(perf_database_file, 'all_data')
+ # df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
+ df_base = pd.read_hdf(perf_database_file, 'all_data')
updated_df = df_base.append(df_newline, ignore_index=True)
else:
updated_df = df_newline
- # Write dataframe to file perf_database_file
+ # Write dataframe to file perf_database_file
# (overwrite if file exists)
updated_df.to_hdf(perf_database_file, key='all_data', mode='w')
-
- # Rename directory with precise date+hour for archive purpose
- if rename_archive == True:
- loc_counter = 0
- res_dir_arch = res_dir_base
- res_dir_arch += '_'.join([year, month, day, run_name, args.compiler,\
- args.architecture, str(n_node), str(loc_counter)]) + '/'
- while os.path.exists( res_dir_arch ):
- loc_counter += 1
- res_dir_arch = res_dir_base
- res_dir_arch += '_'.join([year, month, day, run_name, args.compiler,\
- args.architecture, str(n_node), str(loc_counter)]) + '/'
- os.rename( res_dir, res_dir_arch )
# Extract a subset of the pandas data frame, write it to
# a csv file and copy this file to the perf_logs repo
# -------------------------------------------------
if write_csv:
- # Extract small data from data frame and write them to
+ # Extract a small subset of the data frame and write it to csv files
# First, generate csv files
df = pd.read_hdf( perf_database_file )
# One large file
@@ -342,19 +275,42 @@ if write_csv:
df_small[ df_small['input_file']=='automated_test_6_output_2ppc' ]['time_WritePlotFile']
df_small = df_small.loc[:, ['date', 'input_file', 'git_hashes', 'n_node', 'n_mpi_per_node', 'n_omp', 'rep', 'start_date', 'time_initialization', 'step_time'] ]
# Write to csv
- df_small.to_csv( 'cori_knl.csv' )
+ df_small.to_csv( csv_file[machine] )
# Errors may occur depending on the pandas version: errors seen with v0.21.0 were solved with v0.23.0
# Second, move files to perf_logs repo
if update_perf_log_repo:
+ # get perf_logs repo
git_repo = git.Repo( perf_logs_repo )
if push_on_perf_log_repo:
git_repo.git.stash('save')
git_repo.git.pull()
- shutil.move( 'cori_knl.csv', perf_logs_repo + '/logs_csv/cori_knl.csv' )
+ # move the csv file to the perf_logs repo and commit the new version
+ shutil.move( csv_file[machine], perf_logs_repo + '/logs_csv/' + csv_file[machine] )
os.chdir( perf_logs_repo )
sys.path.append('./')
import generate_index_html
git_repo.git.add('./index.html')
- git_repo.git.add('./logs_csv/cori_knl.csv')
+ git_repo.git.add('./logs_csv/' + csv_file[machine])
index = git_repo.index
index.commit("automated tests")
+
+# Rename all result directories for archiving purposes:
+# include the date in the name, and a counter to avoid overwriting
+for n_node in n_node_list:
+ if browse_output_files:
+ res_dir = res_dir_base
+ res_dir += '_'.join([run_name, compiler,\
+ architecture, str(n_node)]) + '/'
+ # Rename the directory with the precise date+hour for archiving purposes
+ if rename_archive == True:
+ loc_counter = 0
+ res_dir_arch = res_dir_base
+ res_dir_arch += '_'.join([year, month, day, run_name, compiler,\
+ architecture, str(n_node), str(loc_counter)]) + '/'
+ while os.path.exists( res_dir_arch ):
+ loc_counter += 1
+ res_dir_arch = res_dir_base
+ res_dir_arch += '_'.join([year, month, day, run_name, compiler,\
+ architecture, str(n_node), str(loc_counter)]) + '/'
+ print("renaming " + res_dir + " -> " + res_dir_arch)
+ os.rename( res_dir, res_dir_arch )
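# For example, a run on 8 nodes with the pgi compiler and gpu architecture,
# performed on 2019-03-12 (hypothetical date), would be archived as
# $SCRATCH/performance_warpx/2019_03_12_<run_name>_pgi_gpu_8_0/
# where run_name is defined elsewhere in this script and the trailing counter
# is incremented until the directory name is unique.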