1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
|
#!/usr/common/software/python/2.7-anaconda-4.4/bin/python
import os, sys, shutil, datetime, git
import argparse, re, time, copy
import pandas as pd
from functions_perftest import store_git_hash, get_file_content, \
run_batch_nnode, extract_dataframe
# typical use: python run_automated.py --n_node_list='1,8,16,32' --automated
# Assume warpx, picsar, amrex and perf_logs repos are in the same directory and
# environment variable AUTOMATED_PERF_TESTS contains the path to this directory
# Handle parser
###############
parser = argparse.ArgumentParser( description='Run performance tests and write results in files' )
parser.add_argument('--recompile',
                    dest='recompile',
                    action='store_true',
                    default=False)
parser.add_argument('--commit',
                    dest='commit',
                    action='store_true',
                    default=False)
parser.add_argument('--automated',
                    dest='automated',
                    action='store_true',
                    default=False,
                    help='Use to run the automated test list')
parser.add_argument('--n_node_list',
                    dest='n_node_list',
                    # BUGFIX: the default used to be the list [], which has no
                    # .split() method and crashed below when the flag was omitted.
                    default='',
                    help='comma-separated list of number of nodes for the runs', type=str)
parser.add_argument('--start_date',
                    dest='start_date' )
parser.add_argument('--compiler',
                    choices=['gnu', 'intel'],
                    default='intel',
                    help='which compiler to use')
parser.add_argument('--architecture',
                    choices=['cpu', 'knl'],
                    default='knl',
                    help='which architecture to cross-compile for NERSC machines')
parser.add_argument('--mode',
                    choices=['run', 'read', 'browse_output_files', 'write_csv'],
                    default='run',
                    help='whether to run perftests or read their perf output. run calls read')
args = parser.parse_args()
# Parse e.g. '1,8,16,32' into [1, 8, 16, 32]; skip empty tokens so an
# empty/omitted --n_node_list yields an empty list instead of int('') crashing.
n_node_list_string = args.n_node_list.split(',')
n_node_list = [int(i) for i in n_node_list_string if i != '']
start_date = args.start_date
# Set behavior variables
########################
write_csv = False
browse_output_files = False
if args.mode == 'write_csv':
    write_csv = True
if args.mode == 'browse_output_files':
    # BUGFIX: this used to assign to a misspelled name (browse_output_file),
    # so --mode=browse_output_files silently did nothing.
    browse_output_files = True
if args.mode == 'read':
    write_csv = True
    browse_output_files = True
recompile = args.recompile
perf_database_file = 'my_tests_database.h5'
# Defaults for a non --automated invocation; these names are read
# unconditionally further down and previously raised NameError when
# --automated was not passed.
run_name = 'custom_tests'
rename_archive = False
store_full_input = False
update_perf_log_repo = False
push_on_perf_log_repo = False
pull_3_repos = False
if args.automated == True:
    run_name = 'automated_tests'
    perf_database_file = 'automated_tests_database.h5'
    rename_archive = True
    store_full_input = False
    update_perf_log_repo = True
    push_on_perf_log_repo = False
    pull_3_repos = True
    recompile = True
# Each instance of this class contains information for a single test.
class test_element():
    def __init__(self, input_file=None, n_node=None, n_mpi_per_node=None,
                 n_omp=None, n_cell=None, n_step=None):
        """Store the parameters of one performance test."""
        self.input_file = input_file          # name of the WarpX input file
        self.n_node = n_node                  # number of nodes (None -> set per batch job)
        self.n_mpi_per_node = n_mpi_per_node  # MPI ranks per node
        self.n_omp = n_omp                    # OpenMP threads per MPI rank
        self.n_cell = n_cell                  # [nx, ny, nz] base (1-node) grid
        self.n_step = n_step                  # number of simulation steps
    def scale_n_cell(self, n_node=0):
        """Weak-scale the grid in place: double one dimension (cycling over
        x, y, z) for each halving of n_node down to 1, so the work per node
        stays roughly constant."""
        n_cell_scaled = copy.deepcopy(self.n_cell)
        index_dim = 0
        while n_node > 1:
            n_cell_scaled[index_dim] *= 2
            # BUGFIX: floor division. With true division (Python 3 semantics)
            # a non-power-of-two n_node produced fractional values and extra
            # doubling iterations; '//' matches the original Python-2 behavior.
            n_node //= 2
            index_dim = (index_dim+1) % 3
        self.n_cell = n_cell_scaled
# List of tests to perform
# ------------------------
# Each test runs n_repeat times
n_repeat = 2
# n_node is kept to None and passed in functions as an external argument
# That way, several test_element_instance run with the same n_node on the same batch job
# (input file, base grid, number of steps) for each automated test; all of
# them use 8 MPI ranks per node and 8 OpenMP threads per rank.
_test_specs = [
    ('automated_test_1_uniform_rest_32ppc', [128, 128, 128], 10),
    ('automated_test_2_uniform_rest_1ppc',  [256, 256, 512], 10),
    ('automated_test_3_uniform_drift_4ppc', [128, 128, 128], 10),
    ('automated_test_4_labdiags_2ppc',      [ 64,  64, 128], 50),
    ('automated_test_5_loadimbalance',      [128, 128, 128], 10),
    ('automated_test_6_output_2ppc',        [128, 256, 256],  0),
]
test_list_unq = [ test_element(input_file=name,
                               n_mpi_per_node=8,
                               n_omp=8,
                               n_cell=cells,
                               n_step=steps)
                  for (name, cells, steps) in _test_specs ]
# Duplicate each unique test n_repeat times; deep copies because
# scale_n_cell mutates n_cell later on.
test_list = [copy.deepcopy(item) for item in test_list_unq for _ in range(n_repeat) ]
# Define directories
# ------------------
# All source repos are expected side by side under $AUTOMATED_PERF_TESTS.
source_dir_base = os.environ['AUTOMATED_PERF_TESTS']
warpx_dir = source_dir_base + '/WarpX/'
picsar_dir = source_dir_base + '/picsar/'
amrex_dir = source_dir_base + '/amrex/'
res_dir_base = os.environ['SCRATCH'] + '/performance_warpx/'
# BUGFIX: add the leading '/' for consistency with the other repo paths
# above; without it the path was wrong unless $AUTOMATED_PERF_TESTS
# happened to end with a '/'.
perf_logs_repo = source_dir_base + '/perf_logs/'
# Define dictionaries
# -------------------
compiler_name = {'intel': 'intel', 'gnu': 'gcc'}
module_name = {'cpu': 'haswell', 'knl': 'mic-knl'}
module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache'}
cwd = os.getcwd() + '/'
bin_dir = cwd + 'Bin/'
bin_name = 'perf_tests3d.' + args.compiler + '.' + module_name[args.architecture] + '.TPROF.MPI.OMP.ex'
log_dir = cwd
# Keep the database next to this script (absolute path)
perf_database_file = cwd + perf_database_file
day = time.strftime('%d')
month = time.strftime('%m')
year = time.strftime('%Y')
# Initialize tests
# ----------------
if args.mode == 'run':
    start_date = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
# Set default options for compilation and execution: swap in the programming
# environment and craype modules matching the requested compiler/architecture.
# argparse restricts both options to exactly these two choices each.
_prgenv = {'intel': 'PrgEnv-intel', 'gnu': 'PrgEnv-gnu'}
_craype = {'knl': 'craype-mic-knl', 'cpu': 'craype-haswell'}
_other_compiler = {'intel': 'gnu', 'gnu': 'intel'}
_other_arch = {'knl': 'cpu', 'cpu': 'knl'}
_module_steps = [
    'unload darshan',
    'load craype-hugepages4M',
    'unload ' + _prgenv[_other_compiler[args.compiler]],
    'load ' + _prgenv[args.compiler],
    'unload ' + _craype[_other_arch[args.architecture]],
    'load ' + _craype[args.architecture],
]
config_command = ''.join('module ' + step + ';' for step in _module_steps)
# Create main result directory if does not exist
if not os.path.exists(res_dir_base):
    os.mkdir(res_dir_base)

# Recompile if requested
# ----------------------
if recompile == True:
    if pull_3_repos == True:
        # Update the three source repositories before building
        for repo_dir in (picsar_dir, amrex_dir, warpx_dir):
            git.cmd.Git( repo_dir ).pull()
    # Point the makefile at the requested compiler
    makefile_path = cwd + 'GNUmakefile_perftest'
    with open(makefile_path) as makefile_handler:
        makefile_text = makefile_handler.read()
    makefile_text = re.sub('\nCOMP.*', '\nCOMP=%s' %compiler_name[args.compiler], makefile_text)
    with open(makefile_path, 'w') as makefile_handler:
        makefile_handler.write( makefile_text )
    os.system(config_command + " make -f GNUmakefile_perftest realclean ; " + " rm -r tmp_build_dir *.mod; make -j 8 -f GNUmakefile_perftest")
    # Record the git hash of each repo used for this build
    hash_file = cwd + 'store_git_hashes.txt'
    if os.path.exists( hash_file ):
        os.remove( hash_file )
    for repo_dir, repo_name in ((picsar_dir, 'picsar'),
                                (amrex_dir, 'amrex'),
                                (warpx_dir, 'warpx')):
        store_git_hash(repo_path=repo_dir, filename=hash_file, name=repo_name)
# This function runs a batch script with
# dependencies to perform the analysis
# after all performance tests are done.
def process_analysis():
    """Submit a 1-node SLURM job that re-runs this script with --mode=read,
    with an afterok dependency on every job id listed in log_jobids_tmp.txt
    (one id per line, 4th whitespace-separated token, as written by the run
    phase).

    Returns 0 for backward compatibility with existing callers.
    """
    # BUGFIX: the log and batch-script file handles were never closed;
    # use with-blocks so they are released even on error.
    dependencies = ''
    with open(cwd + 'log_jobids_tmp.txt', 'r') as f_log:
        for line in f_log.readlines():
            dependencies += line.split()[3] + ':'
    batch_string = ''
    batch_string += '#!/bin/bash\n'
    batch_string += '#SBATCH --job-name=warpx_1node_read\n'
    batch_string += '#SBATCH --time=00:07:00\n'
    batch_string += '#SBATCH -C knl\n'
    batch_string += '#SBATCH -N 1\n'
    batch_string += '#SBATCH -S 4\n'
    batch_string += '#SBATCH -q regular\n'
    batch_string += '#SBATCH -e read_error.txt\n'
    batch_string += '#SBATCH -o read_output.txt\n'
    batch_string += '#SBATCH --mail-type=end\n'
    batch_string += '#SBATCH --account=m2852\n'
    batch_string += 'module load h5py-parallel\n'
    # Re-invoke this very script in read mode with the same options
    batch_string += 'python ' + __file__ + ' --compiler=' + \
        args.compiler + ' --architecture=' + args.architecture + \
        ' --mode=read' + \
        ' --n_node_list=' + '"' + args.n_node_list + '"' + \
        ' --start_date=' + start_date
    if args.automated == True:
        batch_string += ' --automated'
    batch_string += '\n'
    batch_file = 'slurm_perfread'
    with open(batch_file, 'w') as f_exe:
        f_exe.write(batch_string)
    os.system('chmod 700 ' + batch_file)
    # dependencies[0:-1] strips the trailing ':' from the id list
    submit_cmd = 'sbatch --dependency afterok:' + dependencies[0:-1] + ' ' + batch_file
    print( 'process_analysis line: ' + submit_cmd)
    os.system(submit_cmd)
    return 0
# Loop over the tests and run all simulations:
# One batch job submitted per n_node. Several
# tests run within the same batch job.
# --------------------------------------------
if args.mode == 'run':
    # Remove any stale job-id log left over from a previous invocation
    if os.path.exists( 'log_jobids_tmp.txt' ):
        os.remove( 'log_jobids_tmp.txt' )
    # loop on n_node. One batch script per n_node
    for n_node in n_node_list:
        res_dir = res_dir_base + '_'.join(
            [run_name, args.compiler, args.architecture, str(n_node)]) + '/'
        # Deep copy: scale_n_cell mutates the n_cell attribute of each
        # test_element instance in place.
        test_list_n_node = copy.deepcopy(test_list)
        runtime_param_list = []
        for current_run in test_list_n_node:
            current_run.scale_n_cell(n_node)
            runtime_param_list.append(
                ' amr.n_cell=' + ' '.join(str(c) for c in current_run.n_cell)
                + ' max_step=' + str( current_run.n_step ))
        # Run the simulations.
        run_batch_nnode(test_list_n_node, res_dir, bin_name, config_command,
                        architecture=args.architecture,
                        Cname=module_Cname[args.architecture],
                        n_node=n_node, runtime_param_list=runtime_param_list)
        os.chdir(cwd)
    # submit batch for analysis
    process_analysis()
# read the output file from each test and store timers in
# hdf5 file with pandas format
# -------------------------------------------------------
for n_node in n_node_list:
    print(n_node)
    if browse_output_files:
        res_dir = res_dir_base + '_'.join(
            [run_name, args.compiler, args.architecture, str(n_node)]) + '/'
        for count, current_run in enumerate(test_list):
            # Read performance data from the output file of this run
            output_filename = 'out_' + '_'.join(
                [current_run.input_file, str(n_node),
                 str(current_run.n_mpi_per_node), str(current_run.n_omp),
                 str(count)]) + '.txt'
            # One dataframe row with ALL the simulation parameters and
            # results. Might be too large for a repo.
            df_newline = extract_dataframe(res_dir + output_filename, current_run.n_step)
            df_newline['git_hashes'] = get_file_content(filename=cwd+'store_git_hashes.txt')
            df_newline['start_date'] = start_date
            df_newline['run_name'] = run_name
            df_newline['input_file'] = current_run.input_file
            df_newline['n_node'] = n_node
            df_newline['n_mpi_per_node'] = current_run.n_mpi_per_node
            df_newline['n_omp'] = current_run.n_omp
            df_newline['n_steps'] = current_run.n_step
            # Repetition index within the n_repeat copies of each test
            df_newline['rep'] = count % n_repeat
            df_newline['date'] = datetime.datetime.now()
            if store_full_input:
                df_newline['inputs_content'] = get_file_content( filename=cwd+current_run.input_file )
            # Load perf_database_file if it exists and append this row,
            # then rewrite the whole file (mode='w' overwrites).
            if os.path.exists(perf_database_file):
                df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
                updated_df = df_base.append(df_newline, ignore_index=True)
            else:
                updated_df = df_newline
            updated_df.to_hdf(perf_database_file, key='all_data', mode='w')
        # Rename directory with precise date+hour for archive purpose
        if rename_archive == True:
            # Find the first unused counter suffix for today's archive name
            loc_counter = 0
            while True:
                res_dir_arch = res_dir_base + '_'.join(
                    [year, month, day, run_name, args.compiler,
                     args.architecture, str(n_node), str(loc_counter)]) + '/'
                if not os.path.exists( res_dir_arch ):
                    break
                loc_counter += 1
            os.rename( res_dir, res_dir_arch )
# Extract sub-set of pandas data frame, write it to
# csv file and copy this file to perf_logs repo
# -------------------------------------------------
if write_csv:
    # First, generate the csv file from the full database (one large file)
    df = pd.read_hdf( perf_database_file )
    # Per-step wall time for every run
    df.loc[:,'step_time'] = pd.Series(df['time_running']/df['n_steps'], index=df.index)
    # Make smaller dataframe with only data to be written to csv file
    df_small = df.copy()
    # For the pure-output test the relevant timer is the plotfile write time
    _is_output_test = df_small['input_file'] == 'automated_test_6_output_2ppc'
    df_small.loc[ _is_output_test, 'step_time'] = df_small[ _is_output_test ]['time_WritePlotFile']
    _csv_columns = ['date', 'input_file', 'git_hashes', 'n_node',
                    'n_mpi_per_node', 'n_omp', 'rep', 'start_date',
                    'time_initialization', 'step_time']
    df_small = df_small.loc[:, _csv_columns ]
    # Write to csv
    # Errors may occur depending on the version of pandas. I had errors with v0.21.0 solved with 0.23.0
    df_small.to_csv( 'cori_knl.csv' )
    # Second, move files to perf_logs repo
    if update_perf_log_repo:
        git_repo = git.Repo( perf_logs_repo )
        if push_on_perf_log_repo:
            git_repo.git.stash('save')
            git_repo.git.pull()
        shutil.move( 'cori_knl.csv', perf_logs_repo + '/logs_csv/cori_knl.csv' )
        os.chdir( perf_logs_repo )
        sys.path.append('./')
        # Project-local module; regenerates index.html as an import side effect
        import generate_index_html
        git_repo.git.add('./index.html')
        git_repo.git.add('./logs_csv/cori_knl.csv')
        index = git_repo.index
        index.commit("automated tests")
|