import os, sys, shutil
import argparse, re, time
from functions_perftest import *

# This script runs automated performance tests for WarpX.
# It runs the tests in the list test_list defined below, and writes
# results to the file performance_log.txt in warpx/performance_tests/

# ---- User's manual ----
# Before running performance tests, make sure you have the latest version 
# of performance_log.txt
# A typical execution reads:
# > python run_alltests.py --no-recompile --compiler=gnu --architecture=cpu --mode=run --log_file='my_performance_log.txt'
# These are default values, and will give the same result as 
# > python run_alltests.py
# To add a new test item, extend test_list with a line like
# test_list.extend([['my_input_file', n_node, n_mpi, n_omp]]*3)
# - my_input_file must be in warpx/performance_tests
# - the test will run 3 times, to gather some statistics
# - each test must take < 1h or it will time out

# ---- Developer's manual ----
# This script can run in two modes:
# - 'run' mode: for each test item, a batch job is executed.
#     create folder '$SCRATCH/performance_warpx/'
#     recompile the code if option --recompile is used
#     loop over test_list and submit one batch script per item
#     Submit a batch job that executes the script in read mode
#     This last job runs once all others are completed
# - 'read' mode: Get performance data from all test items
#     create the performance log file if it does not exist
#     loop over test_list
#         read initialization time and step time
#         write data into the performance log file
#         push file performance_log.txt to the repo

# Define the list of tests to run
# -------------------------------
# each element of test_list contains
# [str runname, int n_node, int n_mpi PER NODE, int n_omp]
test_list = []
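# Each test item is appended n_repeat times, so that statistics can be
# gathered over several identical runs.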
n_repeat = 3
basename1 = 'uniform_t0.01_'

test_list.extend([[basename1 +  '128',    1, 16, 8]]*n_repeat)
test_list.extend([[basename1 +  '128',    1, 32, 16]]*n_repeat)

# test_list.extend([[basename1 +  '128',    1, 16, 8]]*n_repeat)
# test_list.extend([[basename1 +  '256',    8, 16, 8]]*n_repeat)
# test_list.extend([[basename1 +  '512',   64, 16, 8]]*n_repeat)
# test_list.extend([[basename1 + '1024',  512, 16, 8]]*n_repeat)
# test_list.extend([[basename1 + '2048', 4096, 16, 8]]*n_repeat)


# test_list.extend([['uniform_t0.01_direct1_1ppc_128', 1, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_direct3_1ppc_128', 1, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk1_1ppc_128', 1, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk3_1ppc_128', 1, 16, 8]]*n_repeat)

# test_list.extend([['uniform_t0.01_direct1_1ppc_256', 8, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_direct3_1ppc_256', 8, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk1_1ppc_256', 8, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk3_1ppc_256', 8, 16, 8]]*n_repeat)

# test_list.extend([['uniform_t0.01_direct1_1ppc_512', 64, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_direct3_1ppc_512', 64, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk1_1ppc_512', 64, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk3_1ppc_512', 64, 16, 8]]*n_repeat)

# test_list.extend([['uniform_t0.01_direct1_1ppc_1024', 512, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_direct3_1ppc_1024', 512, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk1_1ppc_1024', 512, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk3_1ppc_1024', 512, 16, 8]]*n_repeat)

# test_list.extend([['uniform_t0.01_direct1_1ppc_2048', 4096, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_direct3_1ppc_2048', 4096, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk1_1ppc_2048', 4096, 16, 8]]*n_repeat)
# test_list.extend([['uniform_t0.01_esirk3_1ppc_2048', 4096, 16, 8]]*n_repeat)

n_tests   = len(test_list)

# Read command-line arguments
# ---------------------------
# Create parser and read arguments
parser = argparse.ArgumentParser(
    description='Run performance tests and write results in files')
parser.add_argument('--recompile', dest='recompile', action='store_true', default=False)
parser.add_argument('--no-recompile', dest='recompile', action='store_false', default=False)
parser.add_argument('--commit', dest='commit', action='store_true', default=False)
parser.add_argument( '--compiler', choices=['gnu', 'intel'], default='gnu',
    help='which compiler to use')
parser.add_argument( '--architecture', choices=['cpu', 'knl'], default='cpu',
    help='which architecture to cross-compile for NERSC machines')
parser.add_argument( '--mode', choices=['run', 'read'], default='run',
    help='whether to run perftests or read their perf output. run calls read')
parser.add_argument( '--log_file', dest='log_file', default='my_performance_log.txt',
    help='name of the log file where data will be written; ignored if --commit is used')

args = parser.parse_args()

log_file = args.log_file
if args.commit:
    log_file = 'performance_log.txt'

# Dictionaries
# compiler names, as expected by the makefile (COMP=...)
compiler_name = {'intel': 'intel', 'gnu': 'gcc'}
# architecture names, used in the WarpX executable name
module_name = {'cpu': 'haswell', 'knl': 'mic-knl'}
# architecture names, used in the batch scripts
module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache'}
# Define environment variables
cwd = os.getcwd() + '/'
res_dir_base = os.environ['SCRATCH'] + '/performance_warpx/'
bin_dir = cwd + 'Bin/'
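# With the default options (--compiler=gnu --architecture=cpu), the executable
# name below resolves to perf_tests3d.gnu.haswell.TPROF.MPI.OMP.ex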
bin_name = 'perf_tests3d.' + args.compiler + '.' + module_name[args.architecture] + '.TPROF.MPI.OMP.ex'
log_dir  = cwd

day = time.strftime('%d')
month = time.strftime('%m')
year = time.strftime('%Y')

# Initialize tests
# ----------------
if args.mode == 'run':
    # Set default options for compilation and execution
    config_command = ''
    config_command += 'module unload darshan;' 
    config_command += 'module load craype-hugepages4M;'
    if args.architecture == 'knl':
        if args.compiler == 'intel':
            config_command += 'module unload PrgEnv-gnu;'
            config_command += 'module load PrgEnv-intel;'
        elif args.compiler == 'gnu':
            config_command += 'module unload PrgEnv-intel;'
            config_command += 'module load PrgEnv-gnu;'
        config_command += 'module unload craype-haswell;'
        config_command += 'module load craype-mic-knl;'
    elif args.architecture == 'cpu':
        if args.compiler == 'intel':
            config_command += 'module unload PrgEnv-gnu;'
            config_command += 'module load PrgEnv-intel;'
        elif args.compiler == 'gnu':
            config_command += 'module unload PrgEnv-intel;'
            config_command += 'module load PrgEnv-gnu;'
        config_command += 'module unload craype-mic-knl;'
        config_command += 'module load craype-haswell;'
    # Create the main result directory if it does not exist
    if not os.path.exists(res_dir_base):
        os.mkdir(res_dir_base)

# Recompile if requested
if args.recompile:
    with open(cwd + 'GNUmakefile_perftest') as makefile_handler:
        makefile_text = makefile_handler.read()
    makefile_text = re.sub('\nCOMP.*', '\nCOMP=%s' % compiler_name[args.compiler], makefile_text)
    with open(cwd + 'GNUmakefile_perftest', 'w') as makefile_handler:
        makefile_handler.write(makefile_text)
    os.system(config_command + " make -f GNUmakefile_perftest realclean ; " + " rm -r tmp_build_dir *.mod; make -j 8 -f GNUmakefile_perftest")

# Define functions to run a test and analyse results
# --------------------------------------------------
def process_analysis():
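    """Submit a batch job that runs this script in 'read' mode.

    The job depends on all the test jobs listed in log_jobids_tmp.txt, so it
    only starts once they are all completed (see the developer's manual above).
    """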
    dependencies = ''
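    # Each line of log_jobids_tmp.txt is expected to hold the sbatch output for
    # one test ('Submitted batch job <jobid>'), so the jobid is the 4th field.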
    with open(cwd + 'log_jobids_tmp.txt', 'r') as f_log:
        for count, current_run in enumerate(test_list):
            line = f_log.readline()
            print(line)
            dependencies += line.split()[3] + ':'
    batch_string = ''
    batch_string += '#!/bin/bash\n'
    batch_string += '#SBATCH --job-name=warpx_read\n'
    batch_string += '#SBATCH --time=00:05:00\n'
    batch_string += '#SBATCH -C ' + module_Cname[args.architecture] + '\n'
    batch_string += '#SBATCH -N 1\n'
    batch_string += '#SBATCH -S 4\n'
    batch_string += '#SBATCH -q regular\n'
    batch_string += '#SBATCH -e read_error.txt\n'
    batch_string += '#SBATCH -o read_output.txt\n'
    batch_string += '#SBATCH --mail-type=end\n'
    batch_string += '#SBATCH --account=m2852\n'
    batch_string += 'python ' + __file__ + ' --no-recompile --compiler=' + \
                    args.compiler + ' --architecture=' + args.architecture + \
                    ' --mode=read' + ' --log_file=' + log_file
    if args.commit:
        batch_string += ' --commit'
    batch_string += '\n'
    batch_file = 'slurm_perfread'
    with open(batch_file, 'w') as f_exe:
        f_exe.write(batch_string)
    os.system('chmod 700 ' + batch_file)
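    # The 'afterok' dependency makes the read job start only after all the
    # test jobs above have completed successfully.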
    os.system('sbatch  --dependency afterok:' + dependencies[0:-1] + ' ' + batch_file)
    return 0
 
# Loop over the tests and return run time + details
# -------------------------------------------------
if args.mode == 'run':
    # Remove the file log_jobids_tmp.txt if it exists.
    # This file contains the jobid of every perf test
    # and is used to manage the analysis script dependencies.
    if os.path.isfile(cwd + 'log_jobids_tmp.txt'):
        os.remove(cwd + 'log_jobids_tmp.txt')
    for count, current_run in enumerate(test_list):
        # Results folder
        print('run ' + str(current_run))
        run_name = current_run[0]
        n_node   = current_run[1]
        n_mpi    = current_run[2]
        n_omp    = current_run[3]
        n_steps  = get_nsteps(cwd + run_name)
        res_dir = res_dir_base
        res_dir += '_'.join([run_name, args.compiler,\
                         args.architecture, str(n_node), str(n_mpi),\
                             str(n_omp), str(count)]) + '/'
        # Run the simulation.
        # If you are currently in an interactive session and want to run interactively,
        # just replace run_batch with run_interactive.
        run_batch(run_name, res_dir, bin_name, config_command, architecture=args.architecture, \
                  Cname=module_Cname[args.architecture], n_node=n_node, n_mpi=n_mpi, n_omp=n_omp)
    os.chdir(cwd)
    process_analysis()

if args.mode == 'read':
    # Create the log file for performance tests if it does not exist
    if not os.path.isfile(log_dir + log_file):
        log_line = '## year month day run_name compiler architecture n_node n_mpi ' +\
                   'n_omp time_initialization time_one_iteration Redistribute '+\
                   'FillBoundary ParallelCopy CurrentDeposition FieldGather '+\
                   'ParticlePush Copy EvolveEM Checkpoint '+\
                   'WriteParticles Write_FabArray '+\
                   'WriteMultiLevelPlotfile '+\
                   'RedistributeMPI (unit: seconds)\n'
        with open(log_dir + log_file, 'a') as f_log:
            f_log.write(log_line)
    for count, current_run in enumerate(test_list):
        # Results folder
        print('read ' + str(current_run))
        run_name = current_run[0]
        n_node   = current_run[1]
        n_mpi    = current_run[2]
        n_omp    = current_run[3]
        n_steps  = get_nsteps(cwd + run_name)
        print('n_steps = ' + str(n_steps))
        res_dir = res_dir_base
        res_dir += '_'.join([run_name, args.compiler,\
                             args.architecture, str(n_node), str(n_mpi),\
                             str(n_omp), str(count)]) + '/'
#        res_dir += '_'.join([year, month, '25', run_name, args.compiler,\
#                             args.architecture, str(n_node), str(n_mpi), \
#                             str(n_omp)]) + '/'
        # Read performance data from the output file
        output_filename = 'perf_output.txt'
        timing_list = read_run_perf(res_dir + output_filename, n_steps)
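        # timing_list is expected to hold one value per timing column of the
        # header line above (see read_run_perf in functions_perftest)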
        # Write performance data to the performance log file
        log_line = ' '.join([year, month, day, run_name, args.compiler,\
                             args.architecture, str(n_node), str(n_mpi),\
                             str(n_omp)] +  timing_list + ['\n'])
        write_perf_logfile(log_dir + log_file, log_line)

    # Store test parameters for the record
    dir_record_base = './perf_warpx_record/'
    if not os.path.exists(dir_record_base):
        os.mkdir(dir_record_base)
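    # Use an increasing suffix so that several 'read' passes on the same day
    # get their own record directory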
    count = 0
    dir_record = dir_record_base + '_'.join([year, month, day]) + '_0'
    while os.path.exists(dir_record):
        count += 1
        dir_record = dir_record[:-1] + str(count)
    os.mkdir(dir_record)
    shutil.copy(__file__, dir_record)
    shutil.copy(log_dir + log_file, dir_record)
    for count, current_run in enumerate(test_list):
        shutil.copy(current_run[0], dir_record)

    for count, current_run in enumerate(test_list):
        run_name = current_run[0]
        n_node   = current_run[1]
        n_mpi    = current_run[2]
        n_omp    = current_run[3]
        res_dir = res_dir_base
        res_dir += '_'.join([run_name, args.compiler,\
                             args.architecture, str(n_node), str(n_mpi),\
                             str(n_omp), str(count)]) + '/'
        res_dir_arch = res_dir_base
        res_dir_arch += '_'.join([year, month, day, run_name, args.compiler,\
                                  args.architecture, str(n_node), str(n_mpi), \
                                  str(n_omp), str(count)]) + '/'
        os.rename(res_dir, res_dir_arch)

    # Commit results to the repo
    if args.commit:
        os.system('git add ' + log_dir + log_file + ';'\
                  'git commit -m "performance tests";'\
                  'git push -u origin master')