import os, sys, shutil, datetime, git
import argparse, re, time, copy
import pandas as pd
from functions_perftest import store_git_hash, get_file_content, \
    run_batch_nnode, extract_dataframe

# Get the name of the supercomputer and import configuration functions from the
# machine-specific file
if os.getenv("LMOD_SYSTEM_NAME") == 'summit':
    machine = 'summit'
    from summit import executable_name, process_analysis, \
        get_config_command, time_min, get_submit_job_command, \
        get_batch_string, get_run_string, get_test_list
elif os.getenv("NERSC_HOST") == 'cori':
    machine = 'cori'
    from cori import executable_name, process_analysis, \
        get_config_command, time_min, get_submit_job_command, \
        get_batch_string, get_run_string, get_test_list
else:
    raise RuntimeError("Unknown machine: expected LMOD_SYSTEM_NAME='summit' "
                       "or NERSC_HOST='cori'.")

# Typical use: python run_automated.py --n_node_list='1,8,16,32' --automated
# Assumes the warpx, picsar, amrex and perf_logs repos are in the same directory,
# and that the environment variable AUTOMATED_PERF_TESTS contains the path to
# this directory.

# Requirements (an example setup is sketched below):
# - Python packages: gitpython and pandas
# - AUTOMATED_PERF_TESTS: environment variable pointing to the directory where
#   warpx, amrex and picsar are installed ($AUTOMATED_PERF_TESTS/warpx etc.)
# - SCRATCH: environment variable pointing to where performance results are
#   written. This script creates the folder $SCRATCH/performance_warpx/
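# Example setup (paths below are illustrative, not required):
#   export AUTOMATED_PERF_TESTS=$HOME/automated_perf_tests  # contains warpx/, amrex/, picsar/, perf_logs/
#   export SCRATCH=/scratch/$USER
#   cd $AUTOMATED_PERF_TESTS/warpx/Tools/performance_tests
#   python run_automated.py --n_node_list='1,8,16,32' --automated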

if "AUTOMATED_PERF_TESTS" not in os.environ:
    raise ValueError("environment variable AUTOMATED_PERF_TESTS is not defined.\n"
                     "It should contain the path to the directory where WarpX, "
                     "AMReX and PICSAR repos are.")
if "SCRATCH" not in os.environ:
    raise ValueError("environment variable SCRATCH is not defined.\n"
                     "This script will create $SCRATCH/performance_warpx/ "
                     "to store performance results.")
# Handle parser
###############
parser = argparse.ArgumentParser( description='Run performance tests and write results in files' )
parser.add_argument('--recompile',
                    dest='recompile',
                    action='store_true',
                    default=False)
parser.add_argument('--commit',
                    dest='commit',
                    action='store_true',
                    default=False)
parser.add_argument('--automated',
                    dest='automated',
                    action='store_true',
                    default=False,
                    help='Use to run the automated test list')
parser.add_argument('--n_node_list',
                    dest='n_node_list',
                    default='',
                    help='comma-separated list of numbers of nodes for the runs',
                    type=str)
parser.add_argument('--start_date',
                    dest='start_date' )
parser.add_argument('--compiler',
                    choices=['gnu', 'intel', 'pgi'],
                    default='intel',
                    help='which compiler to use')
parser.add_argument('--architecture',
                    choices=['cpu', 'knl', 'gpu'],
                    default='knl',
                    help='which architecture to cross-compile for NERSC machines')
parser.add_argument('--mode',
                    choices=['run', 'read', 'browse_output_files', 'write_csv'],
                    default='run',
                    help='whether to run the performance tests or read their output; run also triggers read')
args = parser.parse_args()
n_node_list_string = args.n_node_list.split(',') if args.n_node_list else []
n_node_list = [int(i) for i in n_node_list_string]
start_date = args.start_date
compiler = args.compiler
architecture = args.architecture

# Set behavior variables
########################
write_csv = False
browse_output_files = False
if args.mode == 'write_csv':
    write_csv = True
if args.mode == 'browse_output_files':
    browse_output_files = True
if args.mode == 'read':
    write_csv = True
    browse_output_files = True
recompile = args.recompile
perf_database_file = 'my_tests_database.h5'
# Default values for the behavior flags used below (overridden for --automated)
run_name = 'custom_tests'
rename_archive = False
store_full_input = False
update_perf_log_repo = False
push_on_perf_log_repo = False
pull_3_repos = False
if args.automated:
    run_name = 'automated_tests'
    perf_database_file = 'automated_tests_database.h5'
    rename_archive = True
    store_full_input = False
    update_perf_log_repo = True
    push_on_perf_log_repo = False
    pull_3_repos = True
    recompile = True
    if machine == 'summit':
        compiler = 'pgi'
        architecture = 'gpu'

# List of tests to perform
# ------------------------
# Each test runs n_repeat times
n_repeat = 2
# test_list is machine-specific
test_list = get_test_list(n_repeat)
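# Each element of test_list is expected to expose (at least) the attributes
# used below: input_file, n_cell, max_grid_size, blocking_factor, n_step,
# n_mpi_per_node and n_omp, plus a scale_n_cell(n_node) method.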

# Define directories
# ------------------
source_dir_base = os.environ['AUTOMATED_PERF_TESTS']
warpx_dir = source_dir_base + '/warpx/'
picsar_dir = source_dir_base + '/picsar/'
amrex_dir = source_dir_base + '/amrex/'
res_dir_base = os.environ['SCRATCH'] + '/performance_warpx/'
perf_logs_repo = source_dir_base + '/perf_logs/'

# Define dictionaries
# -------------------
compiler_name = {'intel': 'intel', 'gnu': 'gcc', 'pgi':'pgi'}
module_Cname = {'cpu': 'haswell', 'knl': 'knl,quad,cache', 'gpu':''}
csv_file = {'cori':'cori_knl.csv', 'summit':'summit.csv'}
cwd = os.getcwd() + '/'
bin_dir = cwd + 'Bin/'
bin_name = executable_name(compiler, architecture)

log_dir  = cwd
perf_database_file = cwd + perf_database_file
day = time.strftime('%d')
month = time.strftime('%m')
year = time.strftime('%Y')

# Initialize tests
# ----------------
if args.mode == 'run':
    start_date = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
    # Set default options for compilation and execution
    config_command = get_config_command(compiler, architecture)
    # Create main result directory if does not exist
    if not os.path.exists(res_dir_base):
        os.mkdir(res_dir_base)

    # Recompile if requested
    # ----------------------
    if recompile:
        if pull_3_repos:
            git_repo = git.cmd.Git( picsar_dir )
            git_repo.pull()
            git_repo = git.cmd.Git( amrex_dir  )
            git_repo.pull()
            git_repo = git.cmd.Git( warpx_dir  )
            git_repo.pull()
        
        # Copy WarpX/GNUmakefile to current directory and recompile
        # with specific options for automated performance tests.
        # This way, performance test compilation does not mess with user's
        # compilation
        shutil.copyfile("../../GNUmakefile","./GNUmakefile")
        make_realclean_command = " make realclean WARPX_HOME=../.. " \
            "AMREX_HOME=../../../amrex/ PICSAR_HOME=../../../picsar/ " \
            "EBASE=perf_tests COMP=%s" %compiler_name[compiler] + ";"
        make_command = "make -j 16 WARPX_HOME=../.. " \
            "AMREX_HOME=../../../amrex/ PICSAR_HOME=../../../picsar/ " \
            "EBASE=perf_tests COMP=%s" %compiler_name[compiler]
        if machine == 'summit':
            make_command += ' USE_GPU=TRUE '
        os.system(config_command + make_realclean_command + \
                  "rm -r tmp_build_dir *.mod; " + make_command )

        # Store git hashes for WarpX, AMReX and PICSAR into file, so that
        # they can be read when running the analysis.
        if os.path.exists( cwd + 'store_git_hashes.txt' ):
            os.remove( cwd + 'store_git_hashes.txt' )
        store_git_hash(repo_path=picsar_dir, filename=cwd + 'store_git_hashes.txt', name='picsar')
        store_git_hash(repo_path=amrex_dir , filename=cwd + 'store_git_hashes.txt', name='amrex' )
        store_git_hash(repo_path=warpx_dir , filename=cwd + 'store_git_hashes.txt', name='warpx' )

# Loop over the tests and run all simulations:
# One batch job submitted per n_node. Several
# tests run within the same batch job.
# --------------------------------------------
if args.mode == 'run':
    if os.path.exists( 'log_jobids_tmp.txt' ):
        os.remove( 'log_jobids_tmp.txt' )
    # loop on n_node. One batch script per n_node
    for n_node in n_node_list:
        res_dir = res_dir_base
        res_dir += '_'.join([run_name, compiler, architecture, str(n_node)]) + '/'
        runtime_param_list = []
        # Deep copy as we change the attribute n_cell of
        # each instance of class test_element
        test_list_n_node = copy.deepcopy(test_list)
        job_time_min = time_min(len(test_list))
        batch_string = get_batch_string(test_list_n_node, job_time_min, module_Cname[architecture], n_node)
        # Loop on tests
        for count, current_run in enumerate(test_list_n_node):
            current_run.scale_n_cell(n_node)
            runtime_param_string  = ' amr.n_cell=' + ' '.join(str(i) for i in current_run.n_cell)
            runtime_param_string += ' amr.max_grid_size=' + str(current_run.max_grid_size)
            runtime_param_string += ' amr.blocking_factor=' + str(current_run.blocking_factor)
            runtime_param_string += ' max_step=' + str( current_run.n_step )
            # runtime_param_list.append( runtime_param_string )
            run_string = get_run_string(current_run, architecture, n_node, count, bin_name, runtime_param_string)
            batch_string += run_string
            batch_string += 'rm -rf plotfiles lab_frame_data diags\n'
            
        submit_job_command = get_submit_job_command()
        # Run the simulations.
        run_batch_nnode(test_list_n_node, res_dir, bin_name, config_command, batch_string, submit_job_command)
    os.chdir(cwd)
    # submit batch for analysis
    if os.path.exists( 'read_error.txt' ):
        os.remove( 'read_error.txt' )
    if os.path.exists( 'read_output.txt' ):
        os.remove( 'read_output.txt' )
    process_analysis(args.automated, cwd, compiler, architecture, args.n_node_list, start_date)

# read the output file from each test and store timers in
# hdf5 file with pandas format
# -------------------------------------------------------
for n_node in n_node_list:
    print(n_node)
    if browse_output_files:
        res_dir = res_dir_base
        res_dir += '_'.join([run_name, compiler,\
                             architecture, str(n_node)]) + '/'
        for count, current_run in enumerate(test_list):
            # Read performance data from the output file
            output_filename = 'out_' + '_'.join([current_run.input_file, str(n_node), str(current_run.n_mpi_per_node), str(current_run.n_omp), str(count)]) + '.txt'
            # Read data for all tests and store them in an hdf5 database.
            # This file contains ALL the simulation parameters and results,
            # so it may be too large to keep in a repo.
            df_newline = extract_dataframe(res_dir + output_filename, current_run.n_step)
            # Add all simulation parameters to the dataframe
            df_newline['git_hashes'] = get_file_content(filename=cwd+'store_git_hashes.txt')
            df_newline['start_date'] = start_date
            df_newline['run_name'] = run_name
            df_newline['input_file'] = current_run.input_file
            df_newline['n_node'] = n_node
            df_newline['n_mpi_per_node'] = current_run.n_mpi_per_node
            df_newline['n_omp'] = current_run.n_omp
            df_newline['n_steps'] = current_run.n_step
            df_newline['rep'] = count%n_repeat
            df_newline['date'] = datetime.datetime.now()
            if store_full_input:
                df_newline['inputs_content'] = get_file_content( filename=cwd+current_run.input_file )
            # Load file perf_database_file if exists, and
            # append with results from this scan
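            # Note: DataFrame.append was removed in pandas 2.0; with recent
            # pandas versions the equivalent is
            # pd.concat([df_base, df_newline], ignore_index=True).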
            if os.path.exists(perf_database_file):
                # df_base = pd.read_hdf(perf_database_file, 'all_data', format='table')
                df_base = pd.read_hdf(perf_database_file, 'all_data')
                updated_df = df_base.append(df_newline, ignore_index=True)
            else:
                updated_df = df_newline
            # Write dataframe to file perf_database_file
            # (overwrite if file exists)
            updated_df.to_hdf(perf_database_file, key='all_data', mode='w')

# Extract sub-set of pandas data frame, write it to
# csv file and copy this file to perf_logs repo
# -------------------------------------------------
if write_csv:
    # First, extract a small subset of the data frame and write it to a csv file
    df = pd.read_hdf( perf_database_file )
    # Compute the time per step over the full data frame
    df.loc[:,'step_time'] = pd.Series(df['time_running']/df['n_steps'], index=df.index)
    # Make smaller dataframe with only data to be written to csv file
    df_small = df.copy()
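    # For the I/O benchmark (automated_test_6_output_2ppc), report the plotfile
    # write time rather than the particle-push step time, since that is the
    # quantity of interest for this test (inferred from the test name).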
    df_small.loc[ df_small['input_file']=='automated_test_6_output_2ppc', 'step_time'] = \
        df_small[ df_small['input_file']=='automated_test_6_output_2ppc' ]['time_WritePlotFile']
    df_small = df_small.loc[:, ['date', 'input_file', 'git_hashes', 'n_node', 'n_mpi_per_node', 'n_omp', 'rep', 'start_date', 'time_initialization', 'step_time'] ]
    # Write to csv
    df_small.to_csv( csv_file[machine] )
    # Errors may occur depending on the pandas version: errors seen with v0.21.0 were solved with v0.23.0
    # Second, move files to perf_logs repo
    if update_perf_log_repo:
        # get perf_logs repo
        git_repo = git.Repo( perf_logs_repo )
        if push_on_perf_log_repo:
            git_repo.git.stash('save')
            git_repo.git.pull()
        # move the csv file to the perf_logs repo and commit the new version
        shutil.move( csv_file[machine], perf_logs_repo + '/logs_csv/' + csv_file[machine] )
        os.chdir( perf_logs_repo )
        sys.path.append('./')
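        # importing generate_index_html runs its top-level code, which is
        # expected to regenerate ./index.html before it is added and committed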
        import generate_index_html
        git_repo.git.add('./index.html')
        git_repo.git.add('./logs_csv/' + csv_file[machine])
        index = git_repo.index
        index.commit("automated tests")

# Rename all result directories for archiving purposes:
# include date in the name, and a counter to avoid over-writing
for n_node in n_node_list:
    if browse_output_files:
        res_dir = res_dir_base
        res_dir += '_'.join([run_name, compiler,\
                             architecture, str(n_node)]) + '/'
        # Rename directory with precise date+hour for archive purpose
        if rename_archive:
            loc_counter = 0
            res_dir_arch = res_dir_base
            res_dir_arch += '_'.join([year, month, day, run_name, compiler,\
                                      architecture, str(n_node), str(loc_counter)]) + '/'
            while os.path.exists( res_dir_arch ):
                loc_counter += 1
                res_dir_arch = res_dir_base
                res_dir_arch += '_'.join([year, month, day, run_name, compiler,\
                                          architecture, str(n_node), str(loc_counter)]) + '/'
            print("renaming " + res_dir + " -> " + res_dir_arch)
            os.rename( res_dir, res_dir_arch )