Tools/PerformanceTests/summit.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143

# Copyright 2019 Axel Huebl, Luca Fedeli, Maxence Thevenet
#
#
# This file is part of WarpX.
#
# License: BSD-3-Clause-LBNL

# requirements:
# - module load python/3.7.0-anaconda3-5.3.0

import os, copy
from functions_perftest import test_element

def executable_name(compiler,architecture):
    """Return the file name of the 3D performance-test executable.

    Only ``compiler`` ends up in the name; ``architecture`` is accepted
    but not used by this function.
    """
    name_parts = ('perf_tests3d.', compiler, '.TPROF.MTMPI.CUDA.QED.GPUCLOCK.ex')
    return ''.join(name_parts)

def get_config_command(compiler, architecture):
    """Return the shell snippet that loads the modules needed for the build.

    The result is a sequence of ``module load`` commands, each terminated
    by ``;``. Neither ``compiler`` nor ``architecture`` influences it.
    """
    module_names = ('gcc', 'cuda')
    return ''.join('module load ' + module + ';' for module in module_names)

# This function writes and submits a batch script that
# performs the analysis. The script depends on the jobs
# listed in the job-id log, so it only runs after all
# performance tests are done.
def process_analysis(automated, cwd, compiler, architecture, n_node_list, start_date, path_source, path_results):
    """Write the analysis batch script ``bsub_perfread`` and submit it with ``bsub``.

    Parameters
    ----------
    automated : if equal to ``True``, ``--automated`` is appended to the
        ``run_automated.py`` command line.
    cwd : string prefix of the job-id log; it is concatenated directly with
        ``'log_jobids_tmp.txt'``, so it must end with a path separator.
    compiler, architecture, n_node_list, start_date, path_source, path_results :
        strings forwarded as command-line options to ``run_automated.py``
        (``n_node_list`` is wrapped in double quotes).

    Side effects: creates/overwrites ``bsub_perfread`` in the current
    working directory, makes it accessible to its owner only (mode 700),
    prints the submission command and runs it through the shell.
    """
    batch_string = '''#!/bin/bash
#BSUB -P APH114
#BSUB -W 00:10
#BSUB -nnodes 1
#BSUB -J perf_test
#BSUB -o read_output.txt
#BSUB -e read_error.txt
'''
    # One dependency line per job recorded in the log. The context manager
    # guarantees the log is closed even if parsing fails.
    with open(cwd + 'log_jobids_tmp.txt', 'r') as f_log:
        for line in f_log:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline) in the log
                continue
            # The job id is the second field without its first and last
            # character (presumably the '<' and '>' printed by bsub -- verify
            # against the code that writes the log).
            dependency = fields[1][1:-1]
            batch_string += '#BSUB -w ended(' + dependency + ')\n'

    batch_string += 'python run_automated.py --compiler=' + \
        compiler + ' --architecture=' + architecture + \
        ' --mode=read' + \
        ' --n_node_list=' + '"' + n_node_list + '"' + \
        ' --start_date=' + start_date + \
        ' --path_source=' + path_source + \
        ' --path_results=' + path_results
    # Explicit comparison kept on purpose: truthy values that are not equal
    # to True (e.g. non-empty strings) must not enable the flag.
    if automated == True:
        batch_string += ' --automated'
    batch_string += '\n'

    batch_file = 'bsub_perfread'
    with open(batch_file, 'w') as f_exe:
        f_exe.write(batch_string)
    # Same permissions as 'chmod 700', without spawning a shell
    os.chmod(batch_file, 0o700)
    print( 'process_analysis line:  ' + 'bsub ' + batch_file)
    os.system('bsub ' + batch_file)

# Estimate the job time in minutes: a fixed 2 min plus 2 min per simulation
def time_min(nb_simulations):
    base_minutes = 2.
    minutes_per_simulation = 2.
    return base_minutes + nb_simulations * minutes_per_simulation

def get_submit_job_command():
    """Return the job submission command, padded with one space on each side."""
    submit_command = 'bsub'
    return ' ' + submit_command + ' '

def get_batch_string(test_list, job_time_min, Cname, n_node):
    """Return the header of the batch script for one group of tests.

    Parameters
    ----------
    test_list : sequence of test objects; only ``test_list[0].input_file``
        is read, and it becomes the job name.
    job_time_min : requested wall time in minutes.
    Cname : accepted but not used by this function.
    n_node : number of nodes requested with ``-nnodes``.

    The header ends with the ``module load`` lines; every line is
    terminated by a newline.
    """
    # Wall time is written as hours:minutes, without zero padding
    hours = int(job_time_min/60)
    minutes = int(job_time_min%60)
    wall_time = '{}:{}'.format(hours, minutes)

    header_lines = [
        '#!/bin/bash',
        '#BSUB -P APH114',
        '#BSUB -W ' + wall_time,
        '#BSUB -nnodes ' + str(n_node),
        '#BSUB -J ' + test_list[0].input_file,
        '#BSUB -e error.txt',
        'module load gcc',
        'module load cuda',
    ]
    return ''.join(line + '\n' for line in header_lines)

def get_run_string(current_test, architecture, n_node, count, bin_name, runtime_param_string):
    """Return the ``jsrun`` command line (newline-terminated) for one test run.

    Parameters
    ----------
    current_test : test object; ``input_file``, ``n_mpi_per_node`` and
        ``n_omp`` are read from it.
    architecture : accepted but not used by this function.
    n_node : value passed to ``jsrun -n``; also part of the output file name.
    count : run index, used only in the output file name.
    bin_name : executable name, invoked as ``./<bin_name>``.
    runtime_param_string : extra arguments appended after the input file.

    Standard output of the run is redirected to
    ``out_<input_file>_<n_node>_<n_mpi_per_node>_<n_omp>_<count>.txt``.
    """
    name_fields = (
        current_test.input_file,
        str(n_node),
        str(current_test.n_mpi_per_node),
        str(current_test.n_omp),
        str(count),
    )
    log_name = 'out_' + '_'.join(name_fields) + '.txt'

    # The same value (n_mpi_per_node) is given to the -a, -g and -c options
    per_set = str(current_test.n_mpi_per_node)
    pieces = [
        'jsrun ',
        ' -n ' + str(n_node),
        ' -a ' + per_set + ' -g ' + per_set + ' -c ' + per_set + ' --bind=packed:1 ',
        ' ./' + bin_name + ' ',
        current_test.input_file + ' ',
        runtime_param_string,
        ' > ' + log_name + '\n',
    ]
    return ''.join(pieces)

def get_test_list(n_repeat):
    """Return the list of performance tests, each one repeated ``n_repeat`` times.

    Repetitions of a test are adjacent in the returned list and are
    independent deep copies. All tests use 6 MPI ranks per node and
    1 OpenMP thread.
    """
    # n_node is not set on the elements: it is passed to the other functions
    # as an external argument, so that several test_element instances can run
    # with the same n_node in the same batch job.
    #
    # Columns: input_file, n_cell, max_grid_size, blocking_factor, n_step
    test_table = (
        ('automated_test_1_uniform_rest_32ppc', [128, 128, 192], 256, 32, 10),
        ('automated_test_2_uniform_rest_1ppc', [256, 512, 768], 512, 256, 10),
        ('automated_test_3_uniform_drift_4ppc', [128, 128, 384], 256, 64, 10),
        ('automated_test_4_labdiags_2ppc', [384, 256, 512], 256, 128, 50),
        ('automated_test_5_loadimbalance', [64, 64, 192], 64, 32, 10),
        ('automated_test_6_output_2ppc', [384, 256, 512], 256, 64, 1),
    )
    unique_tests = [
        test_element(input_file=name,
                     n_mpi_per_node=6,
                     n_omp=1,
                     n_cell=cells,
                     max_grid_size=grid_size,
                     blocking_factor=blocking,
                     n_step=steps)
        for name, cells, grid_size, blocking, steps in test_table
    ]

    repeated_tests = []
    for unique in unique_tests:
        for _ in range(n_repeat):
            repeated_tests.append(copy.deepcopy(unique))
    return repeated_tests