4 #include "../shared/util.h" 8 #define MPI_CXX_DOUBLE_COMPLEX MPI::DOUBLE_COMPLEX 16 std::mersenne_twister_engine<std::uint_fast64_t, 64, 312, 156, 31,
17 0xb5026f5aa96619e9, 29,
18 0x5555555555555555, 17,
19 0x71d67fffeda60000, 37,
20 0xfff7eee000000000, 43, 6364136223846793005>
rng;
28 return ((
double)
rng()-(
double)
rng.min())/
rng.max();
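// The engine parameters above are exactly those of std::mt19937_64. A minimal
// standalone sketch (seed value chosen arbitrarily here) of the equivalent
// uniform draw using the standard typedef:
#include <cstdio>
#include <random>

int main(){
  std::mt19937_64 gen(42);                          // same engine, explicit seed
  std::uniform_real_distribution<double> uni(0.0, 1.0);
  for (int i = 0; i < 3; i++)
    std::printf("%f\n", uni(gen));                  // uniform samples in [0,1)
  return 0;
}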
template <typename type>
int conv_idx(int order, type const * cidx, int ** iidx){
  /* ... assign a small integer id to each distinct index label in cidx ... */
  for (i=0; i<order; i++){
    /* ... if cidx[i] matches an earlier label cidx[j], reuse its id ... */
        (*iidx)[i] = (*iidx)[j];
    /* ... otherwise give it the next fresh id ... */
  }
  /* ... */
}
template <typename type>
int conv_idx(int order_A, type const * cidx_A, int ** iidx_A,
             int order_B, type const * cidx_B, int ** iidx_B){
  /* ... */
  n = conv_idx(order_A, cidx_A, iidx_A);
  for (i=0; i<order_B; i++){
    /* ... */
    for (j=0; j<order_A; j++){
      /* ... label of B also appears in A: reuse A's integer id ... */
        (*iidx_B)[i] = (*iidx_A)[j];
    }
    /* ... otherwise, if the label appeared earlier in B, reuse that id ... */
        (*iidx_B)[i] = (*iidx_B)[j];
    /* ... otherwise assign the next fresh id ... */
  }
  /* ... */
}
template <typename type>
int conv_idx(int order_A, type const * cidx_A, int ** iidx_A,
             int order_B, type const * cidx_B, int ** iidx_B,
             int order_C, type const * cidx_C, int ** iidx_C){
  /* ... */
  n = conv_idx(order_A, cidx_A, iidx_A,
               order_B, cidx_B, iidx_B);
  for (i=0; i<order_C; i++){
    /* ... */
    for (j=0; j<order_B; j++){
      /* ... label of C also appears in B: reuse B's integer id ... */
        (*iidx_C)[i] = (*iidx_B)[j];
    }
    for (j=0; j<order_A; j++){
      /* ... label of C also appears in A: reuse A's integer id ... */
        (*iidx_C)[i] = (*iidx_A)[j];
    }
    /* ... otherwise, if the label appeared earlier in C, reuse that id ... */
        (*iidx_C)[i] = (*iidx_C)[j];
    /* ... otherwise assign the next fresh id ... */
  }
  /* ... */
}
template int conv_idx<int>(int, int const *, int **,
                           int, int const *, int **);
template int conv_idx<char>(int, char const *, int **,
                            int, char const *, int **);
template int conv_idx<int>(int, int const *, int **,
                           int, int const *, int **,
                           int, int const *, int **);
template int conv_idx<char>(int, char const *, int **,
                            int, char const *, int **,
                            int, char const *, int **);
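// A standalone sketch of the label-to-id mapping that conv_idx performs
// (written independently here for illustration): character index labels from
// several tensors are mapped to consecutive integer ids, reusing an id
// whenever a label repeats, as in a contraction C["ij"] = A["ik"]*B["kj"].
#include <cstdio>
#include <map>
#include <string>
#include <vector>

static std::vector<std::vector<int>> label_ids(std::vector<std::string> const & idx){
  std::map<char,int> seen;
  std::vector<std::vector<int>> out;
  int n = 0;
  for (auto const & s : idx){
    std::vector<int> ids;
    for (char c : s){
      if (!seen.count(c)) seen[c] = n++;   // first occurrence gets a fresh id
      ids.push_back(seen[c]);              // repeats reuse the earlier id
    }
    out.push_back(ids);
  }
  return out;
}

int main(){
  auto ids = label_ids({"ik", "kj", "ij"});    // A, B, C of a matrix product
  for (auto const & v : ids){
    for (int x : v) std::printf("%d ", x);     // prints: 0 1 / 1 2 / 0 2
    std::printf("\n");
  }
  return 0;
}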
#if (!BGP && !BGQ && !HOPPER)
  // print a stack trace for this rank by resolving each return address with
  // addr2line against the running executable
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
  /* ... */
  size = backtrace(array, 50);
  /* ... */
  backtrace_symbols(array, size);
  char syscom[2048*size];
  for (i=1; i<size; ++i){
    /* ... */
    int sz = readlink("/proc/self/exe", buf, bufsize);
    /* ... */
    sprintf(buf2, "addr2line %p -e %s", array[i], buf);
    /* ... */
  }
  /* ... */
  assert(system(syscom)==0);
  /* ... */
  printf("%d",iiarr[0]);
  /* ... */
#endif
CommData::CommData(){
  /* ... */
}

CommData::~CommData(){
  /* ... */
}

// wrap an existing communicator
CommData::CommData(MPI_Comm cm_){
  /* ... */
  MPI_Comm_rank(cm, &rank);
  MPI_Comm_size(cm, &np);
  /* ... */
}

// record rank/color/size without creating a communicator yet
CommData::CommData(int rank_, int color_, int np_){
  /* ... */
}

// split the parent communicator by color to obtain this one
CommData::CommData(int rank_, int color_, CommData parent){
  /* ... */
  MPI_Comm_split(parent.cm, color, rank_, &cm);
  MPI_Comm_size(cm, &np);
  /* ... */
}

void CommData::activate(MPI_Comm parent){
  /* ... */
  MPI_Comm_split(parent, color, rank, &cm);
  /* ... */
  MPI_Comm_size(cm, &np_);
  /* ... */
}

void CommData::deactivate(){
  /* ... */
  MPI_Finalized(&is_finalized);
  if (!is_finalized) MPI_Comm_free(&cm);
  /* ... */
}
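// A self-contained sketch of the communicator-splitting pattern used by these
// constructors: build a row communicator from MPI_COMM_WORLD with
// MPI_Comm_split (the 2D grid shape below is illustrative).
#include <mpi.h>
#include <cstdio>

int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  int rank, np;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  // ranks sharing a color end up in the same communicator; the key argument
  // orders ranks within it
  int ncols = 2;                        // illustrative grid width
  int color = rank / ncols;             // row index
  int key   = rank % ncols;             // position within the row
  MPI_Comm row_comm;
  MPI_Comm_split(MPI_COMM_WORLD, color, key, &row_comm);

  int row_rank, row_np;
  MPI_Comm_rank(row_comm, &row_rank);
  MPI_Comm_size(row_comm, &row_np);
  std::printf("world %d/%d -> row %d, local %d/%d\n",
              rank, np, color, row_rank, row_np);

  MPI_Comm_free(&row_comm);
  MPI_Finalize();
  return 0;
}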
double CommData::estimate_bcast_time(int64_t msg_sz){
  double ps[] = {1.0, log2((double)np), (double)msg_sz};
  return bcast_mdl.est_time(ps);
}

double CommData::estimate_allred_time(int64_t msg_sz, MPI_Op op){
  double ps[] = {1.0, log2((double)np), (double)msg_sz*log2((double)(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    return allred_mdl.est_time(ps);      // predefined MPI op
  else
    return allred_mdl_cst.est_time(ps);  // otherwise use the _cst model
}

double CommData::estimate_red_time(int64_t msg_sz, MPI_Op op){
  double ps[] = {1.0, log2((double)np), (double)msg_sz*log2((double)(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    return red_mdl.est_time(ps);
  else
    return red_mdl_cst.est_time(ps);
}

double CommData::estimate_alltoall_time(int64_t chunk_sz) {
  double ps[] = {1.0, log2((double)np), log2((double)np)*np*chunk_sz};
  return alltoall_mdl.est_time(ps);
}

double CommData::estimate_alltoallv_time(int64_t tot_sz) {
  double ps[] = {1.0, log2((double)np), log2((double)np)*tot_sz};
  return alltoallv_mdl.est_time(ps);
}
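// The estimate functions above evaluate a three-parameter linear cost model
// (constant, latency-like, and bandwidth-like terms). A standalone toy sketch
// of such a model; the coefficients are made up, and unlike the real LinModel
// it does not refit its coefficients from the observations it records:
#include <cstdio>
#include <vector>

struct ToyLinModel {
  double coeff[3];                        // {constant, per-log2(p), per-byte}
  std::vector<std::vector<double>> obs;   // recorded {time, params...} rows

  double est_time(double const * param) const {
    return coeff[0]*param[0] + coeff[1]*param[1] + coeff[2]*param[2];
  }
  void observe(double const * time_param){
    // time_param[0] is the measured time, time_param[1..3] are the parameters
    obs.push_back({time_param[0], time_param[1], time_param[2], time_param[3]});
  }
};

int main(){
  ToyLinModel bcast_model{{1e-5, 2e-6, 5e-10}, {}};
  double ps[] = {1.0, 6.0 /* log2(64 ranks) */, 8.0*(1<<20) /* bytes */};
  std::printf("estimated bcast time: %g s\n", bcast_model.est_time(ps));
  return 0;
}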
void CommData::bcast(void * buf, int64_t count, MPI_Datatype mdtype, int root){
  /* ... model parameters for this call: {time, 1, log2(p), bytes} ... */
  MPI_Type_size(mdtype, &tsize_);
  double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_};
  /* ... decide whether to observe this call, then time it ... */
  double st_time = MPI_Wtime();
  MPI_Bcast(buf, count, mdtype, root, cm);
  /* ... */
  double exe_time = MPI_Wtime()-st_time;
  /* ... record the observation for the broadcast model ... */
  MPI_Type_size(mdtype, &tsize);
  double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize};
  /* ... */
}
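// The same time-then-record pattern is used by allred, red, and all_to_allv
// below. A self-contained sketch of timing one collective with MPI_Wtime; the
// record() helper here is a stand-in for the model's observe():
#include <mpi.h>
#include <cmath>
#include <cstdio>

static void record(double const * tp){
  std::printf("time %.3e s, params {%g, %g, %g}\n", tp[0], tp[1], tp[2], tp[3]);
}

int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  int rank, np;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  int count = 1<<20;
  double * buf = new double[count];
  for (int i = 0; i < count; i++) buf[i] = rank;

  int tsize;
  MPI_Type_size(MPI_DOUBLE, &tsize);

  double st_time = MPI_Wtime();
  MPI_Bcast(buf, count, MPI_DOUBLE, 0, MPI_COMM_WORLD);
  double exe_time = MPI_Wtime() - st_time;

  // observation: measured time followed by the parameters {1, log2(p), bytes}
  double tps[] = {exe_time, 1.0, log2((double)np), ((double)count)*tsize};
  if (rank == 0) record(tps);

  delete[] buf;
  MPI_Finalize();
  return 0;
}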
void CommData::allred(void * inbuf, void * outbuf, int64_t count,
                      MPI_Datatype mdtype, MPI_Op op){
  /* ... */
  MPI_Type_size(mdtype, &tsize_);
  double tps_[] = {0.0, 1.0, log2(np),
                   ((double)count)*tsize_*std::max(.5,(double)log2(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE){
    /* ... predefined ops and other ops are tracked by separate models ... */
  }
  /* ... */
  double st_time = MPI_Wtime();
  MPI_Allreduce(inbuf, outbuf, count, mdtype, op, cm);
  /* ... */
  double exe_time = MPI_Wtime()-st_time;
  /* ... */
  MPI_Type_size(mdtype, &tsize);
  double tps[] = {exe_time, 1.0, log2(np),
                  ((double)count)*tsize*std::max(.5,(double)log2(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE){
    /* ... record the observation with the matching model ... */
  }
}
void CommData::red(void * inbuf, void * outbuf, int64_t count,
                   MPI_Datatype mdtype, MPI_Op op, int root){
  /* ... */
  MPI_Type_size(mdtype, &tsize_);
  double tps_[] = {0.0, 1.0, log2(np),
                   ((double)count)*tsize_*std::max(.5,(double)log2(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE){
    /* ... predefined ops and other ops are tracked by separate models ... */
  }
  /* ... */
  double st_time = MPI_Wtime();
  MPI_Reduce(inbuf, outbuf, count, mdtype, op, root, cm);
  /* ... */
  double exe_time = MPI_Wtime()-st_time;
  /* ... */
  MPI_Type_size(mdtype, &tsize);
  double tps[] = {exe_time, 1.0, log2(np),
                  ((double)count)*tsize*std::max(.5,(double)log2(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE){
    /* ... record the observation with the matching model ... */
  }
}
void CommData::all_to_allv(void *          send_buffer,
                           int64_t const * send_counts,
                           int64_t const * send_displs,
                           /* ... (datum_size and recv_buffer are among the elided parameters) ... */
                           int64_t const * recv_counts,
                           int64_t const * recv_displs){
  /* ... */
  // model parameter: the larger of the total send and receive volumes, in bytes
  int64_t tot_sz_ = std::max(send_displs[np-1]+send_counts[np-1],
                             recv_displs[np-1]+recv_counts[np-1])*datum_size;
  double tps_[] = {0.0, 1.0, log2(np), (double)tot_sz_};
  /* ... */
  double st_time = MPI_Wtime();
  // count the partners this rank actually exchanges data with
  int num_nnz_trgt = 0;
  int num_nnz_recv = 0;
  for (int p=0; p<np; p++){
    if (send_counts[p] != 0) num_nnz_trgt++;
    if (recv_counts[p] != 0) num_nnz_recv++;
  }
  double frac_nnz = ((double)num_nnz_trgt)/np;
  double tot_frac_nnz;
  MPI_Allreduce(&frac_nnz, &tot_frac_nnz, 1, MPI_DOUBLE, MPI_SUM, cm);
  tot_frac_nnz = tot_frac_nnz / np;
  /* ... */
  int64_t max_displs = std::max(recv_displs[np-1], send_displs[np-1]);
  int64_t tot_max_displs;
  MPI_Allreduce(&max_displs, &tot_max_displs, 1, MPI_INT64_T, MPI_MAX, cm);
  // fall back to point-to-point messages when displacements overflow 32 bits,
  // the datum size has no matching built-in MPI type, or the exchange is sparse
  if (tot_max_displs >= INT32_MAX ||
      (datum_size != 4 && datum_size != 8 && datum_size != 16) ||
      (tot_frac_nnz <= .25 && tot_frac_nnz*np < 100)){
    MPI_Datatype mdt;
    MPI_Type_contiguous(datum_size, MPI_CHAR, &mdt);
    MPI_Type_commit(&mdt);
    MPI_Request reqs[num_nnz_recv+num_nnz_trgt];
    MPI_Status stat[num_nnz_recv+num_nnz_trgt];
    /* ... */
    for (int p=0; p<np; p++){
      if (recv_counts[p] != 0){
        MPI_Irecv(((char*)recv_buffer)+recv_displs[p]*datum_size,
                  /* ... */
                  mdt, p, p, cm, reqs+nnr);
        /* ... */
      }
    }
    for (int lp=0; lp<np; lp++){
      int p = (lp+rank)%np;   // stagger send targets across ranks
      if (send_counts[p] != 0){
        MPI_Isend(((char*)send_buffer)+send_displs[p]*datum_size,
                  /* ... */
                  mdt, p, rank, cm, reqs+nnr+nns);
        /* ... */
      }
    }
    /* ... */
    MPI_Waitall(num_nnz_recv+num_nnz_trgt, reqs, stat);
    /* ... */
  } else {
    // counts and displacements fit in 32 bits: use MPI_Alltoallv directly
    int * i32_send_counts, * i32_send_displs;
    int * i32_recv_counts, * i32_recv_displs;
    /* ... */
    for (int p=0; p<np; p++){
      i32_send_counts[p] = send_counts[p];
      i32_send_displs[p] = send_displs[p];
      i32_recv_counts[p] = recv_counts[p];
      i32_recv_displs[p] = recv_displs[p];
    }
    /* ... select the built-in datatype matching datum_size (4, 8, or 16 bytes) ... */
    MPI_Alltoallv(send_buffer, i32_send_counts, i32_send_displs, MPI_FLOAT,
                  recv_buffer, i32_recv_counts, i32_recv_displs, MPI_FLOAT, cm);
    /* ... */
    MPI_Alltoallv(send_buffer, i32_send_counts, i32_send_displs, MPI_DOUBLE,
                  recv_buffer, i32_recv_counts, i32_recv_displs, MPI_DOUBLE, cm);
    /* ... */
    MPI_Alltoallv(send_buffer, i32_send_counts, i32_send_displs, MPI_CXX_DOUBLE_COMPLEX,
                  recv_buffer, i32_recv_counts, i32_recv_displs, MPI_CXX_DOUBLE_COMPLEX, cm);
    /* ... */
  }
  /* ... */
  double exe_time = MPI_Wtime()-st_time;
  /* ... */
  int64_t tot_sz = std::max(send_displs[np-1]+send_counts[np-1],
                            recv_displs[np-1]+recv_counts[np-1])*datum_size;
  double tps[] = {exe_time, 1.0, log2(np), (double)tot_sz};
  /* ... */
}
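// A self-contained sketch of the point-to-point fallback used above: when the
// counts/displacements do not fit the int arguments of MPI_Alltoallv, wrap
// each datum in a contiguous type and post one Irecv/Isend per nonzero
// partner. The helper name alltoallv64 is illustrative, and per-partner
// counts are assumed to still fit in an int.
#include <mpi.h>
#include <cstdint>
#include <vector>

static void alltoallv64(void * sbuf, int64_t const * scnt, int64_t const * sdsp,
                        void * rbuf, int64_t const * rcnt, int64_t const * rdsp,
                        int64_t datum_size, MPI_Comm cm){
  int rank, np;
  MPI_Comm_rank(cm, &rank);
  MPI_Comm_size(cm, &np);

  MPI_Datatype mdt;
  MPI_Type_contiguous((int)datum_size, MPI_CHAR, &mdt);
  MPI_Type_commit(&mdt);

  std::vector<MPI_Request> reqs;
  for (int p = 0; p < np; p++){
    if (rcnt[p] != 0){
      MPI_Request r;
      MPI_Irecv((char*)rbuf + rdsp[p]*datum_size, (int)rcnt[p], mdt,
                p, p, cm, &r);                    // expect tag == sender's rank
      reqs.push_back(r);
    }
  }
  for (int lp = 0; lp < np; lp++){
    int p = (lp + rank) % np;                     // stagger targets across ranks
    if (scnt[p] != 0){
      MPI_Request r;
      MPI_Isend((char*)sbuf + sdsp[p]*datum_size, (int)scnt[p], mdt,
                p, rank, cm, &r);                 // tag with own rank
      reqs.push_back(r);
    }
  }
  MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
  MPI_Type_free(&mdt);
}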
// decompose a global (column-major) index into per-dimension indices
  for (i=0; i<order; i++){
    idx_arr[i] = cidx%lens[i];
    /* ... cidx is then divided by lens[i] before the next dimension ... */
  }

// overload that allocates the output index array and delegates to the above
  /* ... */
  cvrt_idx(order, lens, idx, *idx_arr);

// recombine per-dimension indices into a single global index
  for (i=0; i<order; i++){
    (*idx) += idx_arr[i]*lda;
    /* ... lda is then multiplied by lens[i] before the next dimension ... */
  }
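// A self-contained sketch of the same conversion (column-major order, as the
// modulo/stride pattern above suggests), with a round-trip check:
#include <cstdint>
#include <cstdio>

// split a flat column-major index into per-dimension indices
static void to_multi_idx(int order, int const * lens, int64_t idx, int * idx_arr){
  int64_t cidx = idx;
  for (int i = 0; i < order; i++){
    idx_arr[i] = (int)(cidx % lens[i]);
    cidx /= lens[i];
  }
}

// recombine per-dimension indices into the flat column-major index
static int64_t to_flat_idx(int order, int const * lens, int const * idx_arr){
  int64_t idx = 0, lda = 1;
  for (int i = 0; i < order; i++){
    idx += (int64_t)idx_arr[i]*lda;
    lda *= lens[i];
  }
  return idx;
}

int main(){
  int lens[3] = {4, 5, 6};
  int idx_arr[3];
  to_multi_idx(3, lens, 57, idx_arr);
  std::printf("57 -> (%d,%d,%d) -> %lld\n", idx_arr[0], idx_arr[1], idx_arr[2],
              (long long)to_flat_idx(3, lens, idx_arr));   // round-trips to 57
  return 0;
}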
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype & dt){
  /* ... for sizes without a built-in MPI type, build a contiguous byte type ... */
    MPI_Type_contiguous(datum_size, MPI_CHAR, &dt);
    MPI_Type_commit(&dt);
  /* ... */
}
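// A self-contained sketch of the same idea: describe an arbitrary datum size
// as a contiguous MPI type so `count` such datums can be passed to a collective
// with an int count. The helper name make_datum_type and the 24-byte datum are
// illustrative; the bound check mirrors the documented 32-bit restriction.
#include <mpi.h>
#include <climits>
#include <cstdint>

static bool make_datum_type(int64_t count, int64_t datum_size, MPI_Datatype & dt){
  if (count > INT_MAX || datum_size > INT_MAX) return false;  // would overflow int
  MPI_Type_contiguous((int)datum_size, MPI_CHAR, &dt);
  MPI_Type_commit(&dt);
  return true;
}

int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  MPI_Datatype dt;
  if (make_datum_type(1000, 24, dt)){            // e.g. a 24-byte record
    char buf[24*1000] = {0};
    MPI_Bcast(buf, 1000, dt, 0, MPI_COMM_WORLD);
    MPI_Type_free(&dt);
  }
  MPI_Finalize();
  return 0;
}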
template int conv_idx< int >(int, int const *, int **, int, int const *, int **, int, int const *, int **)
double allred_mdl_cst_init[]
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype &dt)
gives a datatype for an arbitrary datum_size; errors if the 32-bit limit is exceeded
double est_time(double const *param)
estimates model time based on observations
void observe(double const *time_param)
records an observation consisting of the execution time and nparam parameter values
double alltoallv_mdl_init[]
double red_mdl_cst_init[]
double get_rand48()
returns a new random number in [0,1)
void * alloc(int64_t len)
alloc abstraction
LinModel< 3 > bcast_mdl(bcast_mdl_init,"bcast_mdl")
void init_rng(int rank)
initializes the random number generator
LinModel< 3 > alltoallv_mdl(alltoallv_mdl_init,"alltoallv_mdl")
int conv_idx(int order_A, type const *cidx_A, int **iidx_A, int order_B, type const *cidx_B, int **iidx_B, int order_C, type const *cidx_C, int **iidx_C)
void flops_add(int64_t n)
template int conv_idx< char >(int, char const *, int **, int, char const *, int **, int, char const *, int **)
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction
LinModel< 3 > allred_mdl_cst(allred_mdl_cst_init,"allred_mdl_cst")
double alltoall_mdl_init[]
Linear performance model which, given measurements, provides a new model guess.
LinModel< 3 > red_mdl(red_mdl_init,"red_mdl")
LinModel< 3 > alltoall_mdl(alltoall_mdl_init,"alltoall_mdl")
bool should_observe(double const *time_param)
decides whether the current instance should be observed
std::mersenne_twister_engine< std::uint_fast64_t, 64, 312, 156, 31, 0xb5026f5aa96619e9, 29, 0x5555555555555555, 17, 0x71d67fffeda60000, 37, 0xfff7eee000000000, 43, 6364136223846793005 > rng
LinModel< 3 > allred_mdl(allred_mdl_init,"allred_mdl")
void cvrt_idx(int order, int const *lens, int const *idx_arr, int64_t *idx)
int cdealloc(void *ptr)
free abstraction
LinModel< 3 > red_mdl_cst(red_mdl_cst_init,"red_mdl_cst")
MPI_Datatype MPI_CTF_DOUBLE_COMPLEX