#include "../shared/util.h"

#define MPI_CXX_DOUBLE_COMPLEX MPI::DOUBLE_COMPLEX

std::mersenne_twister_engine<std::uint_fast64_t, 64, 312, 156, 31,
                             0xb5026f5aa96619e9, 29,
                             0x5555555555555555, 17,
                             0x71d67fffeda60000, 37,
                             0xfff7eee000000000, 43, 6364136223846793005> rng;

double get_rand48(){
  return ((double)rng()-(double)rng.min())/rng.max();
}
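// --- Example (added sketch, not part of the original source) ---
// Shows how the rank-seeded generator above is typically used: init_rng(),
// declared elsewhere in this file, reseeds rng per MPI rank, and get_rand48()
// then draws uniform samples in [0,1).
void init_rng(int rank);                 // declared in this file
static double example_draw_uniform(int mpi_rank){
  init_rng(mpi_rank);                    // give each MPI rank its own stream
  return get_rand48();                   // uniform sample in [0,1)
}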
template <typename type>
int conv_idx(int order, type const * cidx, int ** iidx){
  // ...
  for (i=0; i<order; i++){
    // ... (label cidx[i] matches an earlier label cidx[j]):
        (*iidx)[i] = (*iidx)[j];
    // ...
  }
  // ...
}
template <typename type>
int conv_idx(int order_A, type const * cidx_A, int ** iidx_A,
             int order_B, type const * cidx_B, int ** iidx_B){
  // ...
  n = conv_idx(order_A, cidx_A, iidx_A);
  for (i=0; i<order_B; i++){
    // ...
    for (j=0; j<order_A; j++){
      // ... (label i of B matches label j of A):
        (*iidx_B)[i] = (*iidx_A)[j];
      // ...
    }
    // ... (label i of B matches an earlier label j of B):
          (*iidx_B)[i] = (*iidx_B)[j];
    // ...
  }
  // ...
}
template <typename type>
int conv_idx(int order_A, type const * cidx_A, int ** iidx_A,
             int order_B, type const * cidx_B, int ** iidx_B,
             int order_C, type const * cidx_C, int ** iidx_C){
  // ...
  n = conv_idx(order_A, cidx_A, iidx_A,
               order_B, cidx_B, iidx_B);
  // ...
  for (i=0; i<order_C; i++){
    // ...
    for (j=0; j<order_B; j++){
      // ... (label i of C matches label j of B):
        (*iidx_C)[i] = (*iidx_B)[j];
      // ...
    }
    // ...
      for (j=0; j<order_A; j++){
        // ... (label i of C matches label j of A):
          (*iidx_C)[i] = (*iidx_A)[j];
        // ...
      }
      // ... (label i of C matches an earlier label j of C):
            (*iidx_C)[i] = (*iidx_C)[j];
    // ...
  }
  // ...
}
template int conv_idx<int>(int, int const *, int **,
                           int, int const *, int **);
template int conv_idx<char>(int, char const *, int **,
                            int, char const *, int **);
template int conv_idx<int>(int, int const *, int **,
                           int, int const *, int **,
                           int, int const *, int **);
template int conv_idx<char>(int, char const *, int **,
                            int, char const *, int **,
                            int, char const *, int **);
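// --- Example (added sketch, not part of the original source) ---
// Illustrates what conv_idx computes: character index labels for a contraction
// such as C["ik"] = A["ij"]*B["jk"] are mapped to small integer ids, with equal
// labels across tensors receiving the same id. The expected values below are an
// assumption based on the matching logic above; the returned arrays are
// allocated inside conv_idx and their deallocation is omitted here.
static int example_conv_idx(){
  char const cidx_A[] = {'i','j'};
  char const cidx_B[] = {'j','k'};
  char const cidx_C[] = {'i','k'};
  int *iidx_A, *iidx_B, *iidx_C;
  int n = conv_idx<char>(2, cidx_A, &iidx_A,
                         2, cidx_B, &iidx_B,
                         2, cidx_C, &iidx_C);
  // expected: n == 3 distinct labels, with e.g. iidx_A = {0,1}, iidx_B = {1,2},
  // iidx_C = {0,2}
  return n;
}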
#if (!BGP && !BGQ && !HOPPER)
  // ...
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  // ...
      size = backtrace(array, 50);
      // ...
      backtrace_symbols(array, size);
      char syscom[2048*size];
      for (i=1; i<size; ++i)
        // ...
        int sz = readlink("/proc/self/exe", buf, bufsize);
        // ...
        sprintf(buf2, "addr2line %p -e %s", array[i], buf);
      // ...
      assert(system(syscom)==0);
    // ...
    printf("%d", iiarr[0]);
CommData::CommData(){
  // ...
}

CommData::~CommData(){
  // ...
}

CommData::CommData(MPI_Comm cm_){
  // ...
  MPI_Comm_rank(cm, &rank);
  MPI_Comm_size(cm, &np);
  // ...
}

CommData::CommData(int rank_, int color_, int np_){
  // ...
}

CommData::CommData(int rank_, int color_, CommData parent){
  // ...
  MPI_Comm_split(parent.cm, color, rank_, &cm);
  MPI_Comm_size(cm, &np);
  // ...
}

void CommData::activate(MPI_Comm parent){
  // ...
    MPI_Comm_split(parent, color, rank, &cm);
    // ...
    MPI_Comm_size(cm, &np_);
  // ...
}

void CommData::deactivate(){
  // ...
      MPI_Finalized(&is_finalized);
      if (!is_finalized) MPI_Comm_free(&cm);
  // ...
}
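// --- Example (added sketch, not part of the original source) ---
// The constructors above derive sub-communicators with MPI_Comm_split, keyed by
// a color and ordered by rank. A minimal standalone illustration: split
// MPI_COMM_WORLD into row communicators of a hypothetical process grid.
static void example_comm_split(int nrows){
  int wrank, wnp;
  MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
  MPI_Comm_size(MPI_COMM_WORLD, &wnp);
  int color = wrank % nrows;          // processes with equal color share a comm
  MPI_Comm row_cm;
  MPI_Comm_split(MPI_COMM_WORLD, color, wrank, &row_cm);
  int rrank, rnp;
  MPI_Comm_rank(row_cm, &rrank);
  MPI_Comm_size(row_cm, &rnp);
  MPI_Comm_free(&row_cm);             // mirror of CommData::deactivate()
}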
double CommData::estimate_bcast_time(int64_t msg_sz){
  double ps[] = {1.0, log2((double)np), (double)msg_sz};
  // ...
}

double CommData::estimate_allred_time(int64_t msg_sz, MPI_Op op){
  double ps[] = {1.0, log2((double)np), (double)msg_sz*log2((double)(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    // ...
}

double CommData::estimate_red_time(int64_t msg_sz, MPI_Op op){
  double ps[] = {1.0, log2((double)np), (double)msg_sz*log2((double)(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    // ...
}

double CommData::estimate_alltoall_time(int64_t chunk_sz) {
  double ps[] = {1.0, log2((double)np), log2((double)np)*np*chunk_sz};
  // ...
}

double CommData::estimate_alltoallv_time(int64_t tot_sz) {
  double ps[] = {1.0, log2((double)np), log2((double)np)*tot_sz};
  // ...
}
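// --- Example (added sketch, not part of the original source) ---
// Each estimator above builds a 3-entry feature vector ps, which is presumably
// handed to a LinModel<3>'s est_time(ps). The prediction is then a dot product
// of ps with fitted coefficients; the coefficient names below are hypothetical,
// used only to spell out the arithmetic.
static double example_linear_estimate(double latency, double per_level,
                                      double per_byte, int np, int64_t msg_sz){
  double ps[] = {1.0, log2((double)np), (double)msg_sz};
  // est_time(ps) ~ latency*ps[0] + per_level*ps[1] + per_byte*ps[2]
  return latency*ps[0] + per_level*ps[1] + per_byte*ps[2];
}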
void CommData::bcast(void * buf, int64_t count, MPI_Datatype mdtype, int root){
  // ...
  MPI_Type_size(mdtype, &tsize_);
  double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_};
  // ...
  double st_time = MPI_Wtime();
  // ...
  MPI_Bcast(buf, count, mdtype, root, cm);
  // ...
  double exe_time = MPI_Wtime()-st_time;
  // ...
  MPI_Type_size(mdtype, &tsize);
  double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize};
  // ...
}
void CommData::allred(void * inbuf, void * outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op){
  // ...
  MPI_Type_size(mdtype, &tsize_);
  double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_*std::max(.5,(double)log2(np))};
  // ...
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    // ...
  double st_time = MPI_Wtime();
  // ...
  MPI_Allreduce(inbuf, outbuf, count, mdtype, op, cm);
  // ...
  double exe_time = MPI_Wtime()-st_time;
  // ...
  MPI_Type_size(mdtype, &tsize);
  double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize*std::max(.5,(double)log2(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    // ...
}
void CommData::red(void * inbuf, void * outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root){
  // ...
  MPI_Type_size(mdtype, &tsize_);
  double tps_[] = {0.0, 1.0, log2(np), ((double)count)*tsize_*std::max(.5,(double)log2(np))};
  // ...
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    // ...
  double st_time = MPI_Wtime();
  // ...
  MPI_Reduce(inbuf, outbuf, count, mdtype, op, root, cm);
  // ...
  double exe_time = MPI_Wtime()-st_time;
  // ...
  MPI_Type_size(mdtype, &tsize);
  double tps[] = {exe_time, 1.0, log2(np), ((double)count)*tsize*std::max(.5,(double)log2(np))};
  if (op >= MPI_MAX && op <= MPI_REPLACE)
    // ...
}
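// --- Example (added sketch, not part of the original source) ---
// bcast/allred/red above appear to follow the same timing-model protocol: build
// a parameter vector, decide whether to sample this call, run the collective
// under MPI_Wtime(), then record the observation. The sketch below restates
// that pattern generically; `mdl` stands in for one of the LinModel<3>
// instances declared in this file, so its calls are left commented out.
static void example_timed_collective(CommData & c, void * buf, int64_t count,
                                     MPI_Datatype dt, int root){
  int tsize;
  MPI_Type_size(dt, &tsize);
  double tps[] = {0.0, 1.0, log2((double)c.np), ((double)count)*tsize};
  // if (mdl.should_observe(tps)) {               // sample only some calls
  double st_time = MPI_Wtime();
  MPI_Bcast(buf, count, dt, root, c.cm);
  tps[0] = MPI_Wtime()-st_time;                   // measured execution time
  // mdl.observe(tps);                            // feed the linear model
  // }
}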
void CommData::all_to_allv(void *          send_buffer,
                           int64_t const * send_counts,
                           int64_t const * send_displs,
                           int64_t         datum_size,
                           void *          recv_buffer,
                           int64_t const * recv_counts,
                           int64_t const * recv_displs){
  // ...
  int64_t tot_sz_ = std::max(send_displs[np-1]+send_counts[np-1],
                             recv_displs[np-1]+recv_counts[np-1])*datum_size;
  double tps_[] = {0.0, 1.0, log2(np), (double)tot_sz_};
  // ...
  double st_time = MPI_Wtime();
  int num_nnz_trgt = 0;
  int num_nnz_recv = 0;
  for (int p=0; p<np; p++){
    if (send_counts[p] != 0) num_nnz_trgt++;
    if (recv_counts[p] != 0) num_nnz_recv++;
  }
  double frac_nnz = ((double)num_nnz_trgt)/np;
  // ...
  MPI_Allreduce(&frac_nnz, &tot_frac_nnz, 1, MPI_DOUBLE, MPI_SUM, cm);
  tot_frac_nnz = tot_frac_nnz / np;
  // ...
  int64_t max_displs = std::max(recv_displs[np-1], send_displs[np-1]);
  int64_t tot_max_displs;
  // ...
  MPI_Allreduce(&max_displs, &tot_max_displs, 1, MPI_INT64_T, MPI_MAX, cm);
  // fall back to point-to-point when displacements overflow 32 bits, the datum
  // size has no matching built-in MPI type, or the exchange is sparse
  if (tot_max_displs >= INT32_MAX ||
      (datum_size != 4 && datum_size != 8 && datum_size != 16) ||
      (tot_frac_nnz <= .25 && tot_frac_nnz*np < 100)){
    MPI_Datatype mdt;
    MPI_Type_contiguous(datum_size, MPI_CHAR, &mdt);
    MPI_Type_commit(&mdt);
    MPI_Request reqs[num_nnz_recv+num_nnz_trgt];
    MPI_Status stat[num_nnz_recv+num_nnz_trgt];
    // ...
    for (int p=0; p<np; p++){
      if (recv_counts[p] != 0){
        MPI_Irecv(((char*)recv_buffer)+recv_displs[p]*datum_size,
                  // ...
                  mdt, p, p, cm, reqs+nnr);
        // ...
      }
    }
    // ...
    for (int lp=0; lp<np; lp++){
      int p = (lp+rank)%np;
      if (send_counts[p] != 0){
        MPI_Isend(((char*)send_buffer)+send_displs[p]*datum_size,
                  // ...
                  mdt, p, rank, cm, reqs+nnr+nns);
        // ...
      }
    }
    // ...
    MPI_Waitall(num_nnz_recv+num_nnz_trgt, reqs, stat);
    // ...
  } else {
    int * i32_send_counts, * i32_send_displs;
    int * i32_recv_counts, * i32_recv_displs;
    // ...
    for (int p=0; p<np; p++){
      i32_send_counts[p] = send_counts[p];
      i32_send_displs[p] = send_displs[p];
      i32_recv_counts[p] = recv_counts[p];
      i32_recv_displs[p] = recv_displs[p];
    }
    // ... (datum_size == 4):
      MPI_Alltoallv(send_buffer, i32_send_counts, i32_send_displs, MPI_FLOAT,
                    recv_buffer, i32_recv_counts, i32_recv_displs, MPI_FLOAT, cm);
    // ... (datum_size == 8):
      MPI_Alltoallv(send_buffer, i32_send_counts, i32_send_displs, MPI_DOUBLE,
                    recv_buffer, i32_recv_counts, i32_recv_displs, MPI_DOUBLE, cm);
    // ... (datum_size == 16):
      MPI_Alltoallv(send_buffer, i32_send_counts, i32_send_displs, MPI_CXX_DOUBLE_COMPLEX,
                    recv_buffer, i32_recv_counts, i32_recv_displs, MPI_CXX_DOUBLE_COMPLEX, cm);
    // ...
  }
  // ...
  double exe_time = MPI_Wtime()-st_time;
  int64_t tot_sz = std::max(send_displs[np-1]+send_counts[np-1],
                            recv_displs[np-1]+recv_counts[np-1])*datum_size;
  double tps[] = {exe_time, 1.0, log2(np), (double)tot_sz};
  // ...
}
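// --- Example (added sketch, not part of the original source) ---
// The dense branch above must downcast 64-bit counts and displacements to int
// because MPI_Alltoallv takes int arrays. A standalone illustration for
// double-typed data, assuming every count and displacement already fits in
// 32 bits:
#include <vector>
static void example_dense_alltoallv(double * sendbuf, int64_t const * scnt64,
                                    int64_t const * sdsp64, double * recvbuf,
                                    int64_t const * rcnt64, int64_t const * rdsp64,
                                    MPI_Comm comm){
  int np;
  MPI_Comm_size(comm, &np);
  std::vector<int> scnt(np), sdsp(np), rcnt(np), rdsp(np);
  for (int p=0; p<np; p++){
    scnt[p] = (int)scnt64[p]; sdsp[p] = (int)sdsp64[p];
    rcnt[p] = (int)rcnt64[p]; rdsp[p] = (int)rdsp64[p];
  }
  MPI_Alltoallv(sendbuf, scnt.data(), sdsp.data(), MPI_DOUBLE,
                recvbuf, rcnt.data(), rdsp.data(), MPI_DOUBLE, comm);
}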
void cvrt_idx(int order, int const * lens, int64_t idx, int * idx_arr){
  // ...
  for (i=0; i<order; i++){
    idx_arr[i] = cidx%lens[i];
    // ...
  }
}

void cvrt_idx(int order, int const * lens, int64_t idx, int ** idx_arr){
  // ...
  cvrt_idx(order, lens, idx, *idx_arr);
}

void cvrt_idx(int order, int const * lens, int const * idx_arr, int64_t * idx){
  // ...
  for (i=0; i<order; i++){
    (*idx) += idx_arr[i]*lda;
    // ...
  }
}
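// --- Example (added sketch, not part of the original source) ---
// The cvrt_idx overloads above convert between a linear (global) index and a
// mixed-radix multi-index with per-dimension lengths lens. The round trip looks
// like this; with lens = {4,5,3}, linear index 37 maps to {1,4,1} because
// 37 = 1 + 4*4 + 1*4*5.
static void example_cvrt_idx(){
  int lens[3] = {4, 5, 3};
  int idx_arr[3];
  cvrt_idx(3, lens, (int64_t)37, idx_arr);   // decompose: idx_arr = {1,4,1}
  int64_t back;
  cvrt_idx(3, lens, idx_arr, &back);         // recompose: back = 37
}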
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype & dt){
  // ...
      MPI_Type_contiguous(datum_size, MPI_CHAR, &dt);
      MPI_Type_commit(&dt);
  // ...
}
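// --- Example (added sketch, not part of the original source) ---
// A caller-side sketch for get_mpi_dt: request an MPI datatype describing one
// datum of arbitrary byte size and check whether the (count, size) combination
// was representable. Whether the returned type must later be freed depends on
// whether a new type was committed, so that step is omitted here.
static bool example_get_mpi_dt(int64_t count, int64_t datum_size){
  MPI_Datatype dt;
  bool ok = get_mpi_dt(count, datum_size, dt);
  // if ok, dt can be used as the element type in MPI calls such as MPI_Bcast
  return ok;
}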
 
template int conv_idx<int>(int, int const *, int **, int, int const *, int **, int, int const *, int **)
double allred_mdl_cst_init[]
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype &dt)
    gives a datatype for an arbitrary datum_size; errors if it exceeds 32 bits
double est_time(double const *param)
    estimates model time based on observations
void observe(double const *time_param)
    records an observation consisting of the execution time and nparam parameter values
double alltoallv_mdl_init[]
double red_mdl_cst_init[]
double get_rand48()
    returns a new random number in [0,1)
void * alloc(int64_t len)
    alloc abstraction
LinModel<3> bcast_mdl(bcast_mdl_init, "bcast_mdl")
void init_rng(int rank)
    initializes the random number generator
LinModel<3> alltoallv_mdl(alltoallv_mdl_init, "alltoallv_mdl")
int conv_idx(int order_A, type const *cidx_A, int **iidx_A, int order_B, type const *cidx_B, int **iidx_B, int order_C, type const *cidx_C, int **iidx_C)
void flops_add(int64_t n)
template int conv_idx<char>(int, char const *, int **, int, char const *, int **, int, char const *, int **)
int mst_alloc_ptr(int64_t len, void **const ptr)
    mst_alloc abstraction
LinModel<3> allred_mdl_cst(allred_mdl_cst_init, "allred_mdl_cst")
double alltoall_mdl_init[]
    linear performance models, which, given measurements, provide a new model guess
LinModel<3> red_mdl(red_mdl_init, "red_mdl")
LinModel<3> alltoall_mdl(alltoall_mdl_init, "alltoall_mdl")
bool should_observe(double const *time_param)
    decides whether the current instance should be observed
std::mersenne_twister_engine<std::uint_fast64_t, 64, 312, 156, 31, 0xb5026f5aa96619e9, 29, 0x5555555555555555, 17, 0x71d67fffeda60000, 37, 0xfff7eee000000000, 43, 6364136223846793005> rng
LinModel<3> allred_mdl(allred_mdl_init, "allred_mdl")
void cvrt_idx(int order, int const *lens, int const *idx_arr, int64_t *idx)
int cdealloc(void *ptr)
    free abstraction
LinModel<3> red_mdl_cst(red_mdl_cst_init, "red_mdl_cst")
MPI_Datatype MPI_CTF_DOUBLE_COMPLEX
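// --- Example (added sketch, not part of the original source) ---
// A minimal picture of what a 3-parameter linear performance model with the
// interface described above (est_time / observe / should_observe) could look
// like. This is an illustrative stand-in, not the library's LinModel<3>.
struct ExampleLinModel3 {
  double coeff[3];                       // e.g. {latency, per-log2(p), per-byte}
  double est_time(double const * param) const {
    // param = {1.0, log2(np), size-like term}, as built by the estimators above
    return coeff[0]*param[0] + coeff[1]*param[1] + coeff[2]*param[2];
  }
  bool should_observe(double const * time_param) const {
    (void)time_param;                    // a real model would subsample here
    return true;
  }
  void observe(double const * time_param) {
    // time_param[0] holds the measured time; a real model would refit coeff
    // from accumulated (time, parameter) observations, e.g. by least squares
    (void)time_param;
  }
};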