#include "../shared/util.h"

                        char **              tsr_cyclic_data,
    int old_num_virt, new_num_virt, numPes;
    int64_t new_nval, swp_nval;
    int * virt_phase_rank, * old_virt_phase_rank, * sub_edge_len;
    char * pairs, * tsr_new_data;
    numPes = ord_glb_comm.np;
    alloc_ptr(old_dist.order*sizeof(int), (void**)&old_virt_phase_rank);
    idx_lyr = ord_glb_comm.rank;
    for (int i=0; i<old_dist.order; i++){
      old_num_virt = old_num_virt*old_dist.virt_phase[i];
      new_num_virt = new_num_virt*new_dist.virt_phase[i];
      virt_phase_rank[i] = new_dist.perank[i];
      old_virt_phase_rank[i] = old_dist.perank[i];
                   old_dist.virt_phase, old_virt_phase_rank, &new_nval, tsr_data,
    for (int i=0; i<old_dist.order; i++){
    if (ord_glb_comm.rank == 0){
      DPRINTF(1,"Tensor now has virtualization factor of %d\n",new_num_virt);
    if (ord_glb_comm.rank == 0){
      DPRINTF(1,"Tensor is of size %ld, has factor of %lf growth due to padding\n",
            ord_glb_comm.np*(swp_nval/(double)old_dist.size));
    if (sr->addid() != NULL)
      sr->set(tsr_new_data, sr->addid(), swp_nval);
    *tsr_cyclic_data = tsr_new_data;
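The excerpt above follows the key-value strategy of padded_reshuffle: read the local elements as (global index, value) pairs under the old distribution, allocate a new buffer of the padded size, fill it with the additive identity, and write the pairs into the new layout. Below is a minimal standalone sketch of that idea on a 1-D array, simulating p "processors" in a single process; the names and layouts are illustrative assumptions, not CTF's API.

#include <cstdio>
#include <vector>
#include <utility>

int main(){
  const int p = 4, n = 10;                 // "processors", global length
  std::vector<double> global(n);
  for (int i = 0; i < n; i++) global[i] = 1.0 + i;

  // step 1: read local (index, value) pairs under the old blocked layout,
  // padded so every processor owns blk slots
  int blk = (n + p - 1) / p;
  std::vector<std::vector<std::pair<int,double>>> pairs(p);
  for (int r = 0; r < p; r++)
    for (int j = r*blk; j < (r+1)*blk && j < n; j++)
      pairs[r].push_back({j, global[j]});

  // step 2: allocate the new (cyclic, padded) buffers and set them to the
  // additive identity, here 0.0
  std::vector<std::vector<double>> newbuf(p, std::vector<double>(blk, 0.0));

  // step 3: write each pair into its new home: index i lives on processor
  // i%p at local offset i/p, so padding slots keep the identity value
  for (int r = 0; r < p; r++)
    for (auto &kv : pairs[r])
      newbuf[kv.first % p][kv.first / p] = kv.second;

  for (int r = 0; r < p; r++){
    std::printf("proc %d:", r);
    for (double v : newbuf[r]) std::printf(" %g", v);
    std::printf("\n");
  }
  return 0;
}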
                                int const *          old_phys_edge_len,
                                int const *          old_virt_lda,
                                int const *          old_offsets,
                                int * const *        old_permutation,
                                int const *          new_phys_edge_len,
                                int const *          new_virt_lda,
                                int const *          old_virt_edge_len){
    int **bucket_offset; alloc_ptr(sizeof(int*)*old_dist.order, (void**)&bucket_offset);
      alloc_ptr(sizeof(int)*old_phys_edge_len[dim], (void**)&bucket_offset[dim]);
      for (int vidx = 0; vidx < old_virt_edge_len[dim]; vidx++){
          if (_gidx > len[dim] || (old_offsets != NULL && _gidx < old_offsets[dim])){
            if (old_permutation == NULL || old_permutation[dim] == NULL){
              gidx = old_permutation[dim][_gidx];
              bucket_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
              bucket_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
            bucket_offset[dim][pidx] = -1;
    return bucket_offset;
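The table returned here stores, for every dimension and every local index along it, the contribution of that index to the destination bucket (owning rank times the processor-grid stride pe_lda). Summing the per-dimension lookups of a multi-index gives the target processor. A minimal sketch of that decomposition for a 2-D cyclic layout follows; the names pe_grid and pe_lda and the cyclic owner rule are assumptions for illustration.

#include <cstdio>
#include <vector>

int main(){
  const int order = 2;
  int len[order]     = {6, 4};     // global edge lengths
  int pe_grid[order] = {3, 2};     // 3x2 processor grid, 6 ranks
  int pe_lda[order]  = {1, 3};     // rank = r0*1 + r1*3

  // offset[dim][i] = (i % pe_grid[dim]) * pe_lda[dim], i.e. the owning
  // coordinate along dim scaled by the grid stride
  std::vector<std::vector<int>> offset(order);
  for (int d = 0; d < order; d++){
    offset[d].resize(len[d]);
    for (int i = 0; i < len[d]; i++)
      offset[d][i] = (i % pe_grid[d]) * pe_lda[d];
  }

  // destination rank of element (i0,i1) is the sum of the two lookups
  for (int i1 = 0; i1 < len[1]; i1++){
    for (int i0 = 0; i0 < len[0]; i0++)
      std::printf("%d ", offset[0][i0] + offset[1][i1]);
    std::printf("\n");
  }
  return 0;
}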
                       int const *          old_virt_edge_len,
                       int const *          new_virt_lda,
                       int64_t *            send_counts,
                       int64_t *            recv_counts,
                       int64_t *            send_displs,
                       int64_t *            recv_displs,
                       int * const *        bucket_offset){
    int64_t *  all_virt_counts;
    int max_ntd = omp_get_max_threads();
    max_ntd = MAX(1,MIN(max_ntd,vbs/np));
    mst_alloc_ptr(np*sizeof(int64_t)*max_ntd, (void**)&all_virt_counts);
      if (old_dist.order == 0){
        memset(all_virt_counts, 0, np*sizeof(int64_t));
        all_virt_counts[0]++;
  #pragma omp parallel num_threads(max_ntd)
        int imax, act_max, skip;
        int start_ldim, end_ldim;
        int64_t *  virt_counts;
        int * old_virt_idx, * virt_rank;
        ntd = omp_get_num_threads();
        tid = omp_get_thread_num();
        virt_counts = all_virt_counts+np*tid;
        start_ldim = (last_len/ntd)*tid;
        start_ldim += MIN(tid,last_len%ntd);
        end_ldim = (last_len/ntd)*(tid+1);
        end_ldim += MIN(tid+1,last_len%ntd);
        int imax, act_max, skip;
        int start_ldim, end_ldim, i_st, vc, dim;
        int64_t *  virt_counts;
        int64_t *  old_virt_idx, * virt_rank;
        virt_counts = all_virt_counts;
        memset(virt_counts, 0, np*sizeof(int64_t));
        memset(old_virt_idx, 0, old_dist.order*sizeof(int64_t));
        for (int i=0; i<old_dist.order; i++){
          virt_rank[i] = old_dist.perank[i];
          memset(idx, 0, old_dist.order*sizeof(int64_t));
          memset(idx_offs, 0, old_dist.order*sizeof(int64_t));
          idx[old_dist.order-1] = MAX(idx[old_dist.order-1],start_ldim);
          for (dim=0; dim<old_dist.order; dim++) {
              if (sym[dim] != SY && virt_rank[dim] < virt_rank[dim+1])
              if (sym[dim] == SY && virt_rank[dim] <= virt_rank[dim+1])
            if (sym[dim] != NS && idx[dim] >= idx[dim+1]-spad[dim]){
              idx[dim+1] = idx[dim]+spad[dim];
              if (dim == old_dist.order - 1)
                imax = MIN(imax, end_ldim);
              if (idx[dim] >= imax)
                idx_offset += idx_offs[dim];
                imax = MIN(imax,idx[1]+1-spad[0]);
              if (old_dist.order == 1){
                imax = MIN(imax, end_ldim);
              for (int i=i_st; i<imax; i++){
                vc = bucket_offset[0][old_virt_idx[0]+i*old_dist.virt_phase[0]];
                virt_counts[idx_offset+vc]++;
              for (dim=1; dim < old_dist.order; dim++){
                if (dim == old_dist.order - 1)
                  act_max = MIN(act_max, end_ldim);
                  act_max = MIN(act_max,idx[dim+1]+1-spad[dim]);
                if (idx[dim] >= act_max){
                  if (sym[dim-1] != NS) idx[dim] = idx[dim-1]+spad[dim-1];
                idx_offset -= idx_offs[dim];
                idx_offset += idx_offs[dim];
              if (dim == old_dist.order) break;
          for (dim=0; dim<old_dist.order; dim++){
            if (old_virt_idx[dim] >= old_dist.virt_phase[dim])
              old_virt_idx[dim] = 0;
            if (old_virt_idx[dim] > 0)
          if (dim == old_dist.order) break;
        for (int j=1; j<act_omp_ntd; j++){
          for (int64_t i=0; i<np; i++){
            all_virt_counts[i] += all_virt_counts[i+np*j];
      memset(all_virt_counts, 0, sizeof(int64_t)*np);
    memcpy(send_counts, all_virt_counts, np*sizeof(int64_t));
    MPI_Alltoall(send_counts, 1, MPI_INT64_T,
                 recv_counts, 1, MPI_INT64_T, ord_glb_comm.cm);
    for (int i=1; i<np; i++){
      send_displs[i] = send_displs[i-1] + send_counts[i-1];
      recv_displs[i] = recv_displs[i-1] + recv_counts[i-1];
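After the per-thread counting loops, the communication pattern is simple: exchange the per-destination counts with MPI_Alltoall and turn both count arrays into displacements by an exclusive prefix sum, which is what MPI_Alltoallv-style redistribution needs. Below is a minimal, self-contained sketch of just that pattern; the way elements are assigned to destinations here is an arbitrary placeholder.

#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main(int argc, char **argv){
  MPI_Init(&argc, &argv);
  int np, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &np);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // pretend this rank sends (rank + dest) elements to each destination
  std::vector<int64_t> send_counts(np), recv_counts(np);
  for (int dest = 0; dest < np; dest++) send_counts[dest] = rank + dest;

  // every rank tells every other rank how much it will send to it
  MPI_Alltoall(send_counts.data(), 1, MPI_INT64_T,
               recv_counts.data(), 1, MPI_INT64_T, MPI_COMM_WORLD);

  // displacements are exclusive prefix sums of the counts
  std::vector<int64_t> send_displs(np, 0), recv_displs(np, 0);
  for (int i = 1; i < np; i++){
    send_displs[i] = send_displs[i-1] + send_counts[i-1];
    recv_displs[i] = recv_displs[i-1] + recv_counts[i-1];
  }

  std::printf("rank %d receives %lld elements in total\n",
              rank, (long long)(recv_displs[np-1] + recv_counts[np-1]));
  MPI_Finalize();
  return 0;
}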
    double ps[] = {(double)nv0+nv1, (double)tot_sz};
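This line packs the two model parameters (number of virtual blocks sent plus received, and total data volume) into the vector that the two-parameter linear model consumes. A minimal sketch of how such an estimate is formed follows; the coefficient values are made-up placeholders, not fitted CTF model parameters.

#include <cstdint>
#include <cstdio>

// hypothetical per-message latency (s) and per-byte cost (s)
static const double coeff[2] = {1.0e-5, 2.0e-10};

double blres_est_time_sketch(int64_t tot_sz, int nv0, int nv1){
  double ps[] = {(double)nv0 + nv1, (double)tot_sz};
  // estimate = dot product of learned coefficients with the parameter vector
  return coeff[0]*ps[0] + coeff[1]*ps[1];
}

int main(){
  // e.g. 8 old and 8 new virtual blocks, 1e8 bytes moved
  std::printf("estimated reshuffle time: %e s\n",
              blres_est_time_sketch(100000000, 8, 8));
  return 0;
}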
                       char *&              tsr_cyclic_data,
    int i, idx_lyr_new, idx_lyr_old, rem_idx, prc_idx, loc_idx;
    int num_old_virt, num_new_virt;
    int * idx, * old_loc_lda, * new_loc_lda, * phase_lda;
    int * phase = old_dist.phase;
    int order = old_dist.order;
      tsr_cyclic_data = sr->alloc(new_dist.size);
      if (glb_comm.rank == 0){
        sr->copy(tsr_cyclic_data,  tsr_data);
    MPI_Barrier(glb_comm.cm);
    double st_time = MPI_Wtime();
    tsr_cyclic_data = sr->alloc(new_dist.size);
    alloc_ptr(sizeof(int)*order, (void**)&idx);
    alloc_ptr(sizeof(int)*order, (void**)&old_loc_lda);
    alloc_ptr(sizeof(int)*order, (void**)&new_loc_lda);
    alloc_ptr(sizeof(int)*order, (void**)&phase_lda);
    blk_sz = old_dist.size;
    idx_lyr_old = glb_comm.rank;
    idx_lyr_new = glb_comm.rank;
    for (i=0; i<order; i++){
        old_loc_lda[i] = old_loc_lda[i-1]*old_dist.virt_phase[i-1];
        new_loc_lda[i] = new_loc_lda[i-1]*new_dist.virt_phase[i-1];
        phase_lda[i] = phase_lda[i-1]*phase[i-1];
    double * tps = (double*)malloc(3*sizeof(double));
    tps[1] = (double)num_old_virt+num_new_virt;
    tps[2] = (double)std::max(new_dist.size, new_dist.size);
    alloc_ptr(sizeof(MPI_Request)*(num_old_virt+num_new_virt), (void**)&reqs);
    if (idx_lyr_new == 0){
      memset(idx, 0, sizeof(int)*order);
        for (i=0; i<order; i++){
          loc_idx += idx[i]*new_loc_lda[i];
        DPRINTF(3,"proc %d receiving blk %d (loc %d, size %ld) from proc %d\n",
                glb_comm.rank, rem_idx, loc_idx, blk_sz, prc_idx);
        MPI_Irecv(tsr_cyclic_data+sr->el_size*loc_idx*blk_sz, blk_sz,
                  sr->mdtype(), prc_idx, rem_idx, glb_comm.cm, reqs+loc_idx);
        for (i=0; i<order; i++){
    if (idx_lyr_old == 0){
      memset(idx, 0, sizeof(int)*order);
        for (i=0; i<order; i++){
          loc_idx += idx[i]*old_loc_lda[i];
        DPRINTF(3,"proc %d sending blk %d (loc %d size %ld) to proc %d el_size = %d %p %p\n",
                glb_comm.rank, loc_idx, loc_idx, blk_sz, prc_idx, sr->el_size, tsr_data, reqs+num_new_virt+loc_idx);
        MPI_Isend(tsr_data+sr->el_size*loc_idx*blk_sz, blk_sz,
                  sr->mdtype(), prc_idx, loc_idx, glb_comm.cm, reqs+num_new_virt+loc_idx);
        for (i=0; i<order; i++){
    if (idx_lyr_new == 0 && idx_lyr_old == 0){
      MPI_Waitall(num_new_virt+num_old_virt, reqs, MPI_STATUSES_IGNORE);
    } else if (idx_lyr_new == 0){
      MPI_Waitall(num_new_virt, reqs, MPI_STATUSES_IGNORE);
    } else if (idx_lyr_old == 0){
      MPI_Waitall(num_old_virt, reqs+num_new_virt, MPI_STATUSES_IGNORE);
      if (sr->addid() != NULL)
      if (sr->addid() != NULL)
    MPI_Barrier(glb_comm.cm);
    double exe_time = MPI_Wtime()-st_time;
    tps = (double*)malloc(3*sizeof(double));
    tps[1] = (double)num_old_virt+num_new_virt;
    tps[2] = (double)std::max(new_dist.size, new_dist.size);
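The communication skeleton of block_reshuffle is: post one MPI_Irecv per incoming block, one MPI_Isend per outgoing block, and wait on all requests at once. The sketch below shows only that nonblocking pattern on a ring, trading one fixed-size block per rank; in the real code the block-to-rank mapping is derived from the old and new distributions.

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv){
  MPI_Init(&argc, &argv);
  int np, rank;
  MPI_Comm_size(MPI_COMM_WORLD, &np);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const int blk_sz = 1024;
  std::vector<double> send_blk(blk_sz, (double)rank), recv_blk(blk_sz);
  int next = (rank + 1) % np, prev = (rank + np - 1) % np;

  MPI_Request reqs[2];
  // post the receive for the block destined for this rank, then the send
  MPI_Irecv(recv_blk.data(), blk_sz, MPI_DOUBLE, prev, 0, MPI_COMM_WORLD, &reqs[0]);
  MPI_Isend(send_blk.data(), blk_sz, MPI_DOUBLE, next, 0, MPI_COMM_WORLD, &reqs[1]);
  MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);

  std::printf("rank %d got a block from rank %d (first value %g)\n",
              rank, prev, recv_blk[0]);
  MPI_Finalize();
  return 0;
}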
                          int const *     old_phase,
    int can_block_resh = 1;
    for (j=0; j<order; j++){
      if (new_phase != old_phase[j]) can_block_resh = 0;
    return can_block_resh;
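The check reduces to: a block reshuffle is legal only if every dimension keeps the same total phase in the old and new mappings. A standalone sketch of that predicate follows, comparing two plain phase arrays instead of calling mapping::calc_phase() as the listing above does.

#include <cstdio>

int can_block_reshuffle_sketch(int order, int const *old_phase, int const *new_phase){
  int can_block_resh = 1;
  for (int j = 0; j < order; j++)
    if (new_phase[j] != old_phase[j]) can_block_resh = 0;   // any change forbids block moves
  return can_block_resh;
}

int main(){
  int old_phase[] = {4, 2, 2};
  int same[]      = {4, 2, 2};   // phases unchanged: block reshuffle allowed
  int changed[]   = {2, 4, 2};   // phases changed: a full reshuffle is needed
  std::printf("%d %d\n",
              can_block_reshuffle_sketch(3, old_phase, same),
              can_block_reshuffle_sketch(3, old_phase, changed));
  return 0;
}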
 
int ** compute_bucket_offsets(distribution const &old_dist, distribution const &new_dist, int const *len, int const *old_phys_edge_len, int const *old_virt_lda, int const *old_offsets, int *const *old_permutation, int const *new_phys_edge_len, int const *new_virt_lda, int forward, int old_virt_np, int new_virt_np, int const *old_virt_edge_len)
computes offsets for redistribution targets along each edge length 
double est_time(double const *param)
estimates model time based on observations 
void observe(double const *time_param)
records an observation consisting of execution time and nparam parameter values 
int calc_phase() const 
compute the phase of a mapping 
void read_loc_pairs(int order, int64_t nval, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *phase_rank, int64_t *nread, char const *data, char **pairs, algstrct const *sr)
read tensor pairs local to processor 
virtual void copy(char *a, char const *b) const 
copies element b to element a 
virtual char const * addid() const 
identity element for addition i.e. 0 
#define DEBUG_PRINTF(...)
LinModel< 2 > blres_mdl(blres_mdl_init,"blres_mdl")
virtual char * alloc(int64_t n) const 
allocate space for n items, necessary for object types 
void padded_reshuffle(int const *sym, distribution const &old_dist, distribution const &new_dist, char *tsr_data, char **tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm)
Reshuffle elements using key-value pair read/write. 
virtual void set(char *a, char const *b, int64_t n) const 
sets n elements of array a to value b 
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction 
Linear performance model which, given measurements, provides a new model guess (see the usage sketch at the end of this list).
int can_block_reshuffle(int order, int const *old_phase, mapping const *map)
determines if tensor can be permuted by block 
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
void calc_cnt_displs(int const *sym, distribution const &old_dist, distribution const &new_dist, int new_nvirt, int np, int const *old_virt_edge_len, int const *new_virt_lda, int64_t *send_counts, int64_t *recv_counts, int64_t *send_displs, int64_t *recv_displs, CommData ord_glb_comm, int idx_lyr, int *const *bucket_offset)
computes the counts and displacements of elements to be sent to and received from each processor 
bool should_observe(double const *time_param)
decides whether the current instance should be observed 
void block_reshuffle(distribution const &old_dist, distribution const &new_dist, char *tsr_data, char *&tsr_cyclic_data, algstrct const *sr, CommData glb_comm)
Reshuffle elements by block given the global phases stay the same. 
int el_size
size of each element of algstrct in bytes 
int cdealloc(void *ptr)
free abstraction 
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
double blres_est_time(int64_t tot_sz, int nv0, int nv1)
estimates execution time, given that this processor sends and receives tot_sz across np procs ...
void wr_pairs_layout(int order, int np, int64_t inwrite, char const *alpha, char const *beta, char rw, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *virt_phys_rank, int const *bucket_lda, char *wr_pairs_buf, char *rw_data, CommData glb_comm, algstrct const *sr, bool is_sparse, int64_t nnz_loc, int64_t *nnz_blk, char *&pprs_new, int64_t &nnz_loc_new)
read or write pairs from / to tensor 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout 
virtual MPI_Datatype mdtype() const 
MPI datatype.
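The est_time, observe, and should_observe entries above describe the timing-model workflow: build a parameter vector, ask the model for a prediction, measure the operation, and feed the measurement back. The following self-contained sketch uses a toy stand-in model rather than CTF's LinModel<2>; the time_param layout {measured_time, param0, param1} mirrors how tps[] is filled in block_reshuffle, and the crude rescaling update rule is illustrative only.

#include <cstdio>

struct ToyLinModel {
  double coeff[2] = {1.0e-5, 1.0e-9};   // initial guess: latency, per-byte cost
  long   nobs     = 0;

  // predicted time for a parameter vector {nmsgs, nbytes}
  double est_time(double const *param) const {
    return coeff[0]*param[0] + coeff[1]*param[1];
  }
  // record {measured_time, nmsgs, nbytes} and rescale the coefficients
  // toward the observed/predicted ratio (a crude running average)
  void observe(double const *time_param){
    double pred  = est_time(time_param + 1);
    double scale = (pred > 0.0) ? time_param[0] / pred : 1.0;
    coeff[0] *= (nobs + scale) / (nobs + 1);
    coeff[1] *= (nobs + scale) / (nobs + 1);
    nobs++;
  }
};

int main(){
  ToyLinModel mdl;
  double tps[3] = {0.02, 16.0, 1.0e7};   // measured 20 ms for 16 msgs / 10 MB
  std::printf("guess before: %e s\n", mdl.est_time(tps + 1));
  mdl.observe(tps);
  std::printf("guess after:  %e s\n", mdl.est_time(tps + 1));
  return 0;
}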