#include "../shared/util.h"
                   int const *          old_phys_dim,
                   int const *          old_phys_edge_len,
                   int const *          old_virt_edge_len,
                   int64_t              old_virt_nelem,
                   int const *          old_offsets,
                   int * const *        old_permutation,
                   int const *          new_phys_dim,
                   int const *          new_phys_edge_len,
                   int const *          new_virt_edge_len,
                   int64_t              new_virt_nelem,
                   int * const *        bucket_offset,
    if (old_dist.order == 0){
        sr->copy(new_data[0], old_data);
          sr->copy(old_data, new_data[0]);
          sr->acc(old_data, beta, new_data[0], alpha);
    int nbucket = total_np;
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    int max_ntd = omp_get_max_threads();
    max_ntd = MAX(1,MIN(max_ntd,new_virt_nelem/nbucket));
    int64_t old_size, new_size;
    int64_t * bucket_store;
    int64_t * count_store;
    int64_t * thread_store;
    mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&bucket_store);
    mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&thread_store);
    std::fill(bucket_store, bucket_store+MAX(old_size,new_size), -1);
    int64_t ** par_virt_counts;
    alloc_ptr(sizeof(int64_t*)*max_ntd, (void**)&par_virt_counts);
    for (int t=0; t<max_ntd; t++){
      mst_alloc_ptr(sizeof(int64_t)*nbucket, (void**)&par_virt_counts[t]);
      std::fill(par_virt_counts[t], par_virt_counts[t]+nbucket, 0);
    #pragma omp parallel num_threads(max_ntd)
    if (old_offsets == NULL)
    int tid = omp_get_thread_num();
    int ntd = omp_get_num_threads();
    int gidx_st[old_dist.order];
    int gidx_end[old_dist.order];
    if (old_dist.order > 1){
      int64_t chnk = all_size/ntd;
      int64_t glb_idx_st = chnk*tid + MIN(tid,all_size%ntd);
      int64_t glb_idx_end = glb_idx_st+chnk+(tid<(all_size%ntd));
        if (gidx_end[old_dist.order-1] != len[old_dist.order-1]){
            printf("glb_idx_end = %ld, gidx_end[%d]= %d, len[%d] = %d\n",
                   glb_idx_end, dim, gidx_end[dim], dim, len[dim]);
        gidx_end[0] = ends[0];
    int64_t * count = par_virt_counts[tid];
    int64_t *count; alloc_ptr(sizeof(int64_t)*nbucket, (void**)&count);
    memset(count, 0, sizeof(int64_t)*nbucket);
    memset(gidx, 0, sizeof(int)*old_dist.order);
    int64_t *virt_offset; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&virt_offset);
    memset(virt_offset, 0, sizeof(int64_t)*old_dist.order);
    memset(idx, 0, sizeof(int)*old_dist.order);
    int64_t *virt_acc; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&virt_acc);
    memset(virt_acc, 0, sizeof(int64_t)*old_dist.order);
    int64_t *idx_acc; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&idx_acc);
    memset(idx_acc, 0, sizeof(int64_t)*old_dist.order);
    int64_t *old_virt_lda; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&old_virt_lda);
    old_virt_lda[0] = old_virt_nelem;
    int64_t zero_len_toff = 0;
      int64_t ist = iist/old_dist.phase[dim];
      int plen[old_dist.order];
      memcpy(plen,old_virt_edge_len,old_dist.order*sizeof(int));
      } while (idim >= 0 && sym[idim] != NS);
      offset += idx_acc[dim];
      if (gidx[dim] > gidx_st[dim]) break;
      int64_t vst = iist-ist*old_dist.phase[dim];
        virt_offset[dim] = vst;
        offset += vst*old_virt_lda[dim];
      if (gidx[dim] > gidx_st[dim]) break;
    ASSERT(old_permutation == NULL);
    int rep_phase0 = lcm(old_phys_dim[0],new_phys_dim[0])/old_phys_dim[0];
      printf("rep_phase0 = %d\n",rep_phase0);
    for (int id=0; id<rep_phase0; id++){
      for (int jd=0; jd<(old_phys_edge_len[0]-id)/rep_phase0; jd++){
        printf("bucket_offset[%d] = %d\n",id+jd*rep_phase0,bucket_offset[0][id+jd*rep_phase0]);
        ASSERT(bucket_offset[0][id+jd*rep_phase0] == bucket_offset[0][id] || bucket_offset[0][id+jd*rep_phase0] == -1);
      bool outside0 = false;
      int len_zero_max = ends[0];
      bool is_at_end = true;
      bool is_at_start = true;
        if (gidx[dim] > gidx_st[dim]){
        if (gidx[dim] < gidx_st[dim]){
        zero_len_toff = gidx_st[0];
        if (gidx_end[dim] < gidx[dim]){
        if (gidx_end[dim] > gidx[dim]){
        len_zero_max = MIN(ends[0],gidx_end[0]);
      int idx_max = (sym[0] == NS ? old_virt_edge_len[0] : idx[1]+1);
        int gidx_min = MAX(zero_len_toff,offs[0]);
        int gidx_max = (sym[0] == NS ? ends[0] : (sym[0] == SY ? gidx[1]+1 : gidx[1]));
        gidx_max = MIN(gidx_max, len_zero_max);
        int idx0 = MAX(0,(gidx_min-gidx[0])/old_phys_dim[0]);
        int idx1 = MAX(0,(gidx_max-gidx[0]+old_phys_dim[0]-1)/old_phys_dim[0]);
        int lencp = MIN(rep_phase0,idx1-idx0);
          for (int ia=0; ia<lencp; ia++){
            int64_t bucket = bucket0+bucket_offset[0][idx0];
            sr->copy((idx1-idx0+rep_phase0-1)/rep_phase0,
                     old_data+ sr->el_size*(offset+idx0), rep_phase0,
                     new_data[bucket]+sr->el_size*count[bucket], 1);
            count[bucket]+=(idx1-idx0+rep_phase0-1)/rep_phase0;
          for (int ia=0; ia<lencp; ia++){
            int64_t bucket = bucket0+bucket_offset[0][idx0];
            sr->copy((idx1-idx0+rep_phase0-1)/rep_phase0,
                     new_data[bucket]+sr->el_size*count[bucket], 1,
                     old_data+ sr->el_size*(offset+idx0), rep_phase0);
            count[bucket]+=(idx1-idx0+rep_phase0-1)/rep_phase0;
          virt_offset[dim] += 1;
            virt_offset[dim] = 0;
            if (idx[dim] == (sym[dim] == NS ? old_virt_edge_len[dim] : idx[dim+1]+1)){
              if (dim == old_dist.order-1) done = true;
        if (old_dist.order <= 1) done = true;
    for (int i = 0;i < nbucket-1;i++){
      if (count[i] != (int64_t)((new_data[i+1]-new_data[i])/sr->el_size)){
        printf("rank = %d count %d should have been %d is %ld\n",
               rank, i, (int)((new_data[i+1]-new_data[i])/sr->el_size), count[i]);
    par_virt_counts[tid] = count;
    for (int bckt=0; bckt<nbucket; bckt++){
      for (int thread=0; thread<max_ntd; thread++){
        par_tmp += par_virt_counts[thread][bckt];
        par_virt_counts[thread][bckt] = par_tmp - par_virt_counts[thread][bckt];
      if (bckt < nbucket-1 && par_tmp != (new_data[bckt+1]-new_data[bckt])/sr->el_size){
        printf("rank = %d count for bucket %d is %ld should have been %ld\n",
               rank, bckt, par_tmp, (int64_t)(new_data[bckt+1]-new_data[bckt])/sr->el_size);
      int64_t tot_sz = MAX(old_size, new_size);
        #pragma omp parallel for private(i)
        for (i=0; i<tot_sz; i++){
          if (bucket_store[i] != -1){
            int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
            int64_t ct = count_store[i]+pc;
          #pragma omp parallel for private(i)
          for (i=0; i<tot_sz; i++){
            if (bucket_store[i] != -1){
              int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
              int64_t ct = count_store[i]+pc;
          #pragma omp parallel for private(i)
          for (i=0; i<tot_sz; i++){
            if (bucket_store[i] != -1){
              int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
              int64_t ct = count_store[i]+pc;
              sr->acc(old_data+i*sr->el_size, beta, new_data[bucket_store[i]]+ct*sr->el_size, alpha);
    for (int t=0; t<max_ntd; t++){
  int64_t sy_packed_offset(int dim, int const * len, int idx, int const * sym){
    if (idx == 0) return 0;
    if (sym[dim-1] == NS){
      int64_t offset = iidx;
      } while (i<=dim && sym[dim-i] != NS);
               int const *          virt_edge_len,
               int const *          virt_phase_lda,
               char const *         tsr_data_in,
               int64_t              glb_ord_offset=0,
               int64_t              blk_ord_offset=0){
    int imax=virt_edge_len[idim];
    if (sym[idim] != NS) imax = prev_idx+1;
    int vp_stride = virt_phase_lda[idim]*dist.virt_phase[idim];
    for (int i=0; i<imax; i++){
      int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, i, sym);
      int64_t i_blk_ord_offset = blk_ord_offset + dim_offset;
      int64_t i_glb_ord_offset = glb_ord_offset + dim_offset*vp_stride;
        int64_t iv_blk_ord_offset = i_blk_ord_offset + v*virt_phase_lda[idim]*vbs;
        int64_t iv_glb_ord_offset = i_glb_ord_offset;
          int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, i+1, sym);
          iv_glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*v;
        ord_glb<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, i, iv_glb_ord_offset, iv_blk_ord_offset);
                         int const *          virt_edge_len,
                         int const *          virt_phase_lda,
                         char const *         tsr_data_in,
                         int64_t              glb_ord_offset,
                         int64_t              blk_ord_offset){
    int imax=virt_edge_len[0];
    if (sym[0] != NS) imax = prev_idx+1;
    for (int v=0; v<dist.virt_phase[0]; v++){
        sr->copy(imax, tsr_data_in  + sr->el_size*(blk_ord_offset+v*vbs), 1,
                       tsr_data_out + sr->el_size*(glb_ord_offset+v), dist.virt_phase[0]);
        sr->copy(imax, tsr_data_in  + sr->el_size*(glb_ord_offset+v), dist.virt_phase[0],
                       tsr_data_out + sr->el_size*(blk_ord_offset+v*vbs), 1);
                  int const *          virt_edge_len,
                  int const *          virt_phase_lda,
                  char const *         tsr_data_in,
                  int64_t              glb_ord_offset,
                  int64_t              blk_ord_offset);
                   int const *          virt_edge_len,
                   int const *          virt_phase_lda,
                   char const *         tsr_data_in,
                   int64_t              glb_ord_offset=0,
                   int64_t              blk_ord_offset=0){
    int imax=virt_edge_len[idim];
    if (sym[idim] != NS) imax = prev_idx+1;
      imax = MIN(imax,idx_end[idim]+1);
    int vp_stride = virt_phase_lda[idim]*dist.virt_phase[idim];
    for (int i=ist; i<imax; i++){
      int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, i, sym);
      int64_t i_blk_ord_offset = blk_ord_offset + dim_offset;
      int64_t i_glb_ord_offset = glb_ord_offset + dim_offset*vp_stride;
        int64_t iv_blk_ord_offset = i_blk_ord_offset + v*virt_phase_lda[idim]*vbs;
        int64_t iv_glb_ord_offset = i_glb_ord_offset;
          int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, i+1, sym);
          iv_glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*v;
        if (i==ist && i==imax-1)
          ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, idx_st, idx_end, i, iv_glb_ord_offset, iv_blk_ord_offset);
          ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, idx_st, NULL,    i, iv_glb_ord_offset, iv_blk_ord_offset);
          ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, NULL,   idx_end, i, iv_glb_ord_offset, iv_blk_ord_offset);
          ord_glb<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, i, iv_glb_ord_offset, iv_blk_ord_offset);
                      int const *          virt_edge_len,
                      int const *          virt_phase_lda,
                      char const *         tsr_data_in,
                      int64_t              glb_ord_offset,
                      int64_t              blk_ord_offset){
    ord_glb<0>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr,prev_idx,glb_ord_offset,blk_ord_offset);
                      int const *          virt_edge_len,
                      int const *          virt_phase_lda,
                      char const *         tsr_data_in,
                      int64_t              glb_ord_offset,
                      int64_t              blk_ord_offset);
                      int const *          virt_edge_len,
                      int const *          virt_phase_lda,
                      char const *         tsr_data_in,
    if (dist.order == 1){
      return ord_glb<0>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr);
    if (dist.order <= 8){
    int tid = omp_get_thread_num();
    int ntd = omp_get_num_threads();
    int64_t vbs_chunk = vbs/ntd;
    int64_t fidx_st = vbs_chunk*tid + MIN(tid,vbs%ntd);
    int64_t fidx_end = fidx_st + vbs_chunk;
    if (tid < vbs%ntd) fidx_end++;
    int * idx_st = (int*)alloc(dist.order*sizeof(int));
    int * idx_end = (int*)alloc(dist.order*sizeof(int));
      if (idx_end[idim] < 0 && idim+1<dist.order){
        idx_end[idim] = virt_edge_len[idim]-1;
  #define CASE_ORD_GLB(n)                                                                                         \
      ord_glb_omp<n-1>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr,idx_st,idx_end); \
  #define CASE_ORD_GLB(n)                                                                      \
      ord_glb<n-1>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr); \
    int order = dist.order;
    int * virt_idx = (int*)alloc(order*sizeof(int));
    int * idx = (int*)alloc(order*sizeof(int));
    std::fill(virt_idx, virt_idx+order, 0);
      std::fill(idx, idx+order, 0);
        int64_t glb_ord_offset = virt_idx[0];
        int64_t blk_ord_offset = virt_idx[0]*vbs;
        for (int idim=1; idim<order; idim++){
          int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, idx[idim], sym);
          blk_ord_offset += dim_offset;
          blk_ord_offset += virt_idx[idim]*virt_phase_lda[idim]*vbs;
          glb_ord_offset += dim_offset*virt_phase_lda[idim]*dist.virt_phase[idim];
          if (virt_idx[idim] > 0){
            int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, idx[idim]+1, sym);
            glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*virt_idx[idim];
        int n = virt_edge_len[0];
        if (sym[0] != NS) n = idx[1]+1;
        bool exit, finish=false;
            if (idx[dim] == virt_edge_len[dim]-1 || (sym[dim] != NS && idx[dim] == idx[dim+1])){
      bool exit, finish=false;
                            int const *          old_offsets,
                            int * const *        old_permutation,
                            int const *          new_offsets,
                            int * const *        new_permutation,
                            char **              ptr_tsr_data,
                            char **              ptr_tsr_cyclic_data,
    int i, np, old_nvirt, new_nvirt, old_np, new_np, idx_lyr;
    int64_t vbs_old, vbs_new;
    int64_t * send_counts, * recv_counts;
    int64_t * send_displs;
    int64_t * recv_displs;
    int * new_virt_lda, * old_virt_lda;
    int * old_sub_edge_len, * new_sub_edge_len;
    int order = old_dist.order;
    char * tsr_data = *ptr_tsr_data;
    char * tsr_cyclic_data = *ptr_tsr_cyclic_data;
      bool is_copy = false;
      if (ord_glb_comm.rank == 0){
          sr->copy(tsr_cyclic_data, tsr_data);
          sr->acc(tsr_cyclic_data, beta, tsr_data, alpha);
      *ptr_tsr_cyclic_data = tsr_cyclic_data;
      return tsr_cyclic_data;
      np = ord_glb_comm.np;
    alloc_ptr(order*sizeof(int),     (void**)&hsym);
    alloc_ptr(order*sizeof(int),     (void**)&idx);
    alloc_ptr(order*sizeof(int64_t), (void**)&idx_offs);
    alloc_ptr(order*sizeof(int),     (void**)&old_virt_lda);
    alloc_ptr(order*sizeof(int),     (void**)&new_virt_lda);
    idx_lyr = ord_glb_comm.rank;
    for (i=0; i<order; i++) {
      new_virt_lda[i] = new_nvirt;
      old_virt_lda[i] = old_nvirt;
    vbs_old = old_dist.size/old_nvirt;
    alloc_ptr(order*sizeof(int), (void**)&old_sub_edge_len);
    alloc_ptr(order*sizeof(int), (void**)&new_sub_edge_len);
    int ** bucket_offset;
    int *real_edge_len; alloc_ptr(sizeof(int)*order, (void**)&real_edge_len);
    int *old_phys_edge_len; alloc_ptr(sizeof(int)*order, (void**)&old_phys_edge_len);
    for (int dim = 0;dim < order;dim++) old_phys_edge_len[dim] = (real_edge_len[dim]+old_dist.padding[dim])/old_dist.phys_phase[dim];
    int *new_phys_edge_len; alloc_ptr(sizeof(int)*order, (void**)&new_phys_edge_len);
    for (int dim = 0;dim < order;dim++) new_phys_edge_len[dim] = (real_edge_len[dim]+new_dist.padding[dim])/new_dist.phys_phase[dim];
    int *old_virt_edge_len; alloc_ptr(sizeof(int)*order, (void**)&old_virt_edge_len);
    for (int dim = 0;dim < order;dim++) old_virt_edge_len[dim] = old_phys_edge_len[dim]/old_dist.virt_phase[dim];
    int *new_virt_edge_len; alloc_ptr(sizeof(int)*order, (void**)&new_virt_edge_len);
    for (int dim = 0;dim < order;dim++) new_virt_edge_len[dim] = new_phys_edge_len[dim]/new_dist.virt_phase[dim];
    for (i=0; i<order; i++){
    for (i=0; i<order; i++){
      new_sub_edge_len[i] = new_sub_edge_len[i] / new_dist.phase[i];
      old_sub_edge_len[i] = old_sub_edge_len[i] / old_dist.phase[i];
    for (i=1; i<order; i++){
    swp_nval = new_nvirt*sy_packed_size(order, new_sub_edge_len, sym);
    vbs_new = swp_nval/new_nvirt;
    char * send_buffer, * recv_buffer;
      recv_buffer = tsr_cyclic_data;
      order_globally(sym, old_dist, old_virt_edge_len, old_virt_lda, vbs_old, 1, tsr_data, tsr_cyclic_data, sr);
      char **new_data; alloc_ptr(sizeof(char*)*np, (void**)&new_data);
      for (int64_t p = 0;p < np;p++){
        new_data[p] = tsr_data+sr->el_size*send_displs[p];
    for (int dim = 0;dim < order;dim++){
      recv_buffer = tsr_cyclic_data;
      send_buffer = tsr_data;
    ord_glb_comm.all_to_allv(send_buffer, send_counts, send_displs, sr->el_size,
                             recv_buffer, recv_counts, recv_displs);
      if (swp_nval > old_dist.size){
    if (recv_displs[ord_glb_comm.np-1] + recv_counts[ord_glb_comm.np-1] > 0){
      sr->set(tsr_data, sr->addid(), swp_nval);
      char **new_data; alloc_ptr(sizeof(char*)*np, (void**)&new_data);
      for (int64_t p = 0;p < np;p++){
        new_data[p] = recv_buffer+recv_displs[p]*sr->el_size;
                  new_dist.phys_phase,
                  old_dist.phys_phase,
      order_globally(sym, new_dist, new_virt_edge_len, new_virt_lda, vbs_new, 0, tsr_data, tsr_cyclic_data, sr);
      for (int dim = 0;dim < order;dim++){
      sr->set(tsr_cyclic_data, sr->addid(), swp_nval);
    *ptr_tsr_cyclic_data = tsr_cyclic_data;
    *ptr_tsr_data = tsr_data;
 void calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
int ** compute_bucket_offsets(distribution const &old_dist, distribution const &new_dist, int const *len, int const *old_phys_edge_len, int const *old_virt_lda, int const *old_offsets, int *const *old_permutation, int const *new_phys_edge_len, int const *new_virt_lda, int forward, int old_virt_np, int new_virt_np, int const *old_virt_edge_len)
computes offsets for redistribution targets along each edge length 
void order_globally(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr)
reorders the local buffer so that elements are ordered according to where they lie in the global tensor...
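For intuition, here is a minimal 1-D sketch of the strided interleaving that the innermost ord_glb<0> copies perform; it is illustrative only: the name order_globally_1d is hypothetical, it operates on double rather than on algstrct elements, and it assumes dir selects the block-to-global direction.
// Illustrative sketch: a 1-D local buffer holds virt_phase virtual blocks of
// vbs elements each. "Block order" stores each block contiguously; "global
// order" interleaves the blocks cyclically, as the sr->copy calls in
// ord_glb<0> do with stride dist.virt_phase[0].
void order_globally_1d(int virt_phase, int64_t vbs, bool dir,
                       double const * in, double * out){
  for (int v=0; v<virt_phase; v++){
    for (int64_t j=0; j<vbs; j++){
      if (dir) out[j*virt_phase + v] = in[v*vbs + j];       // block order -> global order
      else     out[v*vbs + j]        = in[j*virt_phase + v]; // global order -> block order
    }
  }
}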
virtual bool isequal(char const *a, char const *b) const 
returns true if algstrct elements a and b are equal 
void acc(char *b, char const *beta, char const *a, char const *alpha) const 
compute b=beta*b + alpha*a 
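Concretely, for real-valued elements this accumulate is an axpby-style update; a minimal sketch over double (illustrative only, not the virtual dispatch the algstrct interface actually performs):
// Illustrative sketch of acc(b, beta, a, alpha) for double elements: b = beta*b + alpha*a.
inline void acc_double(char * b, char const * beta, char const * a, char const * alpha){
  double &       bd = *reinterpret_cast<double*>(b);
  double const & ad = *reinterpret_cast<double const*>(a);
  bd = (*reinterpret_cast<double const*>(beta)) * bd
     + (*reinterpret_cast<double const*>(alpha)) * ad;
}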
void ord_glb< 0 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
virtual void copy(char *a, char const *b) const 
copies element b to element a 
void * alloc(int64_t len)
alloc abstraction 
void glb_ord_pup(int const *sym, distribution const &old_dist, distribution const &new_dist, int const *len, int const *old_phys_dim, int const *old_phys_edge_len, int const *old_virt_edge_len, int64_t old_virt_nelem, int const *old_offsets, int *const *old_permutation, int total_np, int const *new_phys_dim, int const *new_phys_edge_len, int const *new_virt_edge_len, int64_t new_virt_nelem, char *old_data, char **new_data, int forward, int *const *bucket_offset, char const *alpha, char const *beta, algstrct const *sr)
virtual char const * addid() const 
identity element for addition i.e. 0 
void cyclic_reshuffle(int const *sym, distribution const &old_dist, int const *old_offsets, int *const *old_permutation, distribution const &new_dist, int const *new_offsets, int *const *new_permutation, char **ptr_tsr_data, char **ptr_tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm, bool reuse_buffers, char const *alpha, char const *beta)
Goes from any set of phases to any new set of phases. 
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction 
template void ord_glb< 7 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
char * glb_cyclic_reshuffle(int const *sym, distribution const &old_dist, int const *old_offsets, int *const *old_permutation, distribution const &new_dist, int const *new_offsets, int *const *new_permutation, char **ptr_tsr_data, char **ptr_tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm, bool reuse_buffers, char const *alpha, char const *beta)
Goes from any set of phases to any new set of phases. 
template void ord_glb_omp< 7 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int const *idx_st, int const *idx_end, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
void calc_cnt_displs(int const *sym, distribution const &old_dist, distribution const &new_dist, int new_nvirt, int np, int const *old_virt_edge_len, int const *new_virt_lda, int64_t *send_counts, int64_t *recv_counts, int64_t *send_displs, int64_t *recv_displs, CommData ord_glb_comm, int idx_lyr, int *const *bucket_offset)
computes the send and receive counts and displacements for the redistribution 
void ord_glb_omp< 0 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int const *idx_st, int const *idx_end, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
void sy_calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
same as above except assumes sym only NS or SY 
void ord_glb_omp(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int const *idx_st, int const *idx_end, int prev_idx=0, int64_t glb_ord_offset=0, int64_t blk_ord_offset=0)
int el_size
size of each element of algstrct in bytes 
int cdealloc(void *ptr)
free abstraction 
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
int64_t packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in packed symmetric (SY, SH, or AS) layout 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
void ord_glb(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int prev_idx=0, int64_t glb_ord_offset=0, int64_t blk_ord_offset=0)
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
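For a group of m indices tied together by SY symmetry with edge length n, the non-hollow packed layout stores binomial(n+m-1, m) entries, and the total size is the product over such groups. A minimal sketch under that assumption follows; the name sy_packed_size_sketch is hypothetical, it assumes the convention that sym[i] == SY links index i to index i+1 with NS ending a group, it expects the NS/SY symmetry flags to be in scope, and it ignores the hollow SH/AS cases.
// Illustrative sketch: packed size of an SY (non-hollow) symmetric layout.
int64_t sy_packed_size_sketch(int order, int const * len, int const * sym){
  int64_t size = 1;
  int i = 0;
  while (i < order){
    int m = 1;                                       // size of the symmetric group starting at index i
    while (i+m-1 < order-1 && sym[i+m-1] == SY) m++;
    int64_t n = len[i];
    int64_t grp = 1;
    for (int k=0; k<m; k++) grp = grp*(n+k)/(k+1);   // binomial(n+m-1, m), built incrementally
    size *= grp;
    i += m;
  }
  return size;
}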