4 #include "../shared/util.h"     5 #include "../interface/common.h"     9   inline int get_glb(
int i, 
int s, 
int t){
    24                    int const * rep_phase,
    28                    int const * loc_edge_len){
    29     assert(sym[idim] == 
NS); 
    30     if (sym[idim-1] == 
NS){
    31       return (
get_loc(edge_len[idim]-1,sphase[idim],gidx_off[idim])+1)*
calc_cnt<idim-1>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
    33       int64_t * pfx = calc_sy_pfx<idim>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
    35       for (
int i=0; i<=
get_loc(edge_len[idim]-1,sphase[idim],gidx_off[idim]); i++){
    45                       int const * rep_phase,
    49                       int const * loc_edge_len){
    51     return get_loc(edge_len[0]-1, sphase[0], gidx_off[0])+1;
    56                         int const * rep_phase,
    60                         int const * loc_edge_len){
    61     int64_t * pfx = (int64_t*)
alloc(
sizeof(int64_t)*loc_edge_len[idim]);
    62     if (sym[idim-1] == 
NS){
    63       int64_t ns_size = 
calc_cnt<idim-1>(sym,rep_phase,sphase,gidx_off,edge_len,loc_edge_len);
    64       for (
int i=0; i<loc_edge_len[idim]; i++){
    68       int64_t * pfx_m1 = 
calc_sy_pfx<idim-1>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
    69       for (
int i=0; i<loc_edge_len[idim]; i++){
    73           if (sym[idim-1] == 
SY)
    74             jst = 
get_loc(
get_glb(i-1,sphase[idim],gidx_off[idim]),sphase[idim-1],gidx_off[idim-1])+1;
    76             jst = 
get_loc(
get_glb(i-1,sphase[idim],gidx_off[idim])-1,sphase[idim-1],gidx_off[idim-1])+1;
    82         if (sym[idim-1] == 
SY)
    83           jed = 
get_loc(std::min(edge_len[idim]-1,
get_glb(i,sphase[idim],gidx_off[idim])),sphase[idim-1],gidx_off[idim-1]);
    85           jed = 
get_loc(std::min(edge_len[idim]-1,
get_glb(i,sphase[idim],gidx_off[idim]))-1,sphase[idim-1],gidx_off[idim-1]);
    86         for (
int j=jst; j<=jed; j++){
    98                            int const * rep_phase,
   100                            int const * gidx_off,
   101                            int const * edge_len,
   102                            int const * loc_edge_len){
   103     int64_t * pfx= (int64_t*)
alloc(
sizeof(int64_t)*loc_edge_len[1]);
   105       int64_t cnt = 
calc_cnt<0>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
   106       std::fill(pfx, pfx+loc_edge_len[1], cnt);
   107     } 
else if (sym[0] == 
SY){
   108       for (
int i=0; i<loc_edge_len[1]; i++){
   109         pfx[i] = 
get_loc(
get_glb(i,sphase[1],gidx_off[1]),sphase[0],gidx_off[0])+1;
   112       for (
int i=0; i<loc_edge_len[1]; i++){
   113         pfx[i] = 
get_loc(
get_glb(i,sphase[1],gidx_off[1])-1,sphase[0],gidx_off[0])+1;
   123                      int const * rep_phase,
   124                      int const * rep_phase_lda,
   126                      int const * phys_phase,
   128                      int const * edge_len,
   129                      int const * loc_edge_len){
   130     for (
int i=0; i<rep_phase[idim]; i++, gidx_off[idim]+=phys_phase[idim]){
   131        calc_drv_cnts<idim-1>(order, sym, counts+i*rep_phase_lda[idim], rep_phase, rep_phase_lda, sphase, phys_phase,
   132                              gidx_off, edge_len, loc_edge_len);
   134     gidx_off[idim] -= phys_phase[idim]*rep_phase[idim];
   141                         int const * rep_phase,
   142                         int const * rep_phase_lda,
   144                         int const * phys_phase,
   146                         int const * edge_len,
   147                         int const * loc_edge_len){
   148     for (
int i=0; i<rep_phase[0]; i++, gidx_off[0]+=phys_phase[0]){
   151     gidx_off[0] -= phys_phase[0]*rep_phase[0];
   156                              int * 
const *   pe_offset,
   157                              int * 
const *   bucket_offset,
   158                              int64_t 
const * old_counts,
   163     for (
int i=0; i<rep_phase[idim]; i++){
   164       int rec_bucket_off = bucket_off+bucket_offset[idim][i];
   165       int rec_pe_off = pe_off+pe_offset[idim][i];
   166       calc_cnt_from_rep_cnt<idim-1>(rep_phase, pe_offset, bucket_offset, old_counts, counts, rec_bucket_off, rec_pe_off, dir);
   174                             (
int const *     rep_phase,
   175                              int * 
const *   pe_offset,
   176                              int * 
const *   bucket_offset,
   177                              int64_t 
const * old_counts,
   183       for (
int i=0; i<rep_phase[0]; i++){
   184         counts[pe_off+pe_offset[0][i]] = old_counts[bucket_off+i];
   187       for (
int i=0; i<rep_phase[0]; i++){
   188         counts[bucket_off+i] = old_counts[pe_off+pe_offset[0][i]];
   195 #define INST_CALC_CNT_BEC_ICPC_SUCKS(X) \   197   void calc_cnt_from_rep_cnt<X> \   198                             (int const *     rep_phase, \   199                              int * const *   pe_offset, \   200                              int * const *   bucket_offset, \   201                              int64_t const * old_counts, \   221                        int const *          edge_len,
   227     int * rep_phase, * gidx_off, * sphase;
   229     int * new_loc_edge_len;
   231       int order = old_dist.order;
   232       rep_phase     = (
int*)
alloc(order*
sizeof(
int));
   233       rep_phase_lda = (
int*)
alloc(order*
sizeof(
int));
   234       sphase        = (
int*)
alloc(order*
sizeof(
int));
   235       gidx_off      = (
int*)
alloc(order*
sizeof(
int));
   236       new_loc_edge_len      = (
int*)
alloc(order*
sizeof(
int));
   238       for (
int i=0; i<order; i++){
   240         rep_phase_lda[i]  = nrep;
   241         sphase[i]         = 
lcm(old_dist.phys_phase[i],new_dist.phys_phase[i]);
   242         rep_phase[i]      = sphase[i] / old_dist.phys_phase[i];
   243         gidx_off[i]       = old_dist.perank[i];
   244         nrep             *= rep_phase[i];
   245         new_loc_edge_len[i] = (edge_len[i]+sphase[i]-1)/sphase[i];
   249       SWITCH_ORD_CALL(
calc_drv_cnts, order-1, order, sym, counts, rep_phase, rep_phase_lda, sphase, old_dist.phys_phase, gidx_off, edge_len, new_loc_edge_len)
   265                           int const *          rep_phase,
   266                           int const *          phys_edge_len,
   267                           int const *          virt_edge_len,
   268                           int const *          virt_dim,
   269                           int const *          virt_lda,
   272                           int **               bucket_offset,
   273                           int64_t **           data_offset,
   277     int rep_phase_lda = 1; 
   278     alloc_ptr(
sizeof(int64_t)*1, (
void**)&ivmax_pre[old_dist.
order-1]);
   282       alloc_ptr(
sizeof(
int)*std::max(rep_phase[
dim],phys_edge_len[dim]), (
void**)&pe_offset[dim]);
   283       alloc_ptr(
sizeof(
int)*std::max(rep_phase[dim],phys_edge_len[dim]), (
void**)&bucket_offset[dim]);
   284       alloc_ptr(
sizeof(int64_t)*std::max(rep_phase[dim],phys_edge_len[dim]), (
void**)&data_offset[dim]);
   286         alloc_ptr(
sizeof(int64_t)*std::max(rep_phase[dim],phys_edge_len[dim]), (
void**)&ivmax_pre[dim-1]);
   290       int64_t data_stride, sub_data_stride;
   292       if (dim > 0 && sym[dim-1] != 
NS){
   295         while (jdim>=0 && sym[jdim] != 
NS){ nsym++; jdim--; }
   305         if (dim == 0) data_stride = 1;
   314            vidx < std::max((rep_phase[dim]+old_dist.
virt_phase[dim]-1)/old_dist.
virt_phase[dim],virt_edge_len[dim]);
   317         int64_t rec_data_off = data_off;
   318         if (dim > 0 && sym[dim-1] != 
NS){
   319           data_stride = (vidx+1)*sub_data_stride;
   320           for (
int j=1; j<nsym; j++){
   321             data_stride = (data_stride*(vidx+j+1))/(j+1);
   324         data_off += data_stride;
   325         for (
int vr = 0;vr < old_dist.
virt_phase[
dim] && pidx<std::max(rep_phase[dim],phys_edge_len[dim]) ;vr++,pidx++){
   328             if (sym[dim-1] == 
NS){
   330             } 
else if (sym[dim-1] == 
SY){
   339           data_offset[
dim][pidx] = rec_data_off;
   340           rec_data_off += virt_lda[
dim]*virt_nelem; 
   344           pe_offset[
dim][pidx] = phys_rank*
MAX(1,new_dist.
pe_lda[dim]);
   345           bucket_offset[
dim][pidx] = (pidx%rep_phase[
dim])*rep_phase_lda;
   348       rep_phase_lda *= rep_phase[
dim];
 
void calc_cnt_from_rep_cnt< 0 >(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, int64_t const *old_counts, int64_t *counts, int bucket_off, int pe_off, int dir)
void * alloc(int64_t len)
alloc abstraction 
void calc_drv_cnts(int order, int const *sym, int64_t *counts, int const *rep_phase, int const *rep_phase_lda, int const *sphase, int const *phys_phase, int *gidx_off, int const *edge_len, int const *loc_edge_len)
void calc_drv_displs(int const *sym, int const *edge_len, distribution const &old_dist, distribution const &new_dist, int64_t *counts, int idx_lyr)
int64_t calc_cnt< 0 >(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
int64_t * calc_sy_pfx(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
computes the cardinality of the sets of elements of a tensor of order idim+1 for different values of ...
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
#define SWITCH_ORD_CALL(F, act_ord,...)
void calc_drv_cnts< 0 >(int order, int const *sym, int64_t *counts, int const *rep_phase, int const *rep_phase_lda, int const *sphase, int const *phys_phase, int *gidx_off, int const *edge_len, int const *loc_edge_len)
#define SWITCH_ORD_CALL_RET(R, F, act_ord,...)
int64_t calc_cnt(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
computes the cardinality of the set of elements of a tensor of order idim+1 that are owned by process...
int get_loc(int g, int s, int t)
int cdealloc(void *ptr)
free abstraction 
#define INST_CALC_CNT_BEC_ICPC_SUCKS(X)
int64_t * calc_sy_pfx< 1 >(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
void precompute_offsets(distribution const &old_dist, distribution const &new_dist, int const *sym, int const *len, int const *rep_phase, int const *phys_edge_len, int const *virt_edge_len, int const *virt_dim, int const *virt_lda, int64_t virt_nelem, int **pe_offset, int **bucket_offset, int64_t **data_offset, int **ivmax_pre)
void calc_cnt_from_rep_cnt(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, int64_t const *old_counts, int64_t *counts, int bucket_off, int pe_off, int dir)
int get_glb(int i, int s, int t)
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout