Cyclops Tensor Framework — parallel arithmetic on multidimensional arrays.
Source listing: dgtog_calc_cnt.cxx
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "dgtog_calc_cnt.h"
4 #include "../shared/util.h"
5 #include "../interface/common.h"
6 
7 namespace CTF_int {
8  //correct for SY
9  inline int get_glb(int i, int s, int t){
10  return i*s+t;
11  }
12  //correct for SH/AS, but can treat everything as SY
13  /*inline int get_glb(int i, int s, int t){
14  return i*s+t-1;
15  }*/
16  inline int get_loc(int g, int s, int t){
17  //round down, dowwwwwn
18  if (t>g) return -1;
19  else return (g-t)/s;
20  }
21 
22  template <int idim>
23  int64_t calc_cnt(int const * sym,
24  int const * rep_phase,
25  int const * sphase,
26  int const * gidx_off,
27  int const * edge_len,
28  int const * loc_edge_len){
29  assert(sym[idim] == NS); //otherwise should be in calc_sy_pfx
30  if (sym[idim-1] == NS){
31  return (get_loc(edge_len[idim]-1,sphase[idim],gidx_off[idim])+1)*calc_cnt<idim-1>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
32  } else {
33  int64_t * pfx = calc_sy_pfx<idim>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
34  int64_t cnt = 0;
35  for (int i=0; i<=get_loc(edge_len[idim]-1,sphase[idim],gidx_off[idim]); i++){
36  cnt += pfx[i];
37  }
38  cdealloc(pfx);
39  return cnt;
40  }
41  }
42 
43  template <>
44  int64_t calc_cnt<0>(int const * sym,
45  int const * rep_phase,
46  int const * sphase,
47  int const * gidx_off,
48  int const * edge_len,
49  int const * loc_edge_len){
50  assert(sym[0] == NS);
51  return get_loc(edge_len[0]-1, sphase[0], gidx_off[0])+1;
52  }
53 
  // For each local index i along dimension idim, computes the number of
  // locally owned elements of the order-idim subtensor whose leading index
  // is symmetry-constrained by i. Returns an alloc()'d array of size
  // loc_edge_len[idim]; the caller must cdealloc() it.
  template <int idim>
  int64_t * calc_sy_pfx(int const * sym,
                        int const * rep_phase,
                        int const * sphase,
                        int const * gidx_off,
                        int const * edge_len,
                        int const * loc_edge_len){
    int64_t * pfx = (int64_t*)alloc(sizeof(int64_t)*loc_edge_len[idim]);
    if (sym[idim-1] == NS){
      // dimension idim-1 is unconstrained, so every slice has the same count
      int64_t ns_size = calc_cnt<idim-1>(sym,rep_phase,sphase,gidx_off,edge_len,loc_edge_len);
      for (int i=0; i<loc_edge_len[idim]; i++){
        pfx[i] = ns_size;
      }
    } else {
      // symmetric with the next-lower dimension: build incrementally from the
      // prefix counts of the order-(idim-1) subtensor
      int64_t * pfx_m1 = calc_sy_pfx<idim-1>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
      for (int i=0; i<loc_edge_len[idim]; i++){
        // jst = first local index of dim idim-1 newly admitted relative to
        // slice i-1; SY keeps the diagonal, SH/AS (else branch) excludes it
        int jst;
        if (i>0){
          pfx[i] = pfx[i-1];
          if (sym[idim-1] == SY)
            jst = get_loc(get_glb(i-1,sphase[idim],gidx_off[idim]),sphase[idim-1],gidx_off[idim-1])+1;
          else
            jst = get_loc(get_glb(i-1,sphase[idim],gidx_off[idim])-1,sphase[idim-1],gidx_off[idim-1])+1;
        } else {
          pfx[i] = 0;
          jst = 0;
        }
        // jed = last admissible local index of dim idim-1 for slice i,
        // clipped to the global edge length
        int jed;
        if (sym[idim-1] == SY)
          jed = get_loc(std::min(edge_len[idim]-1,get_glb(i,sphase[idim],gidx_off[idim])),sphase[idim-1],gidx_off[idim-1]);
        else
          jed = get_loc(std::min(edge_len[idim]-1,get_glb(i,sphase[idim],gidx_off[idim]))-1,sphase[idim-1],gidx_off[idim-1]);
        // accumulate the counts of the newly admitted rows
        for (int j=jst; j<=jed; j++){
          //printf("idim = %d j=%d loc_edge[idim] = %d loc_Edge[idim-1]=%d\n",idim,j,loc_edge_len[idim],loc_edge_len[idim-1]);
          pfx[i] += pfx_m1[j];
        }
      }
      cdealloc(pfx_m1);
    }
    return pfx;
  }
95 
96  template <>
97  int64_t * calc_sy_pfx<1>(int const * sym,
98  int const * rep_phase,
99  int const * sphase,
100  int const * gidx_off,
101  int const * edge_len,
102  int const * loc_edge_len){
103  int64_t * pfx= (int64_t*)alloc(sizeof(int64_t)*loc_edge_len[1]);
104  if (sym[0] == NS){
105  int64_t cnt = calc_cnt<0>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
106  std::fill(pfx, pfx+loc_edge_len[1], cnt);
107  } else if (sym[0] == SY){
108  for (int i=0; i<loc_edge_len[1]; i++){
109  pfx[i] = get_loc(get_glb(i,sphase[1],gidx_off[1]),sphase[0],gidx_off[0])+1;
110  }
111  } else {
112  for (int i=0; i<loc_edge_len[1]; i++){
113  pfx[i] = get_loc(get_glb(i,sphase[1],gidx_off[1])-1,sphase[0],gidx_off[0])+1;
114  }
115  }
116  return pfx;
117  }
118 
119  template <int idim>
120  void calc_drv_cnts(int order,
121  int const * sym,
122  int64_t * counts,
123  int const * rep_phase,
124  int const * rep_phase_lda,
125  int const * sphase,
126  int const * phys_phase,
127  int * gidx_off,
128  int const * edge_len,
129  int const * loc_edge_len){
130  for (int i=0; i<rep_phase[idim]; i++, gidx_off[idim]+=phys_phase[idim]){
131  calc_drv_cnts<idim-1>(order, sym, counts+i*rep_phase_lda[idim], rep_phase, rep_phase_lda, sphase, phys_phase,
132  gidx_off, edge_len, loc_edge_len);
133  }
134  gidx_off[idim] -= phys_phase[idim]*rep_phase[idim];
135  }
136 
137  template <>
138  void calc_drv_cnts<0>(int order,
139  int const * sym,
140  int64_t * counts,
141  int const * rep_phase,
142  int const * rep_phase_lda,
143  int const * sphase,
144  int const * phys_phase,
145  int * gidx_off,
146  int const * edge_len,
147  int const * loc_edge_len){
148  for (int i=0; i<rep_phase[0]; i++, gidx_off[0]+=phys_phase[0]){
149  SWITCH_ORD_CALL_RET(counts[i], calc_cnt, order-1, sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len)
150  }
151  gidx_off[0] -= phys_phase[0]*rep_phase[0];
152  }
153 
154  template <int idim>
155  void calc_cnt_from_rep_cnt(int const * rep_phase,
156  int * const * pe_offset,
157  int * const * bucket_offset,
158  int64_t const * old_counts,
159  int64_t * counts,
160  int bucket_off,
161  int pe_off,
162  int dir){
163  for (int i=0; i<rep_phase[idim]; i++){
164  int rec_bucket_off = bucket_off+bucket_offset[idim][i];
165  int rec_pe_off = pe_off+pe_offset[idim][i];
166  calc_cnt_from_rep_cnt<idim-1>(rep_phase, pe_offset, bucket_offset, old_counts, counts, rec_bucket_off, rec_pe_off, dir);
167 // coff+new_pe_lda[idim]*((rank[idim]+i*old_phys_phase[idim])%new_phys_phase[idim]),
168  // roff+rep_phase_lda[idim]*i, dir);
169  }
170  }
171 
172  template <>
174  (int const * rep_phase,
175  int * const * pe_offset,
176  int * const * bucket_offset,
177  int64_t const * old_counts,
178  int64_t * counts,
179  int bucket_off,
180  int pe_off,
181  int dir){
182  if (dir){
183  for (int i=0; i<rep_phase[0]; i++){
184  counts[pe_off+pe_offset[0][i]] = old_counts[bucket_off+i];
185  }
186  } else {
187  for (int i=0; i<rep_phase[0]; i++){
188  counts[bucket_off+i] = old_counts[pe_off+pe_offset[0][i]];
189  }
190 
191  }
192  }
193 
194 
// Explicit-instantiation helper for calc_cnt_from_rep_cnt<X>; the name
// records that the Intel compiler (icpc) required these explicit
// instantiations rather than instantiating the template implicitly.
#define INST_CALC_CNT_BEC_ICPC_SUCKS(X) \
  template \
  void calc_cnt_from_rep_cnt<X> \
                            (int const * rep_phase, \
                             int * const * pe_offset, \
                             int * const * bucket_offset, \
                             int64_t const * old_counts, \
                             int64_t * counts, \
                             int bucket_off, \
                             int pe_off, \
                             int dir);
206 
207 
219 
  // Computes, for every replication bucket of the stretched (lcm) phase, the
  // number of elements this processor owns under old_dist — the send counts
  // for redistributing from old_dist to new_dist. Only ranks on the root
  // index layer (idx_lyr == 0) own data; others leave counts untouched.
  // NOTE(review): the source listing omits one interior line here (original
  // line 226, presumably profiling instrumentation) — verify against the
  // repository before relying on this listing byte-for-byte.
  void calc_drv_displs(int const * sym,
                       int const * edge_len,
                       distribution const & old_dist,
                       distribution const & new_dist,
                       int64_t * counts,
                       int idx_lyr){
    int * rep_phase, * gidx_off, * sphase;
    int * rep_phase_lda;
    int * new_loc_edge_len;
    if (idx_lyr == 0){
      int order = old_dist.order;
      rep_phase = (int*)alloc(order*sizeof(int));
      rep_phase_lda = (int*)alloc(order*sizeof(int));
      sphase = (int*)alloc(order*sizeof(int));
      gidx_off = (int*)alloc(order*sizeof(int));
      new_loc_edge_len = (int*)alloc(order*sizeof(int));
      int nrep = 1;
      for (int i=0; i<order; i++){
        //FIXME: computed elsewhere already
        // row-major-style stride of bucket i within the flat counts array
        rep_phase_lda[i] = nrep;
        // stretched phase: smallest phase compatible with both distributions
        sphase[i] = lcm(old_dist.phys_phase[i],new_dist.phys_phase[i]);
        // repetitions of the old physical phase inside the stretched phase
        rep_phase[i] = sphase[i] / old_dist.phys_phase[i];
        // this rank's starting global index in the old distribution
        gidx_off[i] = old_dist.perank[i];
        nrep *= rep_phase[i];
        // local edge length under the stretched phase, rounded up
        new_loc_edge_len[i] = (edge_len[i]+sphase[i]-1)/sphase[i];
        //printf("rep_phase[%d] = %d, sphase = %d lda = %d, gidx = %d\n",i,rep_phase[i], sphase[i], rep_phase_lda[i], gidx_off[i]);
      }
      assert(order>0);
      // dispatch on tensor order to calc_drv_cnts<order-1>
      SWITCH_ORD_CALL(calc_drv_cnts, order-1, order, sym, counts, rep_phase, rep_phase_lda, sphase, old_dist.phys_phase, gidx_off, edge_len, new_loc_edge_len)

      cdealloc(rep_phase);
      cdealloc(rep_phase_lda);
      cdealloc(sphase);
      cdealloc(gidx_off);
      cdealloc(new_loc_edge_len);
    }
  }
259 
260 
  // Precomputes, per dimension, lookup tables used by the dgtog
  // redistribution kernels:
  //   pe_offset[dim][pidx]     — destination-processor offset (scaled by the
  //                              new distribution's processor-grid stride),
  //   bucket_offset[dim][pidx] — replication-bucket offset within counts,
  //   data_offset[dim][pidx]   — offset into the local (SY-packed) data,
  //   ivmax_pre[dim][pidx]     — max admissible local index of dim given the
  //                              symmetry constraint from dim+1.
  // pidx runs over virtual-index (vidx) x virtual-rank (vr) pairs, capped at
  // max(rep_phase[dim], phys_edge_len[dim]).
  // NOTE(review): the listing omits two interior lines here (original lines
  // 275 and 352, presumably profiling instrumentation).
  void precompute_offsets(distribution const & old_dist,
                          distribution const & new_dist,
                          int const * sym,
                          int const * len,
                          int const * rep_phase,
                          int const * phys_edge_len,
                          int const * virt_edge_len,
                          int const * virt_dim,
                          int const * virt_lda,
                          int64_t virt_nelem,
                          int ** pe_offset,
                          int ** bucket_offset,
                          int64_t ** data_offset,
                          int ** ivmax_pre){

    int rep_phase_lda = 1;
    // outermost dimension is unconstrained: single entry, full local extent
    // NOTE(review): allocates sizeof(int64_t) per entry though ivmax_pre
    // holds int — over-allocation is harmless but looks unintended; confirm.
    alloc_ptr(sizeof(int64_t)*1, (void**)&ivmax_pre[old_dist.order-1]);
    ivmax_pre[old_dist.order-1][0] = get_loc(len[old_dist.order-1]-1, old_dist.phys_phase[old_dist.order-1], old_dist.perank[old_dist.order-1]);

    for (int dim = 0;dim < old_dist.order;dim++){
      alloc_ptr(sizeof(int)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&pe_offset[dim]);
      alloc_ptr(sizeof(int)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&bucket_offset[dim]);
      alloc_ptr(sizeof(int64_t)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&data_offset[dim]);
      if (dim > 0)
        alloc_ptr(sizeof(int64_t)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&ivmax_pre[dim-1]);

      int nsym;
      int pidx = 0;
      int64_t data_stride, sub_data_stride;

      if (dim > 0 && sym[dim-1] != NS){
        // dim sits atop a symmetric group: count its length (nsym) and take
        // the packed size below the group as the sub-stride
        int jdim=dim-2;
        nsym = 1;
        while (jdim>=0 && sym[jdim] != NS){ nsym++; jdim--; }
        if (jdim+1 == 0){
          sub_data_stride = 1;
        } else {
          sub_data_stride = sy_packed_size(jdim+1, virt_edge_len, sym);
        }
        data_stride = 0;
      } else {
        nsym = 0; //not used
        sub_data_stride = 0; //not used
        // nonsymmetric: constant stride = packed size of all lower dimensions
        if (dim == 0) data_stride = 1;
        else {
          data_stride = sy_packed_size(dim, virt_edge_len, sym);
        }
      }

      int64_t data_off =0;

      for (int vidx = 0;
           vidx < std::max((rep_phase[dim]+old_dist.virt_phase[dim]-1)/old_dist.virt_phase[dim],virt_edge_len[dim]);
           vidx++){

        int64_t rec_data_off = data_off;
        // symmetric case: stride grows with vidx as a binomial-style product
        if (dim > 0 && sym[dim-1] != NS){
          data_stride = (vidx+1)*sub_data_stride;
          for (int j=1; j<nsym; j++){
            data_stride = (data_stride*(vidx+j+1))/(j+1);
          }
        }
        data_off += data_stride;
        for (int vr = 0;vr < old_dist.virt_phase[dim] && pidx<std::max(rep_phase[dim],phys_edge_len[dim]) ;vr++,pidx++){

          if (dim>0){
            // max admissible local index of dim-1 for this pidx:
            // NS = full extent; SY keeps the diagonal; SH/AS excludes it
            if (sym[dim-1] == NS){
              ivmax_pre[dim-1][pidx] = get_loc(len[dim-1]-1, old_dist.phys_phase[dim-1], old_dist.perank[dim-1]);
            } else if (sym[dim-1] == SY){
              ivmax_pre[dim-1][pidx] = get_loc(get_glb(pidx, old_dist.phys_phase[dim ], old_dist.perank[dim ]),
                                               old_dist.phys_phase[dim-1], old_dist.perank[dim-1]);
            } else {
              ivmax_pre[dim-1][pidx] = get_loc(get_glb(pidx, old_dist.phys_phase[dim ], old_dist.perank[dim ])-1,
                                               old_dist.phys_phase[dim-1], old_dist.perank[dim-1]);
            }
          }

          // successive virtual ranks are virt_lda[dim]*virt_nelem apart
          data_offset[dim][pidx] = rec_data_off;
          rec_data_off += virt_lda[dim]*virt_nelem;

          // global index of this (vidx, vr) pair; its residue modulo the new
          // physical phase identifies the destination processor
          int64_t gidx = (int64_t)vidx*old_dist.phase[dim]+old_dist.perank[dim]+(int64_t)vr*old_dist.phys_phase[dim];
          int phys_rank = gidx%new_dist.phys_phase[dim];
          pe_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
          bucket_offset[dim][pidx] = (pidx%rep_phase[dim])*rep_phase_lda;
        }
      }
      rep_phase_lda *= rep_phase[dim];
    }

  }
355 
356 }
357 
void calc_cnt_from_rep_cnt< 0 >(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, int64_t const *old_counts, int64_t *counts, int bucket_off, int pe_off, int dir)
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
Definition: common.h:37
void calc_drv_cnts(int order, int const *sym, int64_t *counts, int const *rep_phase, int const *rep_phase_lda, int const *sphase, int const *phys_phase, int *gidx_off, int const *edge_len, int const *loc_edge_len)
void calc_drv_displs(int const *sym, int const *edge_len, distribution const &old_dist, distribution const &new_dist, int64_t *counts, int idx_lyr)
int64_t calc_cnt< 0 >(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
#define MAX(a, b)
Definition: util.h:180
int64_t * calc_sy_pfx(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
computes the cardinality of the sets of elements of a tensor of order idim+1 for different values of the leading (idim-th) local index, as needed when that index is part of a symmetric group
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
#define SWITCH_ORD_CALL(F, act_ord,...)
Definition: util.h:119
void calc_drv_cnts< 0 >(int order, int const *sym, int64_t *counts, int const *rep_phase, int const *rep_phase_lda, int const *sphase, int const *phys_phase, int *gidx_off, int const *edge_len, int const *loc_edge_len)
#define SWITCH_ORD_CALL_RET(R, F, act_ord,...)
Definition: util.h:127
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
int64_t calc_cnt(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
computes the cardinality of the set of elements of a tensor of order idim+1 that are owned by this processor
int get_loc(int g, int s, int t)
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
int lcm(int a, int b)
Definition: util.h:340
#define INST_CALC_CNT_BEC_ICPC_SUCKS(X)
int64_t * calc_sy_pfx< 1 >(int const *sym, int const *rep_phase, int const *sphase, int const *gidx_off, int const *edge_len, int const *loc_edge_len)
Definition: common.h:37
void precompute_offsets(distribution const &old_dist, distribution const &new_dist, int const *sym, int const *len, int const *rep_phase, int const *phys_edge_len, int const *virt_edge_len, int const *virt_dim, int const *virt_lda, int64_t virt_nelem, int **pe_offset, int **bucket_offset, int64_t **data_offset, int **ivmax_pre)
void calc_cnt_from_rep_cnt(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, int64_t const *old_counts, int64_t *counts, int bucket_off, int pe_off, int dir)
int get_glb(int i, int s, int t)
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
Definition: util.cxx:10