Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
redist.cxx
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "redist.h"
4 #include "../shared/util.h"
5 #include "sparse_rw.h"
6 
7 namespace CTF_int {
8  void padded_reshuffle(int const * sym,
9  distribution const & old_dist,
10  distribution const & new_dist,
11  char * tsr_data,
12  char ** tsr_cyclic_data,
13  algstrct const * sr,
14  CommData ord_glb_comm){
15  int old_num_virt, new_num_virt, numPes;
16  int64_t new_nval, swp_nval;
17  int idx_lyr;
18  int * virt_phase_rank, * old_virt_phase_rank, * sub_edge_len;
19  char * pairs, * tsr_new_data;
20  DEBUG_PRINTF("Performing padded reshuffle\n");
21 
23 
24  numPes = ord_glb_comm.np;
25 
26  alloc_ptr(old_dist.order*sizeof(int), (void**)&virt_phase_rank);
27  alloc_ptr(old_dist.order*sizeof(int), (void**)&old_virt_phase_rank);
28  alloc_ptr(old_dist.order*sizeof(int), (void**)&sub_edge_len);
29 
30  new_num_virt = 1;
31  old_num_virt = 1;
32  idx_lyr = ord_glb_comm.rank;
33  for (int i=0; i<old_dist.order; i++){
34  old_num_virt = old_num_virt*old_dist.virt_phase[i];
35  new_num_virt = new_num_virt*new_dist.virt_phase[i];
36  virt_phase_rank[i] = new_dist.perank[i];//*new_dist.virt_phase[i];
37  old_virt_phase_rank[i] = old_dist.perank[i];//*old_dist.virt_phase[i];
38  idx_lyr -= old_dist.perank[i]*old_dist.pe_lda[i];
39  }
40  if (idx_lyr == 0 ){
41  read_loc_pairs(old_dist.order, old_dist.size, old_num_virt, sym,
42  old_dist.pad_edge_len, old_dist.padding, old_dist.phase, old_dist.phys_phase,
43  old_dist.virt_phase, old_virt_phase_rank, &new_nval, tsr_data,
44  &pairs, sr);
45  } else {
46  new_nval = 0;
47  pairs = NULL;
48  }
49 
50  /*#if DEBUG >= 1
51  int64_t old_size = sy_packed_size(old_dist.order, new_dist.pad_edge_len, sym);
52  #endif*/
53 
54  for (int i=0; i<old_dist.order; i++){
55  sub_edge_len[i] = new_dist.pad_edge_len[i] / new_dist.phase[i];
56  }
57  if (ord_glb_comm.rank == 0){
58  DPRINTF(1,"Tensor now has virtualization factor of %d\n",new_num_virt);
59  }
60  swp_nval = new_num_virt*sy_packed_size(old_dist.order, sub_edge_len, sym);
61  if (ord_glb_comm.rank == 0){
62  DPRINTF(1,"Tensor is of size %ld, has factor of %lf growth due to padding\n",
63  swp_nval,
64  ord_glb_comm.np*(swp_nval/(double)old_dist.size));
65  }
66 
67  alloc_ptr(swp_nval*sr->el_size, (void**)&tsr_new_data);
68 
69  if (sr->addid() != NULL)
70  sr->set(tsr_new_data, sr->addid(), swp_nval);
71 
72 
73  int64_t ignrd;
74  char * aignrd;
75  wr_pairs_layout(old_dist.order,
76  numPes,
77  new_nval,
78  sr->mulid(),
79  sr->addid(),
80  'w',
81  new_num_virt,
82  sym,
83  new_dist.pad_edge_len,
84  new_dist.padding,
85  new_dist.phase,
86  new_dist.phys_phase,
87  new_dist.virt_phase,
88  virt_phase_rank,
89  new_dist.pe_lda,
90  pairs,
91  tsr_new_data,
92  ord_glb_comm,
93  sr,
94  false,
95  0,
96  NULL,
97  aignrd,
98  ignrd);
99 
100  *tsr_cyclic_data = tsr_new_data;
101 
102  cdealloc(old_virt_phase_rank);
103  if (pairs != NULL)
104  cdealloc(pairs);
105  cdealloc(virt_phase_rank);
106  cdealloc(sub_edge_len);
108  }
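The routine above reads the local data into global (key, value) pairs, sizes and zero-initializes a buffer for the new padded layout, and then writes the pairs back through wr_pairs_layout. The buffer length swp_nval is the number of virtual blocks times the packed size of one block. Below is a standalone sketch of that size computation, specialized to a nonsymmetric order-2 tensor with illustrative values (not CTF code; sy_packed_size reduces to a plain product when no symmetry is present).

#include <cstdint>
#include <cstdio>

// Standalone sketch (not part of redist.cxx): the per-process buffer size
// that padded_reshuffle allocates for the new layout, specialized to a
// nonsymmetric order-2 tensor so the packed size is just a product.
int main(){
  int pad_edge_len[2] = {8, 8};  // padded global edge lengths (illustrative)
  int phase[2]        = {4, 2};  // total cyclic phase per mode (illustrative)
  int num_virt        = 2;       // virtual blocks per process (illustrative)
  int64_t blk = 1;
  for (int i = 0; i < 2; i++)
    blk *= pad_edge_len[i] / phase[i];  // sub_edge_len[i] of one virtual block
  int64_t swp_nval = (int64_t)num_virt * blk;  // local buffer length in elements
  printf("per-process buffer: %lld elements\n", (long long)swp_nval);
  return 0;
}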
109 
110 
111  int ** compute_bucket_offsets(distribution const & old_dist,
112  distribution const & new_dist,
113  int const * len,
114  int const * old_phys_edge_len,
115  int const * old_virt_lda,
116  int const * old_offsets,
117  int * const * old_permutation,
118  int const * new_phys_edge_len,
119  int const * new_virt_lda,
120  int forward,
121  int old_virt_np,
122  int new_virt_np,
123  int const * old_virt_edge_len){
125 
126  int **bucket_offset; alloc_ptr(sizeof(int*)*old_dist.order, (void**)&bucket_offset);
127 
128  for (int dim = 0;dim < old_dist.order;dim++){
129  alloc_ptr(sizeof(int)*old_phys_edge_len[dim], (void**)&bucket_offset[dim]);
130  int pidx = 0;
131  for (int vidx = 0;vidx < old_virt_edge_len[dim];vidx++){
132  for (int vr = 0;vr < old_dist.virt_phase[dim];vr++,pidx++){
133  int64_t _gidx = (int64_t)vidx*old_dist.phase[dim]+old_dist.perank[dim]+(int64_t)vr*old_dist.phys_phase[dim];
134  int64_t gidx;
135  if (_gidx > len[dim] || (old_offsets != NULL && _gidx < old_offsets[dim])){
136  gidx = -1;
137 // printf("_gidx=%ld, len[%d]=%d, vidx=%d, vr=%d, old_phase=%d, old_perank=%d, old_virt_phase=%d\n",_gidx,dim,len[dim],vidx,vr,old_dist.phase[dim], old_dist.perank[dim],old_dist.virt_phase[dim]);
138  } else {
139  if (old_permutation == NULL || old_permutation[dim] == NULL){
140  gidx = _gidx;
141  } else {
142  gidx = old_permutation[dim][_gidx];
143  }
144  }
145  if (gidx != -1){
146  int phys_rank = gidx%new_dist.phys_phase[dim];
147  if (forward){
148  bucket_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
149  //printf("f %d - %d %d %d - %d - %d %d %d - %d\n", dim, vr, vidx, pidx, gidx, total_rank,
150  // phys_rank, virt_rank, bucket_offset[dim][pidx]);
151  }
152  else{
153  bucket_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
154  //printf("r %d - %d %d %d - %d - %d %d - %d\n", dim, vr, vidx, pidx, gidx, total_rank,
155  // phys_rank, bucket_offset[dim][pidx]);
156  }
157  } else {
158  bucket_offset[dim][pidx] = -1;
159  }
160  }
161  }
162  }
163 
165 
166  return bucket_offset;
167  }
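Each entry of bucket_offset[dim] encodes the contribution of one mode to an element's destination bucket: rebuild the global index from the old cyclic coordinates (line 133 above), take its physical rank under the new distribution, and scale by the processor-grid stride; padded slots get -1. The following is a standalone sketch of that per-mode arithmetic with illustrative values (not CTF code).

#include <cstdint>
#include <cstdio>
#include <algorithm>

// Standalone sketch (not part of redist.cxx): per-mode bucket offset for one
// local slot (virtual block vr, in-block index vidx) under illustrative
// old/new distribution parameters.
int main(){
  int old_phase = 8, old_phys_phase = 4, old_perank = 2;  // old distribution
  int new_phys_phase = 3, new_pe_lda = 6;                 // new distribution
  int vr = 1, vidx = 2;                                   // local slot
  int64_t gidx = (int64_t)vidx*old_phase + old_perank
               + (int64_t)vr*old_phys_phase;              // global index
  int phys_rank = (int)(gidx % new_phys_phase);           // owner in new layout
  int64_t offset = (int64_t)phys_rank * std::max(1, new_pe_lda);
  printf("slot (vr=%d, vidx=%d) -> gidx=%lld -> bucket offset %lld\n",
         vr, vidx, (long long)gidx, (long long)offset);
  return 0;
}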
168 
169 
170  void calc_cnt_displs(int const * sym,
171  distribution const & old_dist,
172  distribution const & new_dist,
173  int new_nvirt,
174  int np,
175  int const * old_virt_edge_len,
176  int const * new_virt_lda,
177  int64_t * send_counts,
178  int64_t * recv_counts,
179  int64_t * send_displs,
180  int64_t * recv_displs,
181  CommData ord_glb_comm,
182  int idx_lyr,
183  int * const * bucket_offset){
184  int64_t * all_virt_counts;
185 
186  #ifdef USE_OMP
187  int act_omp_ntd;
188  int64_t vbs = sy_packed_size(old_dist.order, old_virt_edge_len, sym);
189  int max_ntd = omp_get_max_threads();
190  max_ntd = MAX(1,MIN(max_ntd,vbs/np));
191  #else
192  int max_ntd = 1;
193  #endif
194 
195  mst_alloc_ptr(np*sizeof(int64_t)*max_ntd, (void**)&all_virt_counts);
196 
197 
198  /* Count how many elements need to go to each new virtual bucket */
199  if (idx_lyr==0){
200  if (old_dist.order == 0){
201  memset(all_virt_counts, 0, np*sizeof(int64_t));
202  all_virt_counts[0]++;
203  } else {
204  #ifdef USE_OMP
205  #pragma omp parallel num_threads(max_ntd)
206  {
207  int imax, act_max, skip;
208  int start_ldim, end_ldim;
209  int i_st, vc, dim;
210  int64_t * virt_counts;
 211  int64_t * old_virt_idx, * virt_rank;
 212  int64_t * idx;
213  int64_t idx_offset;
214  int64_t * idx_offs;
215  int * spad;
216  int last_len = old_dist.pad_edge_len[old_dist.order-1]/old_dist.phase[old_dist.order-1]+1;
217  int ntd, tid;
218  ntd = omp_get_num_threads();
219  tid = omp_get_thread_num();
220 /*
221  int lidx_st[old_dist.order];
222  int lidx_end[old_dist.order];
223  if (old_dist.order > 1){
224  int64_t loc_upsize = packed_size(old_dist.order-1, old_virt_edge_len+1, sym+1);
225  int64_t chnk = loc_upsize/ntd;
226  int64_t loc_idx_st = chnk*tid + MIN(tid,loc_upsize%ntd);
227  int64_t loc_idx_end = loc_idx_st+chnk+(tid<(loc_upsize%ntd));
228  //calculate global indices along each dimension corresponding to partition
229  // printf("loc_idx_st = %ld, loc_idx_end = %ld\n",loc_idx_st,loc_idx_end);
230  calc_idx_arr(old_dist.order-1, len+1, sym+1, loc_idx_st, lidx_st+1);
231  calc_idx_arr(old_dist.order-1, len+1, sym+1, loc_idx_end, lidx_end+1);
232  lidx_st[0] = 0;
233  //FIXME: wrong but evidently not used
 234  lidx_end[0] = 0;
235  } else {
236  //FIXME the below means redistribution of a vector is non-threaded
237  if (tid == 0){
238  lidx_st[0] = 0;
239  lidx_end[0] = ends[0];
240  } else {
241  lidx_st[0] = 0;
242  lidx_end[0] = 0;
243  }
244  }*/
245 
246  virt_counts = all_virt_counts+np*tid;
247  start_ldim = (last_len/ntd)*tid;
248  start_ldim += MIN(tid,last_len%ntd);
249  end_ldim = (last_len/ntd)*(tid+1);
250  end_ldim += MIN(tid+1,last_len%ntd);
251  #else
252  {
253  int imax, act_max, skip;
254  int start_ldim, end_ldim, i_st, vc, dim;
255  int64_t * virt_counts;
256  int64_t * old_virt_idx, * virt_rank;
257  int64_t * idx;
258  int64_t idx_offset;
259  int64_t * idx_offs;
260  int * spad;
261  int last_len = old_dist.pad_edge_len[old_dist.order-1]/old_dist.phase[old_dist.order-1]+1;
262  virt_counts = all_virt_counts;
263  start_ldim = 0;
264  end_ldim = last_len;
265  #endif
266  alloc_ptr(old_dist.order*sizeof(int64_t), (void**)&idx);
267  alloc_ptr(old_dist.order*sizeof(int64_t), (void**)&idx_offs);
268  alloc_ptr(old_dist.order*sizeof(int64_t), (void**)&old_virt_idx);
269  alloc_ptr(old_dist.order*sizeof(int64_t), (void**)&virt_rank);
270  alloc_ptr(old_dist.order*sizeof(int), (void**)&spad);
271  memset(virt_counts, 0, np*sizeof(int64_t));
272  memset(old_virt_idx, 0, old_dist.order*sizeof(int64_t));
273  /* virt_rank = physical_rank + virtual_rank*num_phys_ranks */
274  for (int i=0; i<old_dist.order; i++){
275  virt_rank[i] = old_dist.perank[i];
276  }
277  for (;;){
278  memset(idx, 0, old_dist.order*sizeof(int64_t));
279  memset(idx_offs, 0, old_dist.order*sizeof(int64_t));
280  idx_offset = 0;
281  skip = 0;
282  idx[old_dist.order-1] = MAX(idx[old_dist.order-1],start_ldim);
283  for (dim=0; dim<old_dist.order; dim++) {
284  /* Warning: This next if block has a history of bugs */
285  //spad[dim] = old_dist.padding[dim];
286  if (sym[dim] != NS){
287  ASSERT(old_dist.padding[dim] < old_dist.phase[dim]);
288  spad[dim] = 1;
289  if (sym[dim] != SY && virt_rank[dim] < virt_rank[dim+1])
290  spad[dim]--;
291  if (sym[dim] == SY && virt_rank[dim] <= virt_rank[dim+1])
292  spad[dim]--;
293  }
294  if (sym[dim] != NS && idx[dim] >= idx[dim+1]-spad[dim]){
295  idx[dim+1] = idx[dim]+spad[dim];
296  // if (virt_rank[sym[dim]] + (sym[dim]==SY) <= virt_rank[dim])
297  // idx[sym[dim]]++;
298  }
299  if (dim > 0){
300  imax = (old_dist.pad_edge_len[dim]-old_dist.padding[dim])/old_dist.phase[dim];
301  if (virt_rank[dim] < (old_dist.pad_edge_len[dim]-old_dist.padding[dim])%old_dist.phase[dim])
302  imax++;
303  if (dim == old_dist.order - 1)
304  imax = MIN(imax, end_ldim);
305  if (idx[dim] >= imax)
306  skip = 1;
307  else {
308  idx_offs[dim] = bucket_offset[dim][old_virt_idx[dim]+idx[dim]*old_dist.virt_phase[dim]];
309  idx_offset += idx_offs[dim];
310  }
311  }
312  }
313  /* determine how many elements belong to each processor */
314  /* (INNER LOOP) */
315  if (!skip){
316  for (;;){
317  imax = (old_dist.pad_edge_len[0]-old_dist.padding[0])/old_dist.phase[0];
318  if (virt_rank[0] < (old_dist.pad_edge_len[0]-old_dist.padding[0])%old_dist.phase[0])
319  imax++;
320  if (sym[0] != NS) {
321  imax = MIN(imax,idx[1]+1-spad[0]);
322  }
323  if (old_dist.order == 1){
324  imax = MIN(imax, end_ldim);
325  i_st = start_ldim;
326  } else
327  i_st = 0;
328 
329  /* Increment virtual bucket */
330  for (int i=i_st; i<imax; i++){
331  vc = bucket_offset[0][old_virt_idx[0]+i*old_dist.virt_phase[0]];
332  virt_counts[idx_offset+vc]++;
333  }
334  /* Increment indices and set up offsets */
335  for (dim=1; dim < old_dist.order; dim++){
336  idx[dim]++;
337  act_max = (old_dist.pad_edge_len[dim]-old_dist.padding[dim])/old_dist.phase[dim];
338  if (virt_rank[dim] <
339  (old_dist.pad_edge_len[dim]-old_dist.padding[dim])%old_dist.phase[dim])
340  act_max++;
341  if (dim == old_dist.order - 1)
342  act_max = MIN(act_max, end_ldim);
343  if (sym[dim] != NS)
344  act_max = MIN(act_max,idx[dim+1]+1-spad[dim]);
345  bool ended = true;
346  if (idx[dim] >= act_max){
347  ended = false;
348  idx[dim] = 0;
349  if (sym[dim-1] != NS) idx[dim] = idx[dim-1]+spad[dim-1];
350  }
351  idx_offset -= idx_offs[dim];
352  idx_offs[dim] = bucket_offset[dim][old_virt_idx[dim]+idx[dim]*old_dist.virt_phase[dim]];
353  idx_offset += idx_offs[dim];
354  if (ended)
355  break;
356  }
357  if (dim == old_dist.order) break;
358  }
359  }
360  /* (OUTER LOOP) Iterate over virtual ranks on this pe */
361  for (dim=0; dim<old_dist.order; dim++){
362  old_virt_idx[dim]++;
363  if (old_virt_idx[dim] >= old_dist.virt_phase[dim])
364  old_virt_idx[dim] = 0;
365 
366  virt_rank[dim] = old_dist.perank[dim]+old_virt_idx[dim]*old_dist.phys_phase[dim];
367 
368  if (old_virt_idx[dim] > 0)
369  break;
370  }
371  if (dim == old_dist.order) break;
372  }
373  cdealloc(idx);
374  cdealloc(idx_offs);
375  cdealloc(old_virt_idx);
376  cdealloc(virt_rank);
377  cdealloc(spad);
378 #ifdef USE_OMP
379 #pragma omp master
380  {
381  act_omp_ntd = ntd;
382  }
383 #endif
384  }
385  #ifdef USE_OMP
386  for (int j=1; j<act_omp_ntd; j++){
387  for (int64_t i=0; i<np; i++){
388  all_virt_counts[i] += all_virt_counts[i+np*j];
389  }
390  }
391  #endif
392  }
393  } else
394  memset(all_virt_counts, 0, sizeof(int64_t)*np);
395 
396  //int tmp1, tmp2;
397  //int64_t * virt_counts = all_virt_counts;
398 
399  memcpy(send_counts, all_virt_counts, np*sizeof(int64_t));
400 // memset(send_counts, 0, np*sizeof(int64_t));
401 
402  /* Calculate All-to-all processor grid offsets from the virtual bucket counts */
403 /* if (old_dist.order == 0){
404  // send_counts[0] = virt_counts[0];
405  memset(svirt_displs, 0, np*sizeof(int64_t));
406  } else {
407  for (int i=0; i<np; i++){
408  for (int j=0; j<new_nvirt; j++){
409  //printf("virt_count[%d][%d]=%d\n",i,j,virt_counts[i*new_nvirt+j]);
410  send_counts[i] += virt_counts[i*new_nvirt+j];
411  svirt_displs[i*new_nvirt+j] = virt_counts[i*new_nvirt+j];
412  }
413  }
414  }*/
415 
416  /* Exchange counts */
417  MPI_Alltoall(send_counts, 1, MPI_INT64_T,
418  recv_counts, 1, MPI_INT64_T, ord_glb_comm.cm);
419 
420  /* Calculate displacements out of the count arrays */
421  send_displs[0] = 0;
422  recv_displs[0] = 0;
423  for (int i=1; i<np; i++){
424  send_displs[i] = send_displs[i-1] + send_counts[i-1];
425  recv_displs[i] = recv_displs[i-1] + recv_counts[i-1];
426  }
427 
428  /* Calculate displacements for virt buckets in each message */
429 /* for (int i=0; i<np; i++){
430  tmp2 = svirt_displs[i*new_nvirt];
431  svirt_displs[i*new_nvirt] = 0;
432  for (int j=1; j<new_nvirt; j++){
433  tmp1 = svirt_displs[i*new_nvirt+j];
434  svirt_displs[i*new_nvirt+j] = svirt_displs[i*new_nvirt+j-1]+tmp2;
435  tmp2 = tmp1;
436  }
437  }*/
438 
439  /* Exchange displacements for virt buckets */
440 /* MPI_Alltoall(svirt_displs, new_nvirt, MPI_INT64_T,
441  rvirt_displs, new_nvirt, MPI_INT64_T, ord_glb_comm.cm);*/
442 
443  cdealloc(all_virt_counts);
444  }
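The tail of calc_cnt_displs is a standard counts-to-displacements pattern: exchange per-destination counts with MPI_Alltoall, then form the send and receive displacements as exclusive prefix sums of the counts. Below is a minimal standalone restatement over plain arrays (a sketch under the same MPI assumptions, not the CTF routine itself).

#include <mpi.h>
#include <cstdint>

// Standalone sketch (not part of redist.cxx): the count-exchange and
// prefix-sum step at the end of calc_cnt_displs. np is the number of
// processes in comm; all four arrays have length np.
void counts_to_displs(int64_t *send_counts, int64_t *recv_counts,
                      int64_t *send_displs, int64_t *recv_displs,
                      int np, MPI_Comm comm){
  // every process tells every other process how many elements it will send
  MPI_Alltoall(send_counts, 1, MPI_INT64_T,
               recv_counts, 1, MPI_INT64_T, comm);
  // displacements are the exclusive prefix sums of the counts
  send_displs[0] = 0;
  recv_displs[0] = 0;
  for (int i = 1; i < np; i++){
    send_displs[i] = send_displs[i-1] + send_counts[i-1];
    recv_displs[i] = recv_displs[i-1] + recv_counts[i-1];
  }
}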
445 
446  //static double init_mdl[] = {COST_LATENCY, COST_LATENCY, COST_NETWBW};
447  LinModel<2> blres_mdl(blres_mdl_init,"blres_mdl");
448 
449  double blres_est_time(int64_t tot_sz, int nv0, int nv1){
450  double ps[] = {(double)nv0+nv1, (double)tot_sz};
451  return blres_mdl.est_time(ps);
452  }
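Because blres_mdl is a two-parameter linear model, the estimate returned above is effectively a fit of the form time ≈ c0*(nv0 + nv1) + c1*tot_sz, i.e. a per-block (message) term plus a data-volume term, with c0 and c1 learned from the observations recorded at the end of block_reshuffle; this reading follows from the LinModel interface rather than anything stated in this file.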
453 
454  void block_reshuffle(distribution const & old_dist,
455  distribution const & new_dist,
456  char * tsr_data,
457  char *& tsr_cyclic_data,
458  algstrct const * sr,
459  CommData glb_comm){
460 
461  int i, idx_lyr_new, idx_lyr_old, rem_idx, prc_idx, loc_idx;
462  int num_old_virt, num_new_virt;
463  int * idx, * old_loc_lda, * new_loc_lda, * phase_lda;
464  int64_t blk_sz;
465  MPI_Request * reqs;
466  int * phase = old_dist.phase;
467  int order = old_dist.order;
468 
469 
470  if (order == 0){
471  tsr_cyclic_data = sr->alloc(new_dist.size);
472 
473  if (glb_comm.rank == 0){
474  sr->copy(tsr_cyclic_data, tsr_data);
475  } else {
476  sr->copy(tsr_cyclic_data, sr->addid());
477  }
478  return;
479  }
480 
482 #ifdef TUNE
483  MPI_Barrier(glb_comm.cm);
484  double st_time = MPI_Wtime();
485 #endif
486 
487  tsr_cyclic_data = sr->alloc(new_dist.size);
488  alloc_ptr(sizeof(int)*order, (void**)&idx);
489  alloc_ptr(sizeof(int)*order, (void**)&old_loc_lda);
490  alloc_ptr(sizeof(int)*order, (void**)&new_loc_lda);
491  alloc_ptr(sizeof(int)*order, (void**)&phase_lda);
492 
493  blk_sz = old_dist.size;
494  old_loc_lda[0] = 1;
495  new_loc_lda[0] = 1;
496  phase_lda[0] = 1;
497  num_old_virt = 1;
498  num_new_virt = 1;
499  idx_lyr_old = glb_comm.rank;
500  idx_lyr_new = glb_comm.rank;
501 
502  for (i=0; i<order; i++){
503  num_old_virt *= old_dist.virt_phase[i];
504  num_new_virt *= new_dist.virt_phase[i];
505  blk_sz = blk_sz/old_dist.virt_phase[i];
506  idx_lyr_old -= old_dist.perank[i]*old_dist.pe_lda[i];
507  idx_lyr_new -= new_dist.perank[i]*new_dist.pe_lda[i];
508  if (i>0){
509  old_loc_lda[i] = old_loc_lda[i-1]*old_dist.virt_phase[i-1];
510  new_loc_lda[i] = new_loc_lda[i-1]*new_dist.virt_phase[i-1];
511  phase_lda[i] = phase_lda[i-1]*phase[i-1];
512  }
513  }
514 
515 #ifdef TUNE
516  double * tps = (double*)malloc(3*sizeof(double));
517  tps[0] = 0;
518  tps[1] = (double)num_old_virt+num_new_virt;
 519  tps[2] = (double)std::max(old_dist.size, new_dist.size);
520 
521  if (!(blres_mdl.should_observe(tps))){
522  cdealloc(idx);
523  cdealloc(old_loc_lda);
524  cdealloc(new_loc_lda);
525  cdealloc(phase_lda);
 526  free(tps); return;
527  }
528  free(tps);
529 #endif
530  alloc_ptr(sizeof(MPI_Request)*(num_old_virt+num_new_virt), (void**)&reqs);
531 
532  if (idx_lyr_new == 0){
533  memset(idx, 0, sizeof(int)*order);
534 
535  for (;;){
536  loc_idx = 0;
537  rem_idx = 0;
538  prc_idx = 0;
539  for (i=0; i<order; i++){
540  loc_idx += idx[i]*new_loc_lda[i];
541  rem_idx += ((idx[i]*new_dist.phys_phase[i] + new_dist.perank[i])/old_dist.phys_phase[i])*old_loc_lda[i];
542  prc_idx += ((idx[i]*new_dist.phys_phase[i] + new_dist.perank[i])%old_dist.phys_phase[i])*old_dist.pe_lda[i];
543  }
544  DPRINTF(3,"proc %d receiving blk %d (loc %d, size %ld) from proc %d\n",
545  glb_comm.rank, rem_idx, loc_idx, blk_sz, prc_idx);
546  MPI_Irecv(tsr_cyclic_data+sr->el_size*loc_idx*blk_sz, blk_sz,
547  sr->mdtype(), prc_idx, rem_idx, glb_comm.cm, reqs+loc_idx);
548  for (i=0; i<order; i++){
549  idx[i]++;
550  if (idx[i] >= new_dist.virt_phase[i])
551  idx[i] = 0;
552  else
553  break;
554  }
555  if (i==order) break;
556  }
557  }
558 
559  if (idx_lyr_old == 0){
560  memset(idx, 0, sizeof(int)*order);
561 
562  for (;;){
563  loc_idx = 0;
564  prc_idx = 0;
565  for (i=0; i<order; i++){
566  loc_idx += idx[i]*old_loc_lda[i];
567  prc_idx += ((idx[i]*old_dist.phys_phase[i] + old_dist.perank[i])%new_dist.phys_phase[i])*new_dist.pe_lda[i];
568  }
569  DPRINTF(3,"proc %d sending blk %d (loc %d size %ld) to proc %d el_size = %d %p %p\n",
570  glb_comm.rank, loc_idx, loc_idx, blk_sz, prc_idx, sr->el_size, tsr_data, reqs+num_new_virt+loc_idx);
571  MPI_Isend(tsr_data+sr->el_size*loc_idx*blk_sz, blk_sz,
572  sr->mdtype(), prc_idx, loc_idx, glb_comm.cm, reqs+num_new_virt+loc_idx);
573  for (i=0; i<order; i++){
574  idx[i]++;
575  if (idx[i] >= old_dist.virt_phase[i])
576  idx[i] = 0;
577  else
578  break;
579  }
580  if (i==order) break;
581  }
582  }
583 
584  if (idx_lyr_new == 0 && idx_lyr_old == 0){
585  MPI_Waitall(num_new_virt+num_old_virt, reqs, MPI_STATUSES_IGNORE);
586  } else if (idx_lyr_new == 0){
587  MPI_Waitall(num_new_virt, reqs, MPI_STATUSES_IGNORE);
588  } else if (idx_lyr_old == 0){
589  MPI_Waitall(num_old_virt, reqs+num_new_virt, MPI_STATUSES_IGNORE);
590  if (sr->addid() != NULL)
591  sr->set(tsr_cyclic_data, sr->addid(), new_dist.size);
592  } else {
593  if (sr->addid() != NULL)
594  sr->set(tsr_cyclic_data, sr->addid(), new_dist.size);
595  }
596 
597  cdealloc(idx);
598  cdealloc(old_loc_lda);
599  cdealloc(new_loc_lda);
600  cdealloc(phase_lda);
601  cdealloc(reqs);
602 
603 #ifdef TUNE
604  MPI_Barrier(glb_comm.cm);
605  double exe_time = MPI_Wtime()-st_time;
606  tps = (double*)malloc(3*sizeof(double));
607  tps[0] = exe_time;
608  tps[1] = (double)num_old_virt+num_new_virt;
 609  tps[2] = (double)std::max(old_dist.size, new_dist.size);
610  blres_mdl.observe(tps);
611  free(tps);
612 #endif
613 
615 
616  }
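block_reshuffle works because both layouts share the same total phase along every mode, so whole virtual blocks move intact: per mode, the receiver derives the source processor coordinate and the source's virtual-block coordinate from the first global index of the block it needs (lines 541-542 above). Below is a standalone sketch of that per-mode computation with illustrative values (not CTF code).

#include <cstdio>

// Standalone sketch (not part of redist.cxx): locate, for one mode, the old
// virtual block and owner that hold the data a receiving block needs.
int main(){
  int idx            = 1;  // receiver's virtual-block coordinate (new layout)
  int new_phys_phase = 3;  // physical phase along this mode, new layout
  int new_perank     = 2;  // receiver's processor coordinate, new layout
  int old_phys_phase = 2;  // physical phase along this mode, old layout
  int g = idx*new_phys_phase + new_perank;  // first global index in the block
  int src_vblock = g / old_phys_phase;      // virtual-block coord at the source
  int src_proc   = g % old_phys_phase;      // source processor coordinate
  printf("new block %d on proc coord %d <- old block %d on proc coord %d\n",
         idx, new_perank, src_vblock, src_proc);
  return 0;
}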
617 
618  int can_block_reshuffle(int order,
619  int const * old_phase,
620  mapping const * map){
621  int new_phase, j;
622  int can_block_resh = 1;
623  for (j=0; j<order; j++){
624  new_phase = map[j].calc_phase();
625  if (new_phase != old_phase[j]) can_block_resh = 0;
626  }
627  return can_block_resh;
628  }
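The check above is the precondition for block_reshuffle: every mode must keep its overall phase under the new mapping. Below is a standalone restatement of the same test over two explicit phase arrays (a sketch, not code from this file), which a caller could use to decide between block_reshuffle and the pair-based padded_reshuffle.

// Standalone sketch (not part of redist.cxx): phase-preservation test that
// mirrors can_block_reshuffle, written over two explicit phase arrays.
static bool phases_match(int order, int const *old_phase, int const *new_phase){
  for (int j = 0; j < order; j++)
    if (new_phase[j] != old_phase[j])
      return false;
  return true;
}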
629 
630 
631 }