Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
glb_cyclic_reshuffle.cxx
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "glb_cyclic_reshuffle.h"
4 #include "../shared/util.h"
5 
6 
7 namespace CTF_int {
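  // glb_ord_pup packs (forward != 0) or unpacks (forward == 0) the locally owned
  // elements of a cyclically distributed tensor into/from per-destination buckets
  // new_data[0..total_np-1], traversing the local data in global index order so
  // that each bucket is filled in the order its receiving process expects.
  // When unpacking, data is either copied or accumulated as
  // old = beta*old + alpha*new, depending on whether (alpha,beta) is (1,0).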
8  void glb_ord_pup(int const * sym,
9  distribution const & old_dist,
10  distribution const & new_dist,
11  int const * len,
12  int const * old_phys_dim,
13  int const * old_phys_edge_len,
14  int const * old_virt_edge_len,
15  int64_t old_virt_nelem,
16  int const * old_offsets,
17  int * const * old_permutation,
18  int total_np,
19  int const * new_phys_dim,
20  int const * new_phys_edge_len,
21  int const * new_virt_edge_len,
22  int64_t new_virt_nelem,
23  char * old_data,
24  char ** new_data,
25  int forward,
26  int * const * bucket_offset,
27  char const * alpha,
28  char const * beta,
29  algstrct const * sr){
30  bool is_copy = false;
31  if (sr->isequal(sr->mulid(), alpha) && sr->isequal(sr->addid(), beta)) is_copy = true;
32  if (old_dist.order == 0){
33  if (forward)
34  sr->copy(new_data[0], old_data);
35  else {
36  if (is_copy)
37  sr->copy(old_data, new_data[0]);
38  else
39  sr->acc(old_data, beta, new_data[0], alpha);
40  }
41  return;
42  }
43 
44  int old_virt_np = 1;
45  for (int dim = 0;dim < old_dist.order;dim++) old_virt_np *= old_dist.virt_phase[dim];
46 
47  int new_virt_np = 1;
48  for (int dim = 0;dim < old_dist.order;dim++) new_virt_np *= new_dist.virt_phase[dim];
49 
50  int nbucket = total_np; //*(forward ? new_virt_np : old_virt_np);
51 
52  #if DEBUG >= 1
53  int rank;
54  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
55  #endif
56 
57 #ifdef USE_OMP
58 // ASSERT(0);
59  // assert(0);
60 #endif
61  TAU_FSTART(cyclic_pup_bucket);
62  #ifdef USSSE_OMP
63  int max_ntd = omp_get_max_threads();
64  max_ntd = MAX(1,MIN(max_ntd,new_virt_nelem/nbucket));
65 
66  int64_t old_size, new_size;
67  old_size = sy_packed_size(old_dist.order, old_virt_edge_len, sym)*old_virt_np;
68  new_size = sy_packed_size(old_dist.order, new_virt_edge_len, sym)*new_virt_np;
69  /*if (forward){
70  } else {
71  old_size = sy_packed_size(old_dist.order, old_virt_edge_len, sym)*new_virt_np;
72  new_size = sy_packed_size(old_dist.order, new_virt_edge_len, sym)*old_virt_np;
73  }*/
74  /*printf("old_size=%d, new_size=%d,old_virt_np=%d,new_virt_np=%d\n",
75  old_size,new_size,old_virt_np,new_virt_np);
76  */
77  int64_t * bucket_store;
78  int64_t * count_store;
79  int64_t * thread_store;
80  mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&bucket_store);
81  mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&count_store);
82  mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&thread_store);
83  std::fill(bucket_store, bucket_store+MAX(old_size,new_size), -1);
84 
85  int64_t ** par_virt_counts;
86  alloc_ptr(sizeof(int64_t*)*max_ntd, (void**)&par_virt_counts);
87  for (int t=0; t<max_ntd; t++){
88  mst_alloc_ptr(sizeof(int64_t)*nbucket, (void**)&par_virt_counts[t]);
89  std::fill(par_virt_counts[t], par_virt_counts[t]+nbucket, 0);
90  }
91  #pragma omp parallel num_threads(max_ntd)
92  {
93  #endif
94 
95  int *offs; alloc_ptr(sizeof(int)*old_dist.order, (void**)&offs);
96  if (old_offsets == NULL)
97  for (int dim = 0;dim < old_dist.order;dim++) offs[dim] = 0;
98  else
99  for (int dim = 0;dim < old_dist.order;dim++) offs[dim] = old_offsets[dim];
100 
101  int *ends; alloc_ptr(sizeof(int)*old_dist.order, (void**)&ends);
102  for (int dim = 0;dim < old_dist.order;dim++) ends[dim] = len[dim];
103 
104  #ifdef USSSE_OMP
105  int tid = omp_get_thread_num();
106  int ntd = omp_get_num_threads();
107  //partition the global tensor among threads, to preserve
108  //global ordering and load balance in partitioning
109  int gidx_st[old_dist.order];
110  int gidx_end[old_dist.order];
111  if (old_dist.order > 1){
112  int64_t all_size = packed_size(old_dist.order, len, sym);
113  int64_t chnk = all_size/ntd;
114  int64_t glb_idx_st = chnk*tid + MIN(tid,all_size%ntd);
115  int64_t glb_idx_end = glb_idx_st+chnk+(tid<(all_size%ntd));
116  //calculate global indices along each dimension corresponding to partition
117 // printf("glb_idx_st = %ld, glb_idx_end = %ld\n",glb_idx_st,glb_idx_end);
118  calc_idx_arr(old_dist.order, len, sym, glb_idx_st, gidx_st);
119  calc_idx_arr(old_dist.order, len, sym, glb_idx_end, gidx_end);
120  gidx_st[0] = 0;
121  //FIXME: wrong but evidently not used
122  gidx_end[0] = 0;
123  #if DEBUG >= 1
124  if (ntd == 1){
125  if (gidx_end[old_dist.order-1] != len[old_dist.order-1]){
126  for (int dim=0; dim<old_dist.order; dim++){
127  printf("glb_idx_end = %ld, gidx_end[%d]= %d, len[%d] = %d\n",
128  glb_idx_end, dim, gidx_end[dim], dim, len[dim]);
129  }
130  ABORT;
131  }
132  ASSERT(gidx_end[old_dist.order-1] <= ends[old_dist.order-1]);
133  }
134  #endif
135  } else {
136  //FIXME the below means redistribution of a vector is non-threaded
137  if (tid == 0){
138  gidx_st[0] = 0;
139  gidx_end[0] = ends[0];
140  } else {
141  gidx_st[0] = 0;
142  gidx_end[0] = 0;
143  }
144 
145  }
146  //clip global indices to my physical cyclic phase (local tensor data)
147 
148  #endif
149  // FIXME: may be better to mst_alloc, but this should ensure the
150  // compiler knows there are no write conflicts
151  #ifdef USSSE_OMP
152  int64_t * count = par_virt_counts[tid];
153  #else
154  int64_t *count; alloc_ptr(sizeof(int64_t)*nbucket, (void**)&count);
155  memset(count, 0, sizeof(int64_t)*nbucket);
156  #endif
157 
158  int *gidx; alloc_ptr(sizeof(int)*old_dist.order, (void**)&gidx);
159  memset(gidx, 0, sizeof(int)*old_dist.order);
160  for (int dim = 0;dim < old_dist.order;dim++){
161  gidx[dim] = old_dist.perank[dim];
162  }
163 
164  int64_t *virt_offset; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&virt_offset);
165  memset(virt_offset, 0, sizeof(int64_t)*old_dist.order);
166 
167  int *idx; alloc_ptr(sizeof(int)*old_dist.order, (void**)&idx);
168  memset(idx, 0, sizeof(int)*old_dist.order);
169 
170  int64_t *virt_acc; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&virt_acc);
171  memset(virt_acc, 0, sizeof(int64_t)*old_dist.order);
172 
173  int64_t *idx_acc; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&idx_acc);
174  memset(idx_acc, 0, sizeof(int64_t)*old_dist.order);
175 
176  int64_t *old_virt_lda; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&old_virt_lda);
177  old_virt_lda[0] = old_virt_nelem;
178  for (int dim=1; dim<old_dist.order; dim++){
179  old_virt_lda[dim] = old_virt_lda[dim-1]*old_dist.virt_phase[dim-1];
180  }
181 
182  int64_t offset = 0;
183 
184  int64_t zero_len_toff = 0;
185 
186  #ifdef USSSE_OMP
187  for (int dim=old_dist.order-1; dim>=0; dim--){
188  int64_t iist = MAX(0,(gidx_st[dim]-old_dist.perank[dim]));
189  int64_t ist = iist/old_dist.phase[dim];//(old_phys_dim[dim]*old_dist.virt_phase[dim]);
190  if (sym[dim] != NS) ist = MIN(ist,idx[dim+1]);
191  int plen[old_dist.order];
192  memcpy(plen,old_virt_edge_len,old_dist.order*sizeof(int));
193  int idim = dim;
194  do {
195  plen[idim] = ist;
196  idim--;
197  } while (idim >= 0 && sym[idim] != NS);
198  gidx[dim] += ist*old_dist.phase[dim];//old_phys_dim[dim]*old_dist.virt_phase[dim];
199  idx[dim] = ist;
200  idx_acc[dim] = sy_packed_size(dim+1, plen, sym);
201  offset += idx_acc[dim];
202 
203  ASSERT(ist == 0 || gidx[dim] <= gidx_st[dim]);
204  // ASSERT(ist < old_virt_edge_len[dim]);
205 
206  if (gidx[dim] > gidx_st[dim]) break;
207 
208  int64_t vst = iist-ist*old_dist.phase[dim];//*old_phys_dim[dim]*old_dist.virt_phase[dim];
209  if (vst > 0 ){
210  vst = MIN(old_dist.virt_phase[dim]-1,vst);
211  gidx[dim] += vst*old_dist.phys_phase[dim];
212  virt_offset[dim] = vst;
213  offset += vst*old_virt_lda[dim];
214  } else vst = 0;
215  if (gidx[dim] > gidx_st[dim]) break;
216  }
217  #endif
218 
219  ASSERT(old_permutation == NULL);
220  int rep_phase0 = lcm(old_phys_dim[0],new_phys_dim[0])/old_phys_dim[0];
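  // rep_phase0 is (roughly) the period, in local index along dimension 0, with
  // which the destination-bucket pattern repeats: the target process along dim 0
  // depends on the global index modulo new_phys_dim[0], and successive local
  // elements advance the global index by old_phys_dim[0], so the pattern repeats
  // every lcm(old_phys_dim[0],new_phys_dim[0])/old_phys_dim[0] elements
  // (e.g. old_phys_dim[0]=4 and new_phys_dim[0]=6 give rep_phase0=3).
  // The DEBUG block below checks this periodicity of bucket_offset[0].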
221  #if DEBUG >=2
222  if (rank == 0)
223  printf("rep_phase0 = %d\n",rep_phase0);
224  for (int id=0; id<rep_phase0; id++){
225  for (int jd=0; jd<(old_phys_edge_len[0]-id)/rep_phase0; jd++){
226  printf("bucket_offset[%d] = %d\n",id+jd*rep_phase0,bucket_offset[0][id+jd*rep_phase0]);
227  ASSERT(bucket_offset[0][id+jd*rep_phase0] == bucket_offset[0][id] || bucket_offset[0][id+jd*rep_phase0] == -1);
228  }
229  }
230  #endif
231 
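  // Main bucketing loop: for each line of elements along dimension 0, accumulate
  // the base bucket index bucket0 from the outer dimensions, decide whether the
  // line lies inside the (symmetry- and offset-restricted) global index range,
  // and then copy its elements to/from the destination buckets with stride
  // rep_phase0 on the old_data side.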
232  bool done = false;
233  for (;!done;){
234  int64_t bucket0 = 0;
235  bool outside0 = false;
236  int len_zero_max = ends[0];
237  #ifdef USSSE_OMP
238  bool is_at_end = true;
239  bool is_at_start = true;
240  for (int dim = old_dist.order-1;dim >0;dim--){
241  if (gidx[dim] > gidx_st[dim]){
242  is_at_start = false;
243  break;
244  }
245  if (gidx[dim] < gidx_st[dim]){
246  outside0 = true;
247  break;
248  }
249  }
250  if (is_at_start){
251  zero_len_toff = gidx_st[0];
252  }
253  for (int dim = old_dist.order-1;dim >0;dim--){
254  if (gidx_end[dim] < gidx[dim]){
255  outside0 = true;
256  done = true;
257  break;
258  }
259  if (gidx_end[dim] > gidx[dim]){
260  is_at_end = false;
261  break;
262  }
263  }
264  if (is_at_end){
265  len_zero_max = MIN(ends[0],gidx_end[0]);
266  done = true;
267  }
268  #endif
269 
270  if (!outside0){
271  for (int dim = 1;dim < old_dist.order;dim++){
272  if (bucket_offset[dim][virt_offset[dim]+idx[dim]*old_dist.virt_phase[dim]] == -1) outside0 = true;
273  bucket0 += bucket_offset[dim][virt_offset[dim]+idx[dim]*old_dist.virt_phase[dim]];
274  }
275  }
276 
277  if (!outside0){
278  for (int dim = 1;dim < old_dist.order;dim++){
279  if (gidx[dim] >= (sym[dim] == NS ? ends[dim] :
280  (sym[dim] == SY ? gidx[dim+1]+1 :
281  gidx[dim+1])) ||
282  gidx[dim] < offs[dim]){
283  outside0 = true;
284  break;
285  }
286  }
287  }
288 
289  int idx_max = (sym[0] == NS ? old_virt_edge_len[0] : idx[1]+1);
290 
291  if (!outside0){
292  int gidx_min = MAX(zero_len_toff,offs[0]);
293  int gidx_max = (sym[0] == NS ? ends[0] : (sym[0] == SY ? gidx[1]+1 : gidx[1]));
294  gidx_max = MIN(gidx_max, len_zero_max);
295 
296  int idx0 = MAX(0,(gidx_min-gidx[0])/old_phys_dim[0]);
297  //int vidx0 = idx0%old_dist.virt_phase[0];
298  int idx1 = MAX(0,(gidx_max-gidx[0]+old_phys_dim[0]-1)/old_phys_dim[0]);
299  int lencp = MIN(rep_phase0,idx1-idx0);
300  ASSERT(is_copy);
301  if (forward){
302  for (int ia=0; ia<lencp; ia++){
303  int64_t bucket = bucket0+bucket_offset[0][idx0];//((vidx0+ia)%old_dist.virt_phase[0])*old_virt_edge_len[0]+idx0/old_dist.virt_phase[0]];
304  sr->copy((idx1-idx0+rep_phase0-1)/rep_phase0,
305  old_data+ sr->el_size*(offset+idx0), rep_phase0,
306  new_data[bucket]+sr->el_size*count[bucket], 1);
307  count[bucket]+=(idx1-idx0+rep_phase0-1)/rep_phase0;
308 #if DEBUG>=1
309  // printf("[%d] gidx[0]=%d,gidx_min=%d,idx0=%d,gidx_max=%d,idx1=%d,bucket=%d,len=%d\n",rank,gidx[0],gidx_min,idx0,gidx_max,idx1,bucket,(idx1-idx0+rep_phase0-1)/rep_phase0);
310 #endif
311  idx0++;
312  }
313  } else {
314  for (int ia=0; ia<lencp; ia++){
315  int64_t bucket = bucket0+bucket_offset[0][idx0];//((vidx0+ia)%old_dist.virt_phase[0])*old_virt_edge_len[0]+idx0/old_dist.virt_phase[0]];
316  sr->copy((idx1-idx0+rep_phase0-1)/rep_phase0,
317  new_data[bucket]+sr->el_size*count[bucket], 1,
318  old_data+ sr->el_size*(offset+idx0), rep_phase0);
319  count[bucket]+=(idx1-idx0+rep_phase0-1)/rep_phase0;
320  //printf("-r- gidx[0]=%d,gidx_min=%d,idx0=%d,gidx_max=%d,idx1=%d,bucket=%d,len=%d\n",gidx[0],gidx_min,idx0,gidx_max,idx1,bucket,(idx1-idx0+rep_phase0-1)/rep_phase0);
321  idx0++;
322  }
323  }
324 /*
325  gidx_max = MIN(gidx_max, len_zero_max);
326  int64_t moffset = offset;
327  for (idx[0] = idx_st;idx[0] < idx_max;idx[0]++){
328  int virt_min = MAX(0,MIN(old_dist.virt_phase[0],gidx_min-gidx[0]));
329  int virt_max = MAX(0,MIN(old_dist.virt_phase[0],gidx_max-gidx[0]));
330 
331  moffset += virt_min;
332  if (forward){
333  ASSERT(is_copy);
334  for (virt_offset[0] = virt_min*old_virt_edge_len[0];
335  virt_offset[0] < virt_max*old_virt_edge_len[0];
336  virt_offset[0] += old_virt_edge_len[0])
337  {
338  int64_t bucket = bucket0+bucket_offset[0][virt_offset[0]+idx[0]];
339  #ifdef USSSE_OMP
340  bucket_store[moffset] = bucket;
341  count_store[moffset] = count[bucket]++;
342  thread_store[moffset] = tid;
343  #else
344 // printf("[%d] bucket = %d offset = %ld\n", rank, bucket, offset);
345  // printf("[%d] count[bucket] = %d, nbucket = %d\n", rank, count[bucket]+1, nbucket);
346  // std::cout << "old_data[offset]=";
347  // sr->print(old_data+ sr->el_size*offset);
348  sr->copy(new_data[bucket]+sr->el_size*(count[bucket]++), old_data+ sr->el_size*moffset);
349 // std::cout << "\nnew_data[bucket][count[bucket]++]=";
350  // sr->print(new_data[bucket]+sr->el_size*(count[bucket]-1));
351  // std::cout << "\n";
352  #endif
353  moffset++;
354  }
355  }
356  else{
357  for (virt_offset[0] = virt_min*old_virt_edge_len[0];
358  virt_offset[0] < virt_max*old_virt_edge_len[0];
359  virt_offset[0] += old_virt_edge_len[0])
360  {
361  int64_t bucket = bucket0+bucket_offset[0][virt_offset[0]+idx[0]];
362  #ifdef USSSE_OMP
363  bucket_store[moffset] = bucket;
364  count_store[moffset] = count[bucket]++;
365  thread_store[moffset] = tid;
366  #else
367  if (is_copy)
368  sr->copy(old_data+sr->el_size*moffset, new_data[bucket]+sr->el_size*(count[bucket]++));
369  else
370  sr->acc( old_data+sr->el_size*moffset, beta, new_data[bucket]+sr->el_size*(count[bucket]++), alpha);
371 // old_data[moffset] = beta*old_data[moffset] + alpha*new_data[bucket][count[bucket]++];
372  #endif
373  moffset ++;
374  }
375  }
376  gidx[0] += old_phys_dim[0]*old_dist.virt_phase[0];
377  }
378  gidx[0] -= idx_max*old_phys_dim[0]*old_dist.virt_phase[0];
379  */
380  }
381  offset += idx_max*old_dist.virt_phase[0];
382 
383  idx[0] = 0;
384 
385  zero_len_toff = 0;
386 
387  /* Adjust outer indices */
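  // The loop below advances dimensions 1..order-1 like an odometer: for each
  // dimension it first steps the virtual-block offset, then the intra-block
  // index, keeping gidx[dim] equal to the global index of the current element,
  // and sets done once the outermost dimension wraps around.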
388  if (!done){
389  for (int dim = 1;dim < old_dist.order;dim++){
390  virt_offset[dim] += 1; //old_virt_edge_len[dim];
391  gidx[dim]+=old_dist.phys_phase[dim];
392 
393  if (virt_offset[dim] == old_dist.virt_phase[dim]){
394  gidx[dim] -= old_dist.phase[dim];
395  virt_offset[dim] = 0;
396 
397  gidx[dim] -= idx[dim]*old_dist.phase[dim];//phys_dim[dim]*old_dist.virt_phase[dim];
398  idx[dim]++;
399 
400  if (idx[dim] == (sym[dim] == NS ? old_virt_edge_len[dim] : idx[dim+1]+1)){
 401  //index should always be zero here since everything is SY and not SH
402  idx[dim] = 0;//(dim == 0 || sym[dim-1] == NS ? 0 : idx[dim-1]);
403  //gidx[dim] += idx[dim]*old_phys_dim[dim]*old_dist.virt_phase[dim];
404 
405  if (dim == old_dist.order-1) done = true;
406  }
407  else{
408  gidx[dim] += idx[dim]*old_dist.phase[dim];//old_phys_dim[dim]*old_dist.virt_phase[dim];
409  break;
410  }
411  }
412  else{
413  idx_acc[dim-1] = 0;
414  break;
415  }
416  }
417  if (old_dist.order <= 1) done = true;
418  }
419  }
420  cdealloc(gidx);
421  cdealloc(idx_acc);
422  cdealloc(virt_acc);
423  cdealloc(idx);
424  cdealloc(virt_offset);
425  cdealloc(old_virt_lda);
426 
427  #ifndef USSSE_OMP
428  #if DEBUG >= 1
429  bool pass = true;
430  for (int i = 0;i < nbucket-1;i++){
431  if (count[i] != (int64_t)((new_data[i+1]-new_data[i])/sr->el_size)){
432  printf("rank = %d count %d should have been %d is %ld\n", rank, i, (int)((new_data[i+1]-new_data[i])/sr->el_size), count[i]);
433  pass = false;
434  }
435  }
436  if (!pass) ABORT;
437  #endif
438  #endif
439  cdealloc(offs);
440  cdealloc(ends);
441 
442  #ifndef USSSE_OMP
443  cdealloc(count);
444  TAU_FSTOP(cyclic_pup_bucket);
445  #else
446  par_virt_counts[tid] = count;
 447  } // end of the omp parallel region
448  for (int bckt=0; bckt<nbucket; bckt++){
449  int par_tmp = 0;
450  for (int thread=0; thread<max_ntd; thread++){
451  par_tmp += par_virt_counts[thread][bckt];
452  par_virt_counts[thread][bckt] = par_tmp - par_virt_counts[thread][bckt];
453  }
454  #if DEBUG >= 1
455  if (bckt < nbucket-1 && par_tmp != (new_data[bckt+1]-new_data[bckt])/sr->el_size){
456  printf("rank = %d count for bucket %d is %d should have been %ld\n",rank,bckt,par_tmp,(int64_t)(new_data[bckt+1]-new_data[bckt])/sr->el_size);
457  ABORT;
458  }
459  #endif
460  }
461  TAU_FSTOP(cyclic_pup_bucket);
462  TAU_FSTART(cyclic_pup_move);
463  {
464  int64_t tot_sz = MAX(old_size, new_size);
465  int64_t i;
466  if (forward){
467  ASSERT(is_copy);
468  #pragma omp parallel for private(i)
469  for (i=0; i<tot_sz; i++){
470  if (bucket_store[i] != -1){
471  int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
472  int64_t ct = count_store[i]+pc;
473  sr->copy(new_data[bucket_store[i]]+ct*sr->el_size, old_data+i*sr->el_size);
474  }
475  }
476  } else {
477  if (is_copy){// alpha == 1.0 && beta == 0.0){
478  #pragma omp parallel for private(i)
479  for (i=0; i<tot_sz; i++){
480  if (bucket_store[i] != -1){
481  int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
482  int64_t ct = count_store[i]+pc;
483  sr->copy(old_data+i*sr->el_size, new_data[bucket_store[i]]+ct*sr->el_size);
484  }
485  }
486  } else {
487  #pragma omp parallel for private(i)
488  for (i=0; i<tot_sz; i++){
489  if (bucket_store[i] != -1){
490  int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
491  int64_t ct = count_store[i]+pc;
492  sr->acc(old_data+i*sr->el_size, beta, new_data[bucket_store[i]]+ct*sr->el_size, alpha);
493  }
494  }
495  }
496  }
497  }
498  TAU_FSTOP(cyclic_pup_move);
499  for (int t=0; t<max_ntd; t++){
500  cdealloc(par_virt_counts[t]);
501  }
502  cdealloc(par_virt_counts);
503  cdealloc(count_store);
504  cdealloc(bucket_store);
505  cdealloc(thread_store);
506  #endif
507 
508  }
509 
510 
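  // sy_packed_offset returns the offset, within an SY-packed layout, contributed
  // by index idx along dimension dim. If sym[dim-1] == NS this is simply
  // idx*sy_packed_size(dim,len,sym); across a chain of SY dimensions it becomes
  // a binomial-style count, e.g. with sym[dim-1]==SY and sym[dim-2]==NS the
  // result is (idx*(idx+1)/2)*sy_packed_size(dim-1,len,sym).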
511  static inline
512  int64_t sy_packed_offset(int dim, int const * len, int idx, int const * sym){
513  if (idx == 0) return 0;
514  if (sym[dim-1] == NS){
515  return sy_packed_size(dim, len, sym)*idx;
516  } else {
517  int i=1;
518  int ii=1;
519  int iidx = idx;
520  int64_t offset = iidx;
521  do {
522  i++;
523  ii*=i;
524  iidx++;
525  offset *= iidx;
526  } while (i<=dim && sym[dim-i] != NS);
527  return (offset/ii)*sy_packed_size(dim-i+1,len,sym);
528 
529  }
530  }
531 
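  // ord_glb<idim> recursively walks dimensions idim..0 of the local data and
  // copies each contiguous line along dimension 0 between two local orderings:
  // the blocked layout, in which each virtual sub-block is stored contiguously
  // (offset blk_ord_offset), and the globally ordered layout, in which elements
  // of different virtual blocks are interleaved according to their global index
  // (offset glb_ord_offset). dir selects the direction; the base case
  // ord_glb<0> below performs the actual strided copies.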
532  template <int idim>
533  void ord_glb(int const * sym,
534  distribution const & dist,
535  int const * virt_edge_len,
536  int const * virt_phase_lda,
537  int64_t vbs,
538  bool dir,
539  char const * tsr_data_in,
540  char * tsr_data_out,
541  algstrct const * sr,
542  int prev_idx=0,
543  int64_t glb_ord_offset=0,
544  int64_t blk_ord_offset=0){
545 
546  int imax=virt_edge_len[idim];
547  if (sym[idim] != NS) imax = prev_idx+1;
548  int vp_stride = virt_phase_lda[idim]*dist.virt_phase[idim];
549  for (int i=0; i<imax; i++){
550  int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, i, sym);
551  int64_t i_blk_ord_offset = blk_ord_offset + dim_offset;
552  int64_t i_glb_ord_offset = glb_ord_offset + dim_offset*vp_stride;
553  for (int v=0; v<dist.virt_phase[idim]; v++){
554  int64_t iv_blk_ord_offset = i_blk_ord_offset + v*virt_phase_lda[idim]*vbs;
555  int64_t iv_glb_ord_offset = i_glb_ord_offset;
556  if (v>0){
557  int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, i+1, sym);
558  iv_glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*v;
559  }
560  ord_glb<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, i, iv_glb_ord_offset, iv_blk_ord_offset);
561  }
562  }
563  }
564 
565  template <>
566  inline void ord_glb<0>(int const * sym,
567  distribution const & dist,
568  int const * virt_edge_len,
569  int const * virt_phase_lda,
570  int64_t vbs,
571  bool dir,
572  char const * tsr_data_in,
573  char * tsr_data_out,
574  algstrct const * sr,
575  int prev_idx,
576  int64_t glb_ord_offset,
577  int64_t blk_ord_offset){
578  int imax=virt_edge_len[0];
579  if (sym[0] != NS) imax = prev_idx+1;
580  for (int v=0; v<dist.virt_phase[0]; v++){
581  if (dir){
582  sr->copy(imax, tsr_data_in + sr->el_size*(blk_ord_offset+v*vbs), 1,
583  tsr_data_out + sr->el_size*(glb_ord_offset+v), dist.virt_phase[0]);
584  } else {
585  sr->copy(imax, tsr_data_in + sr->el_size*(glb_ord_offset+v), dist.virt_phase[0],
586  tsr_data_out + sr->el_size*(blk_ord_offset+v*vbs), 1);
587  }
588  }
589  }
590 
591  template
592  void ord_glb<7>(int const * sym,
593  distribution const & dist,
594  int const * virt_edge_len,
595  int const * virt_phase_lda,
596  int64_t vbs,
597  bool dir,
598  char const * tsr_data_in,
599  char * tsr_data_out,
600  algstrct const * sr,
601  int prev_idx,
602  int64_t glb_ord_offset,
603  int64_t blk_ord_offset);
604 
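  // ord_glb_omp is the threaded variant of ord_glb: each thread handles only the
  // range of packed indices between idx_st and idx_end. Only the first and last
  // iteration along a dimension can be clipped by that range, so interior
  // iterations recurse into the unrestricted ord_glb.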
605  template <int idim>
606  void ord_glb_omp(int const * sym,
607  distribution const & dist,
608  int const * virt_edge_len,
609  int const * virt_phase_lda,
610  int64_t vbs,
611  bool dir,
612  char const * tsr_data_in,
613  char * tsr_data_out,
614  algstrct const * sr,
615  int const * idx_st,
616  int const * idx_end,
617  int prev_idx=0,
618  int64_t glb_ord_offset=0,
619  int64_t blk_ord_offset=0){
620  int imax=virt_edge_len[idim];
621  if (sym[idim] != NS) imax = prev_idx+1;
622  if (idx_end != NULL)
623  imax = MIN(imax,idx_end[idim]+1);
624  int ist;
625  if (idx_st != NULL)
626  ist = idx_st[idim];
627  else
628  ist = 0;
629  int vp_stride = virt_phase_lda[idim]*dist.virt_phase[idim];
630  // first iteration
631  for (int i=ist; i<imax; i++){
632  int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, i, sym);
633  int64_t i_blk_ord_offset = blk_ord_offset + dim_offset;
634  int64_t i_glb_ord_offset = glb_ord_offset + dim_offset*vp_stride;
635  for (int v=0; v<dist.virt_phase[idim]; v++){
636  int64_t iv_blk_ord_offset = i_blk_ord_offset + v*virt_phase_lda[idim]*vbs;
637  int64_t iv_glb_ord_offset = i_glb_ord_offset;
638  if (v>0){
639  int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, i+1, sym);
640  iv_glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*v;
641  }
642  if (i==ist && i==imax-1)
643  ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, idx_st, idx_end, i, iv_glb_ord_offset, iv_blk_ord_offset);
644  else if (i==ist)
645  ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, idx_st, NULL, i, iv_glb_ord_offset, iv_blk_ord_offset);
646  else if (i==imax-1)
647  ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, NULL, idx_end, i, iv_glb_ord_offset, iv_blk_ord_offset);
648  else
649  ord_glb<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, i, iv_glb_ord_offset, iv_blk_ord_offset);
650  }
651  }
652  }
653 
654  template <>
655  void ord_glb_omp<0>(int const * sym,
656  distribution const & dist,
657  int const * virt_edge_len,
658  int const * virt_phase_lda,
659  int64_t vbs,
660  bool dir,
661  char const * tsr_data_in,
662  char * tsr_data_out,
663  algstrct const * sr,
664  int const * idx_st,
665  int const * idx_end,
666  int prev_idx,
667  int64_t glb_ord_offset,
668  int64_t blk_ord_offset){
669  ord_glb<0>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr,prev_idx,glb_ord_offset,blk_ord_offset);
670  }
671 
672  template
673  void ord_glb_omp<7>(int const * sym,
674  distribution const & dist,
675  int const * virt_edge_len,
676  int const * virt_phase_lda,
677  int64_t vbs,
678  bool dir,
679  char const * tsr_data_in,
680  char * tsr_data_out,
681  algstrct const * sr,
682  int const * idx_st,
683  int const * idx_end,
684  int prev_idx,
685  int64_t glb_ord_offset,
686  int64_t blk_ord_offset);
687 
688 
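  // order_globally converts a processor's local data between the blocked and the
  // globally ordered layouts described above (dir selects the direction). For
  // tensors of order <= 8 it dispatches to the templated ord_glb / ord_glb_omp
  // recursions via CASE_ORD_GLB; for higher order it falls back to the generic
  // nested loop over virtual blocks and intra-block indices further below.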
689  void order_globally(int const * sym,
690  distribution const & dist,
691  int const * virt_edge_len,
692  int const * virt_phase_lda,
693  int64_t vbs,
694  bool dir,
695  char const * tsr_data_in,
696  char * tsr_data_out,
697  algstrct const * sr){
699  ASSERT(dist.order != 0);
700  if (dist.order == 1){
701  return ord_glb<0>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr);
702  }
703  if (dist.order <= 8){
704  //int rank;
705  //MPI_Comm_rank(MPI_COMM_WORLD, &rank);
706 #ifdef USE_OMP
707  #pragma omp parallel
708  {
709  int tid = omp_get_thread_num();
710  int ntd = omp_get_num_threads();
711  int64_t vbs_chunk = vbs/ntd;
712  int64_t fidx_st = vbs_chunk*tid + MIN(tid,vbs%ntd);
713  //limit (index of next thread)
714  int64_t fidx_end = fidx_st + vbs_chunk;
715  if (tid < vbs%ntd) fidx_end++;
716  int * idx_st = (int*)alloc(dist.order*sizeof(int));
717  int * idx_end = (int*)alloc(dist.order*sizeof(int));
718  sy_calc_idx_arr(dist.order, virt_edge_len, sym, fidx_st, idx_st);
719  sy_calc_idx_arr(dist.order, virt_edge_len, sym, fidx_end, idx_end);
720  int idim=1;
721  bool cont=true;
722  do {
723  ASSERT(idim<dist.order);
724  idx_end[idim]--;
725  if (idx_end[idim] < 0 && idim+1<dist.order){
726  idx_end[idim] = virt_edge_len[idim]-1;
727  idim++;
728  } else cont=false;
729  } while (cont);
730  /*if (rank == 0){
731  for (int itid =0; itid< ntd; itid++){
732  #pragma omp barrier
733  if (itid==tid){
734  for (int i=0; i<dist.order; i++){
735  printf("[%d] idx_st[%d] = %d, idx_end[%d] = %d, pad_edge_len[%d] = %d\n", tid, i, idx_st[i], i, idx_end[i], i, virt_edge_len[i]);
736  }
737  }
738  }
739  }*/
740  #define CASE_ORD_GLB(n) \
741  case n: \
742  ord_glb_omp<n-1>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr,idx_st,idx_end); \
743  break;
744 #else
745  #define CASE_ORD_GLB(n) \
746  case n: \
747  ord_glb<n-1>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr); \
748  break;
749 #endif
750  switch (dist.order){
751  CASE_ORD_GLB(1)
752  CASE_ORD_GLB(2)
753  CASE_ORD_GLB(3)
754  CASE_ORD_GLB(4)
755  CASE_ORD_GLB(5)
756  CASE_ORD_GLB(6)
757  CASE_ORD_GLB(7)
758  CASE_ORD_GLB(8)
759  default:
760  assert(0);
761  break;
762  }
763  #undef CASE_ORD_GLB
764 #ifdef USE_OMP
765  }
766 #endif
768  return;
769  }
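  // Generic fallback for order > 8: iterate over every virtual block (virt_idx)
  // and every intra-block index (idx), compute the blocked and globally ordered
  // offsets of each line along dimension 0, and copy it with the appropriate
  // strides (1 in the blocked layout, dist.virt_phase[0] in the global layout).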
770  int order = dist.order;
771  int * virt_idx = (int*)alloc(order*sizeof(int));
772  int * idx = (int*)alloc(order*sizeof(int));
773 
774  std::fill(virt_idx, virt_idx+order, 0);
775  for (;;){
776  std::fill(idx, idx+order, 0);
777 
778  //int _rank;
779  //MPI_Comm_rank(MPI_COMM_WORLD, &_rank);
780  for (;;){
781  int64_t glb_ord_offset = virt_idx[0];
782  int64_t blk_ord_offset = virt_idx[0]*vbs;
783  for (int idim=1; idim<order; idim++){
784  //calculate offset within virtual block
785  int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, idx[idim], sym);
 786  //when each virtual block is stored contiguously, this is the offset within the block
787  blk_ord_offset += dim_offset;
788  blk_ord_offset += virt_idx[idim]*virt_phase_lda[idim]*vbs;
789  //when the virtual blocks are interleaved according to global order, this is a part of the
790  // offset and needs to be scaled by all smaller virtualization factors
791  glb_ord_offset += dim_offset*virt_phase_lda[idim]*dist.virt_phase[idim];
 792  //an additional offset is needed for the global ordering, if this is not the first virtual
 793  // block along this dimension, in which case we must offset according to all elements in
 794  // smaller virtual blocks with the same idim and greater indices
795  //if (_rank == 0)
796  // printf("idim = %d, idx[idim] = %d blk_ord_offset = %ld glb_ord_offset = %ld\n",idim,idx[idim],blk_ord_offset,glb_ord_offset);
797  if (virt_idx[idim] > 0){
798  int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, idx[idim]+1, sym);
799  glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*virt_idx[idim];
800  // if (_rank == 0)
801  // printf("idim = %d virt add glb_ord_offset = %ld\n",idim,glb_ord_offset);
802  }
803  }
804 
805  int n = virt_edge_len[0];
806  if (sym[0] != NS) n = idx[1]+1;
807  /*if (_rank == 0){
808  printf("blk_ord_offset = %ld, glb_ord_offset = %ld\n",blk_ord_offset,glb_ord_offset);
809  for (int _i=0; _i<order; _i++){
810  printf("idx[%d] = %d virt_idx[%d]=%d\n",_i,idx[_i],_i,virt_idx[_i]);
811  }
812  for (int _i=0; _i<n; _i++){
813  if (dir){
814  printf("Writing [%ld] ",blk_ord_offset+_i);
815  sr->print(tsr_data_in+sr->el_size*(blk_ord_offset+_i));
816  printf(" to [%ld] ", glb_ord_offset+_i*dist.virt_phase[0]);
817  sr->print(tsr_data_out+sr->el_size*(glb_ord_offset+_i*dist.virt_phase[0]));
818  printf("\n");
819  } else {
820  printf("Writing [%ld] ", glb_ord_offset+_i*dist.virt_phase[0]);
821  sr->print(tsr_data_in+sr->el_size*(glb_ord_offset+_i*dist.virt_phase[0]));
822  printf(" to [%ld] ",blk_ord_offset+_i);
823  sr->print(tsr_data_out+sr->el_size*(blk_ord_offset+_i));
824  printf("\n");
825  }
826  }}*/
827  if (dir){
828  sr->copy(n, tsr_data_in+sr->el_size*blk_ord_offset, 1, tsr_data_out+sr->el_size*glb_ord_offset, dist.virt_phase[0]);
829  } else {
830  sr->copy(n, tsr_data_in+sr->el_size*glb_ord_offset, dist.virt_phase[0], tsr_data_out+sr->el_size*blk_ord_offset, 1);
831  }
832 
833  int dim=1;
834  bool exit, finish=false;
835  do {
836  if (dim==order){
837  exit = true;
838  finish = true;
839  } else {
840  if (idx[dim] == virt_edge_len[dim]-1 || (sym[dim] != NS && idx[dim] == idx[dim+1])){
841  idx[dim] = 0;
842  dim++;
843  exit = false;
844  } else {
845  idx[dim]++;
846  exit = true;
847  }
848  }
849  } while (!exit);
850  if (finish) break;
851  }
852 
853  int dim=0;
854  bool exit, finish=false;
855  do {
856  if (dim==order){
857  exit = true;
858  finish = true;
859  } else {
860  if (virt_idx[dim] == dist.virt_phase[dim]-1){
861  virt_idx[dim] = 0;
862  dim++;
863  exit = false;
864  } else {
865  virt_idx[dim]++;
866  exit = true;
867  }
868  }
869  } while (!exit);
870  if (finish) break;
871  }
872  cdealloc(idx);
873  cdealloc(virt_idx);
875  }
876 
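  // glb_cyclic_reshuffle redistributes a cyclically distributed tensor from
  // old_dist to new_dist while preserving global element order within each
  // message: it computes bucket offsets and all-to-all counts/displacements,
  // globally orders the local data (order_globally), packs it into per-process
  // send buckets (glb_ord_pup), and returns the buffer holding the packed send
  // data; the all-to-all exchange and the unpacking path are compiled out below
  // under #if 0.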
877 // void
878  char *
879  glb_cyclic_reshuffle(int const * sym,
880  distribution const & old_dist,
881  int const * old_offsets,
882  int * const * old_permutation,
883  distribution const & new_dist,
884  int const * new_offsets,
885  int * const * new_permutation,
886  char ** ptr_tsr_data,
887  char ** ptr_tsr_cyclic_data,
888  algstrct const * sr,
889  CommData ord_glb_comm,
890  bool reuse_buffers,
891  char const * alpha,
892  char const * beta){
893  int i, np, old_nvirt, new_nvirt, old_np, new_np, idx_lyr;
894  int64_t vbs_old, vbs_new;
895  int64_t swp_nval;
896  int * hsym;
897  int64_t * send_counts, * recv_counts;
898  int * idx;
899  int64_t * idx_offs;
900  int64_t * send_displs;
901  int64_t * recv_displs;
902  int * new_virt_lda, * old_virt_lda;
903  int * old_sub_edge_len, * new_sub_edge_len;
904  int order = old_dist.order;
905 
906  char * tsr_data = *ptr_tsr_data;
907  char * tsr_cyclic_data = *ptr_tsr_cyclic_data;
908  if (order == 0){
909  bool is_copy = false;
910  if (sr->isequal(sr->mulid(), alpha) && sr->isequal(sr->addid(), beta)) is_copy = true;
911  alloc_ptr(sr->el_size, (void**)&tsr_cyclic_data);
912  if (ord_glb_comm.rank == 0){
913  if (is_copy)
914  sr->copy(tsr_cyclic_data, tsr_data);
915  else
916  sr->acc(tsr_cyclic_data, beta, tsr_data, alpha);
917  } else {
918  sr->copy(tsr_cyclic_data, sr->addid());
919  }
920  *ptr_tsr_cyclic_data = tsr_cyclic_data;
921  return tsr_cyclic_data;
922  }
923 
924  ASSERT(!reuse_buffers || sr->isequal(beta, sr->addid()));
925  ASSERT(old_dist.is_cyclic&&new_dist.is_cyclic);
926 
928  np = ord_glb_comm.np;
929 
930  alloc_ptr(order*sizeof(int), (void**)&hsym);
931  alloc_ptr(order*sizeof(int), (void**)&idx);
932  alloc_ptr(order*sizeof(int64_t), (void**)&idx_offs);
933  alloc_ptr(order*sizeof(int), (void**)&old_virt_lda);
934  alloc_ptr(order*sizeof(int), (void**)&new_virt_lda);
935 
936  new_nvirt = 1;
937  old_nvirt = 1;
938  old_np = 1;
939  new_np = 1;
940  idx_lyr = ord_glb_comm.rank;
941  for (i=0; i<order; i++) {
942  new_virt_lda[i] = new_nvirt;
943  old_virt_lda[i] = old_nvirt;
944  // nbuf = nbuf*new_dist.phase[i];
945  /*printf("is_new_pad = %d\n", is_new_pad);
946  if (is_new_pad)
947  printf("new_dist.padding[%d] = %d\n", i, new_dist.padding[i]);
948  printf("is_old_pad = %d\n", is_old_pad);
949  if (is_old_pad)
950  printf("old_dist.padding[%d] = %d\n", i, old_dist.padding[i]);*/
951  old_nvirt = old_nvirt*old_dist.virt_phase[i];
952  new_nvirt = new_nvirt*new_dist.virt_phase[i];
953  new_np = new_np*new_dist.phase[i]/new_dist.virt_phase[i];
954  old_np = old_np*old_dist.phase[i]/old_dist.virt_phase[i];
955  idx_lyr -= old_dist.perank[i]*old_dist.pe_lda[i];
956  }
957  vbs_old = old_dist.size/old_nvirt;
958 
959  mst_alloc_ptr(np*sizeof(int64_t), (void**)&recv_counts);
960  mst_alloc_ptr(np*sizeof(int64_t), (void**)&send_counts);
961  mst_alloc_ptr(np*sizeof(int64_t), (void**)&send_displs);
962  mst_alloc_ptr(np*sizeof(int64_t), (void**)&recv_displs);
963  alloc_ptr(order*sizeof(int), (void**)&old_sub_edge_len);
964  alloc_ptr(order*sizeof(int), (void**)&new_sub_edge_len);
965  int ** bucket_offset;
966 
967  int *real_edge_len; alloc_ptr(sizeof(int)*order, (void**)&real_edge_len);
968  for (i=0; i<order; i++) real_edge_len[i] = old_dist.pad_edge_len[i]-old_dist.padding[i];
969 
970  int *old_phys_edge_len; alloc_ptr(sizeof(int)*order, (void**)&old_phys_edge_len);
971  for (int dim = 0;dim < order;dim++) old_phys_edge_len[dim] = (real_edge_len[dim]+old_dist.padding[dim])/old_dist.phys_phase[dim];
972 
973  int *new_phys_edge_len; alloc_ptr(sizeof(int)*order, (void**)&new_phys_edge_len);
974  for (int dim = 0;dim < order;dim++) new_phys_edge_len[dim] = (real_edge_len[dim]+new_dist.padding[dim])/new_dist.phys_phase[dim];
975 
976  int *old_virt_edge_len; alloc_ptr(sizeof(int)*order, (void**)&old_virt_edge_len);
977  for (int dim = 0;dim < order;dim++) old_virt_edge_len[dim] = old_phys_edge_len[dim]/old_dist.virt_phase[dim];
978 
979  int *new_virt_edge_len; alloc_ptr(sizeof(int)*order, (void**)&new_virt_edge_len);
980  for (int dim = 0;dim < order;dim++) new_virt_edge_len[dim] = new_phys_edge_len[dim]/new_dist.virt_phase[dim];
981 
982 
983 
984  bucket_offset =
985  compute_bucket_offsets( old_dist,
986  new_dist,
987  real_edge_len,
988  old_phys_edge_len,
989  old_virt_lda,
990  old_offsets,
991  old_permutation,
992  new_phys_edge_len,
993  new_virt_lda,
994  1,
995  old_nvirt,
996  new_nvirt,
997  old_virt_edge_len);
998 
999 
1000 
1002  /* Calculate bucket counts to begin exchange */
1003  calc_cnt_displs(sym,
1004  old_dist,
1005  new_dist,
1006  new_nvirt,
1007  np,
1008  old_virt_edge_len,
1009  new_virt_lda,
1010  send_counts,
1011  recv_counts,
1012  send_displs,
1013  recv_displs,
1014  ord_glb_comm,
1015  idx_lyr,
1016  bucket_offset);
1017 
1018 /* bool is_AS = false;
1019  for (int asd=0; asd<order; asd++){
1020  if (sym[asd] == AS || sym[asd] == SH){
1021  is_AS =true;
1022  }
1023  }
1024  if (!is_AS){
1025  int64_t send_counts2[np];
1026  calc_drv_displs(sym, real_edge_len, old_phys_edge_len, old_dist, new_dist, send_counts2, ord_glb_comm, idx_lyr);
1027 
1028  TAU_FSTOP(calc_cnt_displs);
1029  bool is_same = true;
1030  for (i=0; i<np; i++){
1031  //printf("[%d] send_counts[%d] = %ld send_counts2[%d] = %ld\n", ord_glb_comm.rank, i, send_counts[i], i, send_counts2[i]);
1032  if (send_counts[i] != send_counts2[i]) is_same = false;
1033  }
1034  assert(is_same);
1035  }*/
1036  /*for (i=0; i<np; i++){
1037  printf("[%d] send_counts[%d] = %d recv_counts[%d] = %d\n", ord_glb_comm.rank, i, send_counts[i], i, recv_counts[i]);
1038  }
1039  for (i=0; i<nbuf; i++){
1040  printf("[%d] svirt_displs[%d] = %d rvirt_displs[%d] = %d\n", ord_glb_comm.rank, i, svirt_displs[i], i, rvirt_displs[i]);
1041  }*/
1042 
1043  // }
1044  for (i=0; i<order; i++){
1045  new_sub_edge_len[i] = new_dist.pad_edge_len[i];
1046  old_sub_edge_len[i] = old_dist.pad_edge_len[i];
1047  }
1048  for (i=0; i<order; i++){
1049  new_sub_edge_len[i] = new_sub_edge_len[i] / new_dist.phase[i];
1050  old_sub_edge_len[i] = old_sub_edge_len[i] / old_dist.phase[i];
1051  }
1052  for (i=1; i<order; i++){
1053  hsym[i-1] = sym[i];
1054  }
1055  swp_nval = new_nvirt*sy_packed_size(order, new_sub_edge_len, sym);
1056  vbs_new = swp_nval/new_nvirt;
1057 
1058  char * send_buffer, * recv_buffer;
1059  if (reuse_buffers){
1060  mst_alloc_ptr(MAX(old_dist.size,swp_nval)*sr->el_size, (void**)&tsr_cyclic_data);
1061  recv_buffer = tsr_cyclic_data;
1062  } else {
1063  mst_alloc_ptr(old_dist.size*sr->el_size, (void**)&send_buffer);
1064  mst_alloc_ptr(swp_nval*sr->el_size, (void**)&recv_buffer);
1065  }
1066  ASSERT(reuse_buffers);
1067  TAU_FSTART(pack_virt_buf);
1068  if (idx_lyr == 0){
1069  order_globally(sym, old_dist, old_virt_edge_len, old_virt_lda, vbs_old, 1, tsr_data, tsr_cyclic_data, sr);
1070  char **new_data; alloc_ptr(sizeof(char*)*np, (void**)&new_data);
1071  for (int64_t p = 0;p < np;p++){
1072  new_data[p] = tsr_data+sr->el_size*send_displs[p];
1073  }
1074 
1075  glb_ord_pup(sym,
1076  old_dist,
1077  new_dist,
1078  real_edge_len,
1079  old_dist.phys_phase,
1080  old_phys_edge_len,
1081  old_virt_edge_len,
1082  vbs_old,
1083  old_offsets,
1084  old_permutation,
1085  np,
1086  new_dist.phys_phase,
1087  new_phys_edge_len,
1088  new_virt_edge_len,
1089  vbs_new,
1090  tsr_cyclic_data,
1091  new_data,
1092  1,
1093  bucket_offset,
1094  sr->mulid(),
1095  sr->addid(),
1096  sr);
1097  cdealloc(new_data);
1098  }
1099  for (int dim = 0;dim < order;dim++){
1100  cdealloc(bucket_offset[dim]);
1101  }
1102  cdealloc(bucket_offset);
1103 
1104  TAU_FSTOP(pack_virt_buf);
1105 
1106  if (reuse_buffers){
1107  recv_buffer = tsr_cyclic_data;
1108  send_buffer = tsr_data;
1109  }
1110  return tsr_data;
1111  }
1112 #if 0
1113 
1114  /* Communicate data */
1115  TAU_FSTART(ALL_TO_ALL_V);
1116  ord_glb_comm.all_to_allv(send_buffer, send_counts, send_displs, sr->el_size,
1117  recv_buffer, recv_counts, recv_displs);
1118  TAU_FSTOP(ALL_TO_ALL_V);
1119 
1120  if (reuse_buffers){
1121  if (swp_nval > old_dist.size){
1122  cdealloc(tsr_data);
1123  mst_alloc_ptr(swp_nval*sr->el_size, (void**)&tsr_data);
1124  }
1125  }
1126  TAU_FSTART(unpack_virt_buf);
1127  /* Deserialize data into correctly ordered virtual sub blocks */
1128  if (recv_displs[ord_glb_comm.np-1] + recv_counts[ord_glb_comm.np-1] > 0){
1129  sr->set(tsr_data, sr->addid(), swp_nval);
1130  char **new_data; alloc_ptr(sizeof(char*)*np, (void**)&new_data);
1131  for (int64_t p = 0;p < np;p++){
1132  new_data[p] = recv_buffer+recv_displs[p]*sr->el_size;
1133  }
1134  bucket_offset =
1135  compute_bucket_offsets( new_dist,
1136  old_dist,
1137  real_edge_len,
1138  new_phys_edge_len,
1139  new_virt_lda,
1140  new_offsets,
1141  new_permutation,
1142  old_phys_edge_len,
1143  old_virt_lda,
1144  0,
1145  new_nvirt,
1146  old_nvirt,
1147  new_virt_edge_len);
1148 
1149 
1150  glb_ord_pup(sym,
1151  new_dist,
1152  old_dist,
1153  real_edge_len,
1154  new_dist.phys_phase,
1155  new_phys_edge_len,
1156  new_virt_edge_len,
1157  vbs_new,
1158  new_offsets,
1159  new_permutation,
1160  np,
1161  old_dist.phys_phase,
1162  old_phys_edge_len,
1163  old_virt_edge_len,
1164  vbs_old,
1165  tsr_data,
1166  new_data,
1167  0,
1168  bucket_offset,
1169  alpha,
1170  beta,
1171  sr);
1172 
1173 
1174  order_globally(sym, new_dist, new_virt_edge_len, new_virt_lda, vbs_new, 0, tsr_data, tsr_cyclic_data, sr);
1175  for (int dim = 0;dim < order;dim++){
1176  cdealloc(bucket_offset[dim]);
1177  }
1178  cdealloc(bucket_offset);
1179  cdealloc(new_data);
1180  } else {
1181  sr->set(tsr_cyclic_data, sr->addid(), swp_nval);
1182  }
1183  TAU_FSTOP(unpack_virt_buf);
1184 
1185  *ptr_tsr_cyclic_data = tsr_cyclic_data;
1186  *ptr_tsr_data = tsr_data;
1187 
1188  cdealloc(real_edge_len);
1189  cdealloc(hsym);
1190  cdealloc(idx);
1191  cdealloc(idx_offs);
1192  cdealloc(old_virt_lda);
1193  cdealloc(new_virt_lda);
1194  cdealloc(recv_counts);
1195  cdealloc(send_counts);
1196  cdealloc(send_displs);
1197  cdealloc(recv_displs);
1198  cdealloc(old_sub_edge_len);
1199  cdealloc(new_sub_edge_len);
1200  cdealloc(new_virt_edge_len);
1201  cdealloc(old_virt_edge_len);
1202  cdealloc(new_phys_edge_len);
1203  cdealloc(old_phys_edge_len);
1204 
1206 
1207  }
1208 #endif
1209 }