Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
contraction.cxx
1 #include "contraction.h"
2 #include "../redistribution/nosym_transp.h"
3 #include "../scaling/strp_tsr.h"
4 #include "../mapping/mapping.h"
5 #include "../mapping/distribution.h"
6 #include "../tensor/untyped_tensor.h"
7 #include "../shared/util.h"
8 #include "../shared/memcontrol.h"
9 #include "sym_seq_ctr.h"
10 #include "spctr_comm.h"
11 #include "ctr_tsr.h"
12 #include "ctr_offload.h"
13 #include "ctr_2d_general.h"
14 #include "spctr_offload.h"
15 #include "spctr_2d_general.h"
16 #include "../symmetry/sym_indices.h"
17 #include "../symmetry/symmetrization.h"
18 #include "../redistribution/nosym_transp.h"
19 #include "../redistribution/redist.h"
20 #include "../sparse_formats/coo.h"
21 #include "../sparse_formats/csr.h"
22 #include <cfloat>
23 #include <limits>
24 
25 namespace CTF_int {
26 
27  using namespace CTF;
28 
29  contraction::~contraction(){
30  if (idx_A != NULL) cdealloc(idx_A);
31  if (idx_B != NULL) cdealloc(idx_B);
32  if (idx_C != NULL) cdealloc(idx_C);
33  }
34 
35  contraction::contraction(contraction const & other){
36  A = other.A;
37  idx_A = (int*)alloc(sizeof(int)*other.A->order);
38  memcpy(idx_A, other.idx_A, sizeof(int)*other.A->order);
39  B = other.B;
40  idx_B = (int*)alloc(sizeof(int)*other.B->order);
41  memcpy(idx_B, other.idx_B, sizeof(int)*other.B->order);
42  C = other.C;
43  idx_C = (int*)alloc(sizeof(int)*other.C->order);
44  memcpy(idx_C, other.idx_C, sizeof(int)*other.C->order);
45  if (other.is_custom) is_custom = 1;
46  else is_custom = 0;
47  func = other.func;
48  alpha = other.alpha;
49  beta = other.beta;
50  }
51 
52  contraction::contraction(tensor * A_,
53  int const * idx_A_,
54  tensor * B_,
55  int const * idx_B_,
56  char const * alpha_,
57  tensor * C_,
58  int const * idx_C_,
59  char const * beta_,
60  bivar_function const * func_){
61  A = A_;
62  B = B_;
63  C = C_;
64  if (func_ == NULL) is_custom = 0;
65  else is_custom = 1;
66  func = func_;
67  alpha = alpha_;
68  beta = beta_;
69 
70  idx_A = (int*)alloc(sizeof(int)*A->order);
71  idx_B = (int*)alloc(sizeof(int)*B->order);
72  idx_C = (int*)alloc(sizeof(int)*C->order);
73  memcpy(idx_A, idx_A_, sizeof(int)*A->order);
74  memcpy(idx_B, idx_B_, sizeof(int)*B->order);
75  memcpy(idx_C, idx_C_, sizeof(int)*C->order);
76  }
77 
78  contraction::contraction(tensor * A_,
79  char const * cidx_A,
80  tensor * B_,
81  char const * cidx_B,
82  char const * alpha_,
83  tensor * C_,
84  char const * cidx_C,
85  char const * beta_,
86  bivar_function const * func_){
87  A = A_;
88  B = B_;
89  C = C_;
90  if (func_ == NULL) is_custom = 0;
91  else is_custom = 1;
92  func = func_;
93  alpha = alpha_;
94  beta = beta_;
95 
96  conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B, C->order, cidx_C, &idx_C);
97  }
98 
99  void contraction::execute(){
100 #if (DEBUG >= 2 || VERBOSE >= 1)
101  #if DEBUG >= 2
102  if (A->wrld->cdt.rank == 0) printf("Contraction::execute (head):\n");
103  #endif
104  print();
105 #endif
106 
107  //if (A->wrld->cdt.cm == MPI_COMM_WORLD){
108 // update_all_models(A->wrld->cdt.cm);
109  //}
110 
111  int stat = home_contract();
112  if (stat != SUCCESS)
113  printf("CTF ERROR: Failed to perform contraction\n");
114  }
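 /* Annotation (not part of the original source): a minimal usage sketch of the
    contraction class defined in this file, assuming three compatible CTF_int::tensor
    objects tsr_A, tsr_B, tsr_C whose algebraic structure sr supplies the scalars:

      contraction ctr(tsr_A, "ik", tsr_B, "kj", tsr_A->sr->mulid(),
                      tsr_C, "ij", tsr_C->sr->addid(), NULL);
      ctr.execute();  // computes C["ij"] = A["ik"]*B["kj"] (alpha = 1, beta = 0)

    Passing NULL for the bivar_function leaves is_custom == 0, so the plain
    (non-custom) contraction path is taken. */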
115 
116  template<typename ptype>
117  void get_perm(int perm_order,
118  ptype A,
119  ptype B,
120  ptype C,
121  ptype & tA,
122  ptype & tB,
123  ptype & tC){
124  switch (perm_order){
125  case 0:
126  tA = A;
127  tB = B;
128  tC = C;
129  break;
130  case 1:
131  tA = A;
132  tB = C;
133  tC = B;
134  break;
135  case 2:
136  tA = C;
137  tB = A;
138  tC = B;
139  break;
140 
141  case 3:
142  tA = B;
143  tB = C;
144  tC = A;
145  break;
146  case 4:
147  tA = B;
148  tB = A;
149  tC = C;
150  break;
151  case 5:
152  tA = C;
153  tB = B;
154  tC = A;
155  break;
156  default:
157  assert(0);
158  break;
159  }
160  }
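 /* Annotation (not part of the original source): get_perm enumerates the six
    orderings of (A,B,C); e.g. perm_order == 2 yields (tA,tB,tC) = (C,A,B), matching
    the "C A B" label in the bperm_order switch of select_ctr_perm below. Cases 0..2
    keep A ahead of B, which is why select_ctr_perm caps max_ord at 3 when the
    user-supplied bivar_function is not commutative. */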
161 
162 
163  double contraction::estimate_time(){
164  assert(0); //FIXME
165  return 0.0;
166  }
167 
168  int contraction::is_equal(contraction const & os){
169  if (this->A != os.A) return 0;
170  if (this->B != os.B) return 0;
171  if (this->C != os.C) return 0;
172 
173  for (int i=0; i<A->order; i++){
174  if (idx_A[i] != os.idx_A[i]) return 0;
175  }
176  for (int i=0; i<B->order; i++){
177  if (idx_B[i] != os.idx_B[i]) return 0;
178  }
179  for (int i=0; i<C->order; i++){
180  if (idx_C[i] != os.idx_C[i]) return 0;
181  }
182  return 1;
183  }
184 
200  void calc_fold_lnmk(
201  tensor const * A,
202  tensor const * B,
203  tensor const * C,
204  int const * idx_A,
205  int const * idx_B,
206  int const * idx_C,
207  int const * ordering_A,
208  int const * ordering_B,
209  iparam * inner_prm){
210  int i, num_tot, num_ctr, num_no_ctr_A, num_no_ctr_B, num_weigh;
211  int * idx_arr;
212 
213  inv_idx(A->order, idx_A,
214  B->order, idx_B,
215  C->order, idx_C,
216  &num_tot, &idx_arr);
217  num_ctr = 0, num_no_ctr_A = 0, num_no_ctr_B = 0, num_weigh = 0;
218  for (i=0; i<num_tot; i++){
219  if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
220  num_weigh++;
221  } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
222  num_ctr++;
223  } else if (idx_arr[3*i] != -1){
224  num_no_ctr_A++;
225  } else if (idx_arr[3*i+1] != -1){
226  num_no_ctr_B++;
227  }
228  }
229  inner_prm->l = 1;
230  inner_prm->m = 1;
231  inner_prm->n = 1;
232  inner_prm->k = 1;
233  for (i=0; i<A->order; i++){
234  if (i >= num_ctr+num_no_ctr_A){
235  inner_prm->l = inner_prm->l * A->pad_edge_len[ordering_A[i]];
236  }
237  else if (i >= num_ctr)
238  inner_prm->m = inner_prm->m * A->pad_edge_len[ordering_A[i]];
239  else
240  inner_prm->k = inner_prm->k * A->pad_edge_len[ordering_A[i]];
241  }
242  for (i=0; i<B->order; i++){
243  if (i >= num_ctr && i< num_ctr + num_no_ctr_B)
244  inner_prm->n = inner_prm->n * B->pad_edge_len[ordering_B[i]];
245  }
246  /* This gets set later */
247  inner_prm->sz_C = 0;
248  CTF_int::cdealloc(idx_arr);
249  }
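 /* Annotation (not part of the original source): worked example of the conventions
    used above. inv_idx stores, for each distinct index label i, the triple
    (position in A, position in B, position in C) in idx_arr[3*i .. 3*i+2], with -1
    meaning absent. For C["ij"] += A["ik"]*B["kj"] with labels i=0, j=1, k=2:

      idx_arr = { 0,-1, 0,   -1, 1, 1,   1, 0,-1 }

    so num_ctr = 1 (k), num_no_ctr_A = 1 (i), num_no_ctr_B = 1 (j), num_weigh = 0,
    and with ordering_A = {1,0} (k first, then i) the folded GEMM-like dimensions
    come out as k = len(k), m = len(i), n = len(j), l = 1 (no weigh/batch index). */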
250 
251  bool contraction::is_sparse(){
252  return A->is_sparse || B->is_sparse || C->is_sparse;
253  }
254 
255  void contraction::get_fold_indices(int * num_fold,
256  int ** fold_idx){
257  int i, in, num_tot, nfold, broken;
258  int iA, iB, iC, inA, inB, inC, iiA, iiB, iiC;
259  int * idx_arr, * idx;
260  inv_idx(A->order, idx_A,
261  B->order, idx_B,
262  C->order, idx_C,
263  &num_tot, &idx_arr);
264  CTF_int::alloc_ptr(num_tot*sizeof(int), (void**)&idx);
265 
266  for (i=0; i<num_tot; i++){
267  idx[i] = 1;
268  }
269 
270  for (iA=0; iA<A->order; iA++){
271  i = idx_A[iA];
272  iB = idx_arr[3*i+1];
273  iC = idx_arr[3*i+2];
274  broken = 0;
275  inA = iA;
276  do {
277  in = idx_A[inA];
278  inB = idx_arr[3*in+1];
279  inC = idx_arr[3*in+2];
280  if ((iA>=0) + (iB>=0) + (iC>=0) == 2){
281  if (((inA>=0) + (inB>=0) + (inC>=0) != 2) ||
282  ((inB == -1) ^ (iB == -1)) ||
283  ((inC == -1) ^ (iC == -1)) ||
284  (iB != -1 && inB - iB != in-i) ||
285  (iC != -1 && inC - iC != in-i) ||
286  (iB != -1 && A->sym[inA] != B->sym[inB]) ||
287  (iC != -1 && A->sym[inA] != C->sym[inC])){
288  broken = 1;
289  }
290  } else if ((iA>=0) + (iB>=0) + (iC>=0) != 3){
291  if ((inA>=0) + (inB>=0) + (inC>=0) != 3 ||
292  A->sym[inA] != B->sym[inB] ||
293  A->sym[inA] != C->sym[inC]){
294  broken = 1;
295  }
296  } else {
297  if (((inA>=0) + (inB>=0) + (inC>=0) != 3) ||
298  ((inB == -1) ^ (iB == -1)) ||
299  ((inC == -1) ^ (iC == -1)) ||
300  ((inA == -1) ^ (iA == -1)) ||
301  (inB - iB != in-i) ||
302  (inC - iC != in-i) ||
303  (inA - iA != in-i) ||
304  (A->sym[inA] != B->sym[inB]) ||
305  (B->sym[inB] != C->sym[inC]) ||
306  (A->sym[inA] != C->sym[inC])){
307  broken = 1;
308  }
309  }
310  inA++;
311  } while (A->sym[inA-1] != NS);
312  if (broken){
313  for (iiA=iA;iiA<inA;iiA++){
314  idx[idx_A[iiA]] = 0;
315  }
316  }
317  }
318 
319  for (iC=0; iC<C->order; iC++){
320  i = idx_C[iC];
321  iA = idx_arr[3*i+0];
322  iB = idx_arr[3*i+1];
323  broken = 0;
324  inC = iC;
325  do {
326  in = idx_C[inC];
327  inA = idx_arr[3*in+0];
328  inB = idx_arr[3*in+1];
329  if (((inC>=0) + (inA>=0) + (inB>=0) == 1) ||
330  ((inA == -1) ^ (iA == -1)) ||
331  ((inB == -1) ^ (iB == -1)) ||
332  (iA != -1 && inA - iA != in-i) ||
333  (iB != -1 && inB - iB != in-i) ||
334  (iA != -1 && C->sym[inC] != A->sym[inA]) ||
335  (iB != -1 && C->sym[inC] != B->sym[inB])){
336  broken = 1;
337  }
338  inC++;
339  } while (C->sym[inC-1] != NS);
340  if (broken){
341  for (iiC=iC;iiC<inC;iiC++){
342  idx[idx_C[iiC]] = 0;
343  }
344  }
345  }
346 
347  for (iB=0; iB<B->order; iB++){
348  i = idx_B[iB];
349  iC = idx_arr[3*i+2];
350  iA = idx_arr[3*i+0];
351  broken = 0;
352  inB = iB;
353  do {
354  in = idx_B[inB];
355  inC = idx_arr[3*in+2];
356  inA = idx_arr[3*in+0];
357  if (((inB>=0) + (inC>=0) + (inA>=0) == 1) ||
358  ((inC == -1) ^ (iC == -1)) ||
359  ((inA == -1) ^ (iA == -1)) ||
360  (iC != -1 && inC - iC != in-i) ||
361  (iA != -1 && inA - iA != in-i) ||
362  (iC != -1 && B->sym[inB] != C->sym[inC]) ||
363  (iA != -1 && B->sym[inB] != A->sym[inA])){
364  broken = 1;
365  }
366  inB++;
367  } while (B->sym[inB-1] != NS);
368  if (broken){
369  for (iiB=iB;iiB<inB;iiB++){
370  idx[idx_B[iiB]] = 0;
371  }
372  }
373  }
374 
375  nfold = 0;
376  for (i=0; i<num_tot; i++){
377  if (idx[i] == 1){
378  idx[nfold] = i;
379  nfold++;
380  }
381  }
382  *num_fold = nfold;
383  *fold_idx = idx;
384  CTF_int::cdealloc(idx_arr);
385 
386  }
387 
388  int contraction::can_fold(){
389  int nfold, * fold_idx, i, j;
390  if (!is_sparse() && is_custom) return 0;
391  for (i=0; i<A->order; i++){
392  for (j=i+1; j<A->order; j++){
393  if (idx_A[i] == idx_A[j]) return 0;
394  }
395  }
396  for (i=0; i<B->order; i++){
397  for (j=i+1; j<B->order; j++){
398  if (idx_B[i] == idx_B[j]) return 0;
399  }
400  }
401  for (i=0; i<C->order; i++){
402  for (j=i+1; j<C->order; j++){
403  if (idx_C[i] == idx_C[j]) return 0;
404  }
405  }
406  get_fold_indices(&nfold, &fold_idx);
407  if (is_sparse()){
408  // when A is sparse we must fold all indices and reduce the block contraction entirely to coomm
409  if ((A->order+B->order+C->order)%2 == 1 ||
410  (A->order+B->order+C->order)/2 < nfold ){
411  CTF_int::cdealloc(fold_idx);
412  return 0;
413  } else {
414  // do not allow weigh indices for sparse contractions
415  int num_tot;
416  int * idx_arr;
417 
418  inv_idx(A->order, idx_A,
419  B->order, idx_B,
420  C->order, idx_C,
421  &num_tot, &idx_arr);
422  for (i=0; i<num_tot; i++){
423  if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
424  CTF_int::cdealloc(idx_arr);
425  CTF_int::cdealloc(fold_idx);
426  return 0;
427  }
428  }
429  CTF_int::cdealloc(idx_arr);
430  }
431  }
432  CTF_int::cdealloc(fold_idx);
433  /* FIXME: 1 folded index is good enough for now, in the future model */
434  return nfold > 0;
435  }
436 
437 
451  void get_len_ordering(
452  tensor const * A,
453  tensor const * B,
454  tensor const * C,
455  int const * idx_A,
456  int const * idx_B,
457  int const * idx_C,
458  int ** new_ordering_A,
459  int ** new_ordering_B,
460  int ** new_ordering_C){
461  int i, num_tot, num_ctr, num_no_ctr_A, num_no_ctr_B, num_weigh;
462  int idx_no_ctr_A, idx_no_ctr_B, idx_ctr, idx_weigh;
463  int idx_self_C, idx_self_A, idx_self_B;
464  int num_self_C, num_self_A, num_self_B;
465  int * ordering_A, * ordering_B, * ordering_C, * idx_arr;
466 
467  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&ordering_A);
468  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&ordering_B);
469  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&ordering_C);
470 
471  inv_idx(A->order, idx_A,
472  B->order, idx_B,
473  C->order, idx_C,
474  &num_tot, &idx_arr);
475  num_no_ctr_B = 0, num_ctr = 0, num_no_ctr_A = 0, num_weigh = 0;
476  num_self_A = 0, num_self_B = 0, num_self_C = 0;
477  for (i=0; i<num_tot; i++){
478  if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
479  num_weigh++;
480  } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
481  num_ctr++;
482  } else if (idx_arr[3*i] != -1 && idx_arr[3*i+2] != -1){
483  num_no_ctr_A++;
484  } else if (idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
485  num_no_ctr_B++;
486  } else if (idx_arr[3*i] != -1){
487  num_self_A++;
488  } else if (idx_arr[3*i+1] != -1){
489  num_self_B++;
490  } else if (idx_arr[3*i+2] != -1){
491  num_self_C++;
492  } else {
493  assert(0);
494  }
495  }
496  idx_self_A = 0, idx_self_B = 0, idx_self_C=0;
497  /* Put all weigh indices in back, put all contraction indices up front, put A indices in front for C */
498  idx_ctr = 0, idx_no_ctr_A = 0, idx_no_ctr_B = 0, idx_weigh = 0;
499  for (i=0; i<num_tot; i++){
500  if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
501  ordering_A[idx_weigh+num_no_ctr_A+num_ctr] = idx_arr[3*i];
502  ordering_B[idx_weigh+num_no_ctr_B+num_ctr] = idx_arr[3*i+1];
503  ordering_C[idx_weigh+num_no_ctr_A+num_no_ctr_B] = idx_arr[3*i+2];
504  idx_weigh++;
505  } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
506  ordering_A[idx_ctr] = idx_arr[3*i];
507  ordering_B[idx_ctr] = idx_arr[3*i+1];
508  idx_ctr++;
509  } else {
510  if (idx_arr[3*i] != -1 && idx_arr[3*i+2] != -1){
511  ordering_A[num_ctr+idx_no_ctr_A] = idx_arr[3*i];
512  ordering_C[idx_no_ctr_A] = idx_arr[3*i+2];
513  idx_no_ctr_A++;
514  } else if (idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
515  ordering_B[num_ctr+idx_no_ctr_B] = idx_arr[3*i+1];
516  ordering_C[num_no_ctr_A+idx_no_ctr_B] = idx_arr[3*i+2];
517  idx_no_ctr_B++;
518  } else if (idx_arr[3*i] != -1){
519  ordering_A[num_ctr+num_no_ctr_A+num_weigh+idx_self_A] = idx_arr[3*i];
520  idx_self_A++;
521  } else if (idx_arr[3*i+1] != -1){
522  ordering_B[num_ctr+num_no_ctr_B+num_weigh+idx_self_B] = idx_arr[3*i+1];
523  idx_self_B++;
524  } else if (idx_arr[3*i+2] != -1){
525  ordering_C[num_no_ctr_A+num_no_ctr_B+num_weigh+idx_self_C] = idx_arr[3*i+2];
526  idx_self_C++;
527  } else assert(0);
528  }
529  }
530  CTF_int::cdealloc(idx_arr);
531  *new_ordering_A = ordering_A;
532  *new_ordering_B = ordering_B;
533  *new_ordering_C = ordering_C;
534 
535  //iparam iprm;
536  //calc_fold_nmk(A, B, C, idx_A, idx_B, idx_C, *new_ordering_A, *new_ordering_B, &iprm);
537  //return iprm;
538  }
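 /* Annotation (not part of the original source): for the C["ij"] += A["ik"]*B["kj"]
    example used in the annotation after calc_fold_lnmk, this routine returns

      new_ordering_A = {1,0}   (k first, then i)
      new_ordering_B = {0,1}   (k first, then j)
      new_ordering_C = {0,1}   (i first, then j)

    i.e. contracted indices lead in A and B, and C lists its A-shared indices before
    its B-shared ones, matching the offsets used in the loop above. */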
539 
540 
541  void contraction::get_fold_ctr(contraction *& fold_ctr,
542  int & all_fdim_A,
543  int & all_fdim_B,
544  int & all_fdim_C,
545  int *& all_flen_A,
546  int *& all_flen_B,
547  int *& all_flen_C){
548  int i, j, nfold, nf;
549  int * fold_idx, * fidx_A, * fidx_B, * fidx_C;
550  tensor * fA, * fB, * fC;
551 
552  get_fold_indices(&nfold, &fold_idx);
553  if (nfold == 0) {
554  CTF_int::cdealloc(fold_idx);
555  assert(0); //return ERROR;
556  }
557 
558  /* overestimate this space to not bother with it later */
559  CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_A);
560  CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_B);
561  CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_C);
562 
563  A->fold(nfold, fold_idx, idx_A,
564  &all_fdim_A, &all_flen_A);
565  B->fold(nfold, fold_idx, idx_B,
566  &all_fdim_B, &all_flen_B);
567  C->fold(nfold, fold_idx, idx_C,
568  &all_fdim_C, &all_flen_C);
569 
570 // printf("rec tsr C order is %d\n",C->rec_tsr->order);
571 
572  nf = 0;
573  for (i=0; i<A->order; i++){
574  for (j=0; j<nfold; j++){
575  if (A->sym[i] == NS && idx_A[i] == fold_idx[j]){
576  fidx_A[nf] = j;
577  nf++;
578  }
579  }
580  }
581  nf = 0;
582  for (i=0; i<B->order; i++){
583  for (j=0; j<nfold; j++){
584  if (B->sym[i] == NS && idx_B[i] == fold_idx[j]){
585  fidx_B[nf] = j;
586  nf++;
587  }
588  }
589  }
590  nf = 0;
591  for (i=0; i<C->order; i++){
592  for (j=0; j<nfold; j++){
593  if (C->sym[i] == NS && idx_C[i] == fold_idx[j]){
594  fidx_C[nf] = j;
595  nf++;
596  }
597  }
598  }
599 
600  fA = A->rec_tsr;
601  fB = B->rec_tsr;
602  fC = C->rec_tsr;
603 
604  int * sidx_A, * sidx_B, * sidx_C;
605  CTF_int::conv_idx<int>(fA->order, fidx_A, &sidx_A,
606  fB->order, fidx_B, &sidx_B,
607  fC->order, fidx_C, &sidx_C);
608 
609  fold_ctr = new contraction(fA, sidx_A, fB, sidx_B, alpha, fC, sidx_C, beta, func);
610 
611  CTF_int::cdealloc(sidx_A);
612  CTF_int::cdealloc(sidx_B);
613  CTF_int::cdealloc(sidx_C);
614 
615  CTF_int::cdealloc(fidx_A);
616  CTF_int::cdealloc(fidx_B);
617  CTF_int::cdealloc(fidx_C);
618  CTF_int::cdealloc(fold_idx);
619  }
620 
621  void contraction::select_ctr_perm(contraction const * fold_ctr,
622  int all_fdim_A,
623  int all_fdim_B,
624  int all_fdim_C,
625  int const * all_flen_A,
626  int const * all_flen_B,
627  int const * all_flen_C,
628  int & bperm_order,
629  double & btime,
630  iparam & iprm){
631  bperm_order = -1;
632  btime = DBL_MAX;
633  int tall_fdim_A, tall_fdim_B, tall_fdim_C;
634  int const * tall_flen_A, * tall_flen_B, * tall_flen_C;
635  tensor * tA, * tB, * tC;
636  tensor * tfA, * tfB, * tfC;
637  int * tidx_A, * tidx_B, * tidx_C;
638  int * tfnew_ord_A, * tfnew_ord_B, * tfnew_ord_C;
639  int * tAiord, * tBiord, * tCiord;
640 
641  // if the function is not commutative, consider only orderings where we call dgemm without interchanging A and B
642  int max_ord = 6;
643  if (is_custom && !func->commutative) max_ord = 3;
644 
645  //iterate over permutations of {A,B,C}
646  for (int iord=0; iord<max_ord; iord++){
647  get_perm<tensor*>(iord, A, B, C,
648  tA, tB, tC);
649  get_perm<tensor*>(iord, fold_ctr->A, fold_ctr->B, fold_ctr->C,
650  tfA, tfB, tfC);
651  get_perm<int*>(iord, fold_ctr->idx_A, fold_ctr->idx_B, fold_ctr->idx_C,
652  tidx_A, tidx_B, tidx_C);
653  get_perm<int const*>(iord, all_flen_A, all_flen_B, all_flen_C,
654  tall_flen_A, tall_flen_B, tall_flen_C);
655  get_perm<int>(iord, all_fdim_A, all_fdim_B, all_fdim_C,
656  tall_fdim_A, tall_fdim_B, tall_fdim_C);
657  get_len_ordering(tfA, tfB, tfC, tidx_A, tidx_B, tidx_C,
658  &tfnew_ord_A, &tfnew_ord_B, &tfnew_ord_C);
659  // m,n,k should be invariant to which transposes are done
660  if (iord == 0){
661  calc_fold_lnmk(tfA, tfB, tfC, tidx_A, tidx_B, tidx_C, tfnew_ord_A, tfnew_ord_B, &iprm);
662  }
663 
664  CTF_int::alloc_ptr(tall_fdim_A*sizeof(int), (void**)&tAiord);
665  CTF_int::alloc_ptr(tall_fdim_B*sizeof(int), (void**)&tBiord);
666  CTF_int::alloc_ptr(tall_fdim_C*sizeof(int), (void**)&tCiord);
667 
668  memcpy(tAiord, tA->inner_ordering, tall_fdim_A*sizeof(int));
669  memcpy(tBiord, tB->inner_ordering, tall_fdim_B*sizeof(int));
670  memcpy(tCiord, tC->inner_ordering, tall_fdim_C*sizeof(int));
671 
672  permute_target(tfA->order, tfnew_ord_A, tAiord);
673  permute_target(tfB->order, tfnew_ord_B, tBiord);
674  permute_target(tfC->order, tfnew_ord_C, tCiord);
675 
676  double time_est = 0.0;
677  if (tA->is_sparse)
678  time_est += tA->nnz_tot/(tA->size*tA->calc_npe())*tA->calc_nvirt()*est_time_transp(tall_fdim_A, tAiord, tall_flen_A, 1, tA->sr);
679  else
680  time_est += tA->calc_nvirt()*est_time_transp(tall_fdim_A, tAiord, tall_flen_A, 1, tA->sr);
681  if (tB->is_sparse)
682  time_est += tB->nnz_tot/(tB->size*tB->calc_npe())*tB->calc_nvirt()*est_time_transp(tall_fdim_B, tBiord, tall_flen_B, 1, tB->sr);
683  else
684  time_est += tB->calc_nvirt()*est_time_transp(tall_fdim_B, tBiord, tall_flen_B, 1, tB->sr);
685  if (tC->is_sparse)
686  time_est += 2.*tC->nnz_tot/(tC->size*tC->calc_npe())*tC->calc_nvirt()*est_time_transp(tall_fdim_C, tCiord, tall_flen_C, 1, tC->sr);
687  else
688  time_est += 2.*tC->calc_nvirt()*est_time_transp(tall_fdim_C, tCiord, tall_flen_C, 1, tC->sr);
689  if (is_sparse()){
690  if (iord == 1){
691  if (time_est <= btime){
692  btime = time_est;
693  bperm_order = iord;
694  }
695  }
696  } else {
697  if (time_est <= btime){
698  btime = time_est;
699  bperm_order = iord;
700  }
701  }
702  cdealloc(tAiord);
703  cdealloc(tBiord);
704  cdealloc(tCiord);
705  cdealloc(tfnew_ord_A);
706  cdealloc(tfnew_ord_B);
707  cdealloc(tfnew_ord_C);
708  }
709 
710  switch (bperm_order){
711  case 0: // A B C
712  //index order : 1. AB 2. AC 3. BC
713  iprm.tA = 'T';
714  iprm.tB = 'N';
715  iprm.tC = 'N';
716  break;
717  case 1: // A C B
718  //index order : 1. AC 2. AB 3. BC
719  iprm.tA = 'N';
720  iprm.tB = 'N';
721  iprm.tC = 'N';
722  //calc_fold_nmk(fold_ctr->A, fold_ctr->B, fold_ctr->C, fold_ctr->idx_A, fold_ctr->idx_B, fold_ctr->idx_C, fnew_ord_A, fnew_ord_C, &iprm);
723  break;
724 
725  case 2: // C A B
726  //index order : 1. CA 2. BC 3. AB
727  iprm.tA = 'N';
728  iprm.tB = 'T';
729  iprm.tC = 'N';
730  //calc_fold_nmk(fold_ctr->A, fold_ctr->B, fold_ctr->C, fold_ctr->idx_A, fold_ctr->idx_B, fold_ctr->idx_C, fnew_ord_B, fnew_ord_C, &iprm);
731  break;
732 
733  case 3: // B C A
734  //index order : 1. BC 2. AB 3. AC
735  //C^T=B^T*A^T
736  iprm.tA = 'N';
737  iprm.tB = 'N';
738  iprm.tC = 'T';
739  break;
740  case 4: // B A C
741  //index order : 1. AB 2. BC 3. AC
742  //C^T=B^T*A^T
743  iprm.tA = 'N';
744  iprm.tB = 'T';
745  iprm.tC = 'T';
746  break;
747  case 5: // C B A
748  //index order : 1. BC 2. AC 3. AB
749  //C^T=B^T*A^T
750  iprm.tA = 'T';
751  iprm.tB = 'N';
752  iprm.tC = 'T';
753  break;
754  default:
755  assert(0);
756  break;
757  }
758 
759  }
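 /* Annotation (not part of the original source), one way to read the transpose
    flags above, assuming the folded tensors are stored with their first inner index
    fastest: for bperm_order 0 the contracted (AB) indices lead in folded A, so A is
    effectively a k-by-m matrix and is passed as 'T', while B (k-by-n) and C (m-by-n)
    are passed as 'N'; the cases that place B ahead of A (3..5) instead compute the
    output transposed via C^T = B^T * A^T, as the inline comments note. */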
760 
761  iparam contraction::map_fold(bool do_transp){
762  int all_fdim_A, all_fdim_B, all_fdim_C;
763  int * fnew_ord_A, * fnew_ord_B, * fnew_ord_C;
764  int * all_flen_A, * all_flen_B, * all_flen_C;
765  iparam iprm;
766  contraction * fold_ctr;
767  int bperm_order = -1;
768  double btime = DBL_MAX;
769  int tall_fdim_A, tall_fdim_B, tall_fdim_C;
770  tensor * tA, * tB, * tC;
771  tensor * tfA, * tfB, * tfC;
772  int * tidx_A, * tidx_B, * tidx_C;
773 
774  get_fold_ctr(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
775  all_flen_A, all_flen_B, all_flen_C);
776 
777  #if DEBUG>=2
778  if (do_transp){
779  CommData global_comm = A->wrld->cdt;
780  if (global_comm.rank == 0){
781  printf("Folded contraction type:\n");
782  }
783  fold_ctr->print();
784  }
785  #endif
786  select_ctr_perm(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
787  all_flen_A, all_flen_B, all_flen_C,
788  bperm_order, btime, iprm);
789  if (is_sparse()){
790  bperm_order = 1;
791  iprm.tA = 'N';
792  iprm.tB = 'N';
793  iprm.tC = 'N';
794  }
795 // printf("bperm_order = %d\n", bperm_order);
796  get_perm<tensor*>(bperm_order, A, B, C,
797  tA, tB, tC);
798  get_perm<tensor*>(bperm_order, fold_ctr->A, fold_ctr->B, fold_ctr->C,
799  tfA, tfB, tfC);
800  get_perm<int*>(bperm_order, fold_ctr->idx_A, fold_ctr->idx_B, fold_ctr->idx_C,
801  tidx_A, tidx_B, tidx_C);
802  get_perm<int>(bperm_order, all_fdim_A, all_fdim_B, all_fdim_C,
803  tall_fdim_A, tall_fdim_B, tall_fdim_C);
804 
805  get_len_ordering(tfA, tfB, tfC, tidx_A, tidx_B, tidx_C,
806  &fnew_ord_A, &fnew_ord_B, &fnew_ord_C);
807 
808  permute_target(tfA->order, fnew_ord_A, tA->inner_ordering);
809  permute_target(tfB->order, fnew_ord_B, tB->inner_ordering);
810  permute_target(tfC->order, fnew_ord_C, tC->inner_ordering);
811 
812  if (do_transp){
813  bool csr_or_coo = B->is_sparse || C->is_sparse || is_custom || !A->sr->has_coo_ker;
814  if (!A->is_sparse){
815  nosym_transpose(A, all_fdim_A, all_flen_A, A->inner_ordering, 1);
816  } else {
817  int nrow_idx = 0;
818  for (int i=0; i<A->order; i++){
819  for (int j=0; j<C->order; j++){
820  if (idx_A[i] == idx_C[j]) nrow_idx++;
821  }
822  }
823  A->spmatricize(iprm.m, iprm.k, nrow_idx, csr_or_coo);
824  }
825  if (!B->is_sparse){
826  nosym_transpose(B, all_fdim_B, all_flen_B, B->inner_ordering, 1);
827  /*for (i=0; i<nvirt_B; i++){
828  nosym_transpose(all_fdim_B, B->inner_ordering, all_flen_B,
829  B->data + B->sr->el_size*i*(B->size/nvirt_B), 1, B->sr);
830  }*/
831  } else {
832  int nrow_idx = 0;
833  for (int i=0; i<B->order; i++){
834  for (int j=0; j<A->order; j++){
835  if (idx_B[i] == idx_A[j]) nrow_idx++;
836  }
837  }
838  B->spmatricize(iprm.k, iprm.n, nrow_idx, csr_or_coo);
839  }
840 
841  if (!C->is_sparse){
842  nosym_transpose(C, all_fdim_C, all_flen_C, C->inner_ordering, 1);
843  } else {
844  int nrow_idx = 0;
845  for (int i=0; i<C->order; i++){
846  for (int j=0; j<A->order; j++){
847  if (idx_C[i] == idx_A[j]) nrow_idx++;
848  }
849  }
850  C->spmatricize(iprm.m, iprm.n, nrow_idx, csr_or_coo);
851  C->sr->dealloc(C->data);
852  }
853 
854  }
855 
856  CTF_int::cdealloc(fnew_ord_A);
857  CTF_int::cdealloc(fnew_ord_B);
858  CTF_int::cdealloc(fnew_ord_C);
859  CTF_int::cdealloc(all_flen_A);
860  CTF_int::cdealloc(all_flen_B);
861  CTF_int::cdealloc(all_flen_C);
862  delete fold_ctr;
863 
864  return iprm;
865  }
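 /* Annotation (not part of the original source): in the sparse branches above,
    nrow_idx counts how many of the operand's indices form the row dimension of its
    matricized form (indices shared with C for A, shared with A for B and for C).
    E.g. for sparse A in C["ij"] += A["ik"]*B["kj"], only i is shared with C, so A is
    flattened into an m-by-k sparse matrix (CSR or COO, per csr_or_coo) before the
    folded matrix-multiply kernel runs. */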
866 
867 
868  double contraction::est_time_fold(){
869  int all_fdim_A, all_fdim_B, all_fdim_C;
870  int * all_flen_A, * all_flen_B, * all_flen_C;
871  iparam iprm;
872  contraction * fold_ctr;
873  int bperm_order = -1;
874  double btime = DBL_MAX;
875 
876  get_fold_ctr(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
877  all_flen_A, all_flen_B, all_flen_C);
878 
879  select_ctr_perm(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
880  all_flen_A, all_flen_B, all_flen_C,
881  bperm_order, btime, iprm);
882 
883  CTF_int::cdealloc(all_flen_A);
884  CTF_int::cdealloc(all_flen_B);
885  CTF_int::cdealloc(all_flen_C);
886  delete fold_ctr;
887  A->remove_fold();
888  B->remove_fold();
889  C->remove_fold();
890 
891  return btime;
892  }
893 
894 
895 
896  int contraction::unfold_broken_sym(contraction ** new_contraction){
897  int i, num_tot, iA, iB, iC;
898  int * idx_arr;
899  tensor * nA, * nB, * nC;
900 
901  contraction * nctr;
902 
903  if (new_contraction != NULL){
904  nA = new tensor(A, 0, 0);
905  nB = new tensor(B, 0, 0);
906  nC = new tensor(C, 0, 0);
907  nctr = new contraction(nA, idx_A, nB, idx_B, alpha, nC, idx_C, beta, func);
908  *new_contraction = nctr;
909 
910  nA->clear_mapping();
911  nA->set_padding();
912  copy_mapping(A->order, A->edge_map, nA->edge_map);
913  nA->is_mapped = 1;
914  nA->topo = A->topo;
915  nA->set_padding();
916 
917  nB->clear_mapping();
918  nB->set_padding();
919  copy_mapping(B->order, B->edge_map, nB->edge_map);
920  nB->is_mapped = 1;
921  nB->topo = B->topo;
922  nB->set_padding();
923 
924  nC->clear_mapping();
925  nC->set_padding();
926  copy_mapping(C->order, C->edge_map, nC->edge_map);
927  nC->is_mapped = 1;
928  nC->topo = C->topo;
929  nC->set_padding();
930  } else {
931  nA = NULL;
932  nB = NULL;
933  nC = NULL;
934  }
935 
936  inv_idx(A->order, idx_A,
937  B->order, idx_B,
938  C->order, idx_C,
939  &num_tot, &idx_arr);
940 
941  int nA_sym[A->order];
942  if (new_contraction != NULL)
943  memcpy(nA_sym, nA->sym, sizeof(int)*nA->order);
944  for (i=0; i<A->order; i++){
945  if (A->sym[i] != NS){
946  iA = idx_A[i];
947  if (idx_arr[3*iA+1] != -1){
948  if (B->sym[idx_arr[3*iA+1]] != A->sym[i] ||
949  idx_A[i+1] != idx_B[idx_arr[3*iA+1]+1]){
950  if (new_contraction != NULL){
951  nA_sym[i] = NS;
952  nA->set_sym(nA_sym);
953  }
954  CTF_int::cdealloc(idx_arr);
955  return 3*i;
956  }
957  } else {
958  if (idx_arr[3*idx_A[i+1]+1] != -1){
959  if (new_contraction != NULL){
960  nA_sym[i] = NS;
961  nA->set_sym(nA_sym);
962  }
963  CTF_int::cdealloc(idx_arr);
964  return 3*i;
965  }
966  }
967  if (idx_arr[3*iA+2] != -1){
968  if (C->sym[idx_arr[3*iA+2]] != A->sym[i] ||
969  idx_A[i+1] != idx_C[idx_arr[3*iA+2]+1]){
970  if (new_contraction != NULL){
971  nA_sym[i] = NS;
972  nA->set_sym(nA_sym);
973  }
974  CTF_int::cdealloc(idx_arr);
975  return 3*i;
976  }
977  } else {
978  if (idx_arr[3*idx_A[i+1]+2] != -1){
979  if (new_contraction != NULL){
980  nA_sym[i] = NS;
981  nA->set_sym(nA_sym);
982  }
983  CTF_int::cdealloc(idx_arr);
984  return 3*i;
985  }
986  }
987  }
988  }
989 
990 
991  int nB_sym[B->order];
992  if (new_contraction != NULL)
993  memcpy(nB_sym, nB->sym, sizeof(int)*nB->order);
994  for (i=0; i<B->order; i++){
995  if (B->sym[i] != NS){
996  iB = idx_B[i];
997  if (idx_arr[3*iB+0] != -1){
998  if (A->sym[idx_arr[3*iB+0]] != B->sym[i] ||
999  idx_B[i+1] != idx_A[idx_arr[3*iB+0]+1]){
1000  if (new_contraction != NULL){
1001  nB_sym[i] = NS;
1002  nB->set_sym(nB_sym);
1003  }
1004  CTF_int::cdealloc(idx_arr);
1005  return 3*i+1;
1006  }
1007  } else {
1008  if (idx_arr[3*idx_B[i+1]+0] != -1){
1009  if (new_contraction != NULL){
1010  nB_sym[i] = NS;
1011  nB->set_sym(nB_sym);
1012  }
1013  CTF_int::cdealloc(idx_arr);
1014  return 3*i+1;
1015  }
1016  }
1017  if (idx_arr[3*iB+2] != -1){
1018  if (C->sym[idx_arr[3*iB+2]] != B->sym[i] ||
1019  idx_B[i+1] != idx_C[idx_arr[3*iB+2]+1]){
1020  if (new_contraction != NULL){
1021  nB_sym[i] = NS;
1022  nB->set_sym(nB_sym);
1023  }
1024  CTF_int::cdealloc(idx_arr);
1025  return 3*i+1;
1026  }
1027  } else {
1028  if (idx_arr[3*idx_B[i+1]+2] != -1){
1029  if (new_contraction != NULL){
1030  nB_sym[i] = NS;
1031  nB->set_sym(nB_sym);
1032  }
1033  CTF_int::cdealloc(idx_arr);
1034  return 3*i+1;
1035  }
1036  }
1037  }
1038  }
1039  //if A=B, output symmetry may still be preserved, so long as all indices in A and B are proper
1040  bool is_preserv = true;
1041  if (A != B) is_preserv = false;
1042  else {
1043  for (int j=0; j<A->order; j++){
1044  if (idx_A[j] != idx_B[j]){
1045  iA = idx_A[j];
1046  iB = idx_B[j];
1047  if (idx_arr[3*iA+2] == -1 || idx_arr[3*iB+2] == -1) is_preserv = false;
1048  else {
1049  for (int k=MIN(idx_arr[3*iA+2],idx_arr[3*iB+2]);
1050  k<MAX(idx_arr[3*iA+2],idx_arr[3*iB+2]);
1051  k++){
1052  if (C->sym[k] != SY) is_preserv = false;
1053  }
1054  }
1055  }
1056  }
1057  }
1058  if (!is_preserv){
1059  int nC_sym[C->order];
1060  if (new_contraction != NULL)
1061  memcpy(nC_sym, nC->sym, sizeof(int)*nC->order);
1062  for (i=0; i<C->order; i++){
1063  if (C->sym[i] != NS){
1064  iC = idx_C[i];
1065  if (idx_arr[3*iC+1] != -1){
1066  if (B->sym[idx_arr[3*iC+1]] != C->sym[i] ||
1067  idx_C[i+1] != idx_B[idx_arr[3*iC+1]+1]){
1068  if (new_contraction != NULL){
1069  nC_sym[i] = NS;
1070  nC->set_sym(nC_sym);
1071  }
1072  CTF_int::cdealloc(idx_arr);
1073  return 3*i+2;
1074  }
1075  } else if (idx_arr[3*idx_C[i+1]+1] != -1){
1076  if (new_contraction != NULL){
1077  nC_sym[i] = NS;
1078  nC->set_sym(nC_sym);
1079  }
1080  CTF_int::cdealloc(idx_arr);
1081  return 3*i+2;
1082  }
1083  if (idx_arr[3*iC+0] != -1){
1084  if (A->sym[idx_arr[3*iC+0]] != C->sym[i] ||
1085  idx_C[i+1] != idx_A[idx_arr[3*iC+0]+1]){
1086  if (new_contraction != NULL){
1087  nC_sym[i] = NS;
1088  nC->set_sym(nC_sym);
1089  }
1090  CTF_int::cdealloc(idx_arr);
1091  return 3*i+2;
1092  }
1093  } else if (idx_arr[3*iC+0] == -1){
1094  if (idx_arr[3*idx_C[i+1]] != -1){
1095  if (new_contraction != NULL){
1096  nC_sym[i] = NS;
1097  nC->set_sym(nC_sym);
1098  }
1099  CTF_int::cdealloc(idx_arr);
1100  return 3*i+2;
1101  }
1102  }
1103  }
1104  }
1105  }
1106  CTF_int::cdealloc(idx_arr);
1107  return -1;
1108  }
1109 
1110  bool contraction::check_consistency(){
1111  int i, num_tot, len;
1112  int iA, iB, iC;
1113  int * idx_arr;
1114 
1115  inv_idx(A->order, idx_A,
1116  B->order, idx_B,
1117  C->order, idx_C,
1118  &num_tot, &idx_arr);
1119 
1120  for (i=0; i<num_tot; i++){
1121  len = -1;
1122  iA = idx_arr[3*i+0];
1123  iB = idx_arr[3*i+1];
1124  iC = idx_arr[3*i+2];
1125  if (iA != -1){
1126  len = A->lens[iA];
1127  }
1128  if (len != -1 && iB != -1 && len != B->lens[iB]){
1129  if (A->wrld->cdt.rank == 0){
1130  printf("Error in contraction call: The %dth edge length of tensor %s does not ",
1131  iA, A->name);
1132  printf("match the %dth edge length of tensor %s.\n",
1133  iB, B->name);
1134  }
1135  return false;
1136  }
1137  if (len != -1 && iC != -1 && len != C->lens[iC]){
1138  if (A->wrld->cdt.rank == 0){
1139  printf("Error in contraction call: The %dth edge length of tensor %s (%d) does not ",
1140  iA, A->name, len);
1141  printf("match the %dth edge length of tensor %s (%d).\n",
1142  iC, C->name, C->lens[iC]);
1143  }
1144  return false;
1145  }
1146  if (iB != -1){
1147  len = B->lens[iB];
1148  }
1149  if (len != -1 && iC != -1 && len != C->lens[iC]){
1150  if (A->wrld->cdt.rank == 0){
1151  printf("Error in contraction call: The %dth edge length of tensor %s does not ",
1152  iB, B->name);
1153  printf("match the %dth edge length of tensor %s.\n",
1154  iC, C->name);
1155  }
1156  return false;
1157  }
1158  }
1159  CTF_int::cdealloc(idx_arr);
1160  return true;
1161  }
1162 
1163 
1164  int contraction::check_mapping(){
1165 
1166  int num_tot, i, ph_A, ph_B, iA, iB, iC, pass, order, topo_order;
1167  int * idx_arr;
1168  int * phys_mismatched, * phys_mapped;
1169  mapping * map;
1170  tensor * pA, * pB;
1171 
1172  pass = 1;
1173 
1174  if (A->is_mapped == 0) pass = 0;
1175  if (B->is_mapped == 0) pass = 0;
1176  if (C->is_mapped == 0) pass = 0;
1177  ASSERT(pass==1);
1178 
1179  if (A->is_folded == 1) pass = 0;
1180  if (B->is_folded == 1) pass = 0;
1181  if (C->is_folded == 1) pass = 0;
1182 
1183  if (pass==0){
1184  DPRINTF(3,"failed confirmation here\n");
1185  return 0;
1186  }
1187 
1188  if (A->topo != B->topo) pass = 0;
1189  if (A->topo != C->topo) pass = 0;
1190 
1191  if (pass==0){
1192  DPRINTF(3,"failed confirmation here\n");
1193  return 0;
1194  }
1195 
1196  topo_order = A->topo->order;
1197  CTF_int::alloc_ptr(sizeof(int)*topo_order, (void**)&phys_mismatched);
1198  CTF_int::alloc_ptr(sizeof(int)*topo_order, (void**)&phys_mapped);
1199  memset(phys_mismatched, 0, sizeof(int)*topo_order);
1200  memset(phys_mapped, 0, sizeof(int)*topo_order);
1201 
1202 
1203  inv_idx(A->order, idx_A,
1204  B->order, idx_B,
1205  C->order, idx_C,
1206  &num_tot, &idx_arr);
1207 
1208  if (!check_self_mapping(A, idx_A))
1209  pass = 0;
1210  if (!check_self_mapping(B, idx_B))
1211  pass = 0;
1212  if (!check_self_mapping(C, idx_C))
1213  pass = 0;
1214  if (pass == 0){
1215  DPRINTF(3,"failed confirmation here\n");
1216  }
1217 
1218 
1219  for (i=0; i<num_tot; i++){
1220  if (idx_arr[3*i+0] != -1 &&
1221  idx_arr[3*i+1] != -1 &&
1222  idx_arr[3*i+2] != -1){
1223  iA = idx_arr[3*i+0];
1224  iB = idx_arr[3*i+1];
1225  iC = idx_arr[3*i+2];
1226  // printf("A[%d].np = %d\n", iA, A->edge_map[iA].np);
1227  //printf("B[%d].np = %d\n", iB, B->edge_map[iB].np);
1228  //printf("C[%d].np = %d\n", iC, C->edge_map[iC].np);
1229  if (0 == comp_dim_map(&B->edge_map[iB], &A->edge_map[iA]) ||
1230  0 == comp_dim_map(&B->edge_map[iB], &C->edge_map[iC])){
1231  DPRINTF(3,"failed confirmation here %d %d %d\n",iA,iB,iC);
1232  pass = 0;
1233  break;
1234  } else {
1235  map = &A->edge_map[iA];
1236  for (;;){
1237  if (map->type == PHYSICAL_MAP){
1238  if (phys_mapped[map->cdt] == 1){
1239  DPRINTF(3,"failed confirmation here %d\n",iA);
1240  pass = 0;
1241  break;
1242  } else {
1243  phys_mapped[map->cdt] = 1;
1244  phys_mismatched[map->cdt] = 1;
1245  }
1246  } else break;
1247  if (map->has_child) map = map->child;
1248  else break;
1249  }
1250  }
1251  }
1252  }
1253  for (i=0; i<num_tot; i++){
1254  if (idx_arr[3*i+0] == -1 ||
1255  idx_arr[3*i+1] == -1 ||
1256  idx_arr[3*i+2] == -1){
1257  for (order=0; order<3; order++){
1258  switch (order){
1259  case 0:
1260  pA = A;
1261  pB = B;
1262  iA = idx_arr[3*i+0];
1263  iB = idx_arr[3*i+1];
1264  iC = idx_arr[3*i+2];
1265  break;
1266  case 1:
1267  pA = A;
1268  pB = C;
1269  iA = idx_arr[3*i+0];
1270  iB = idx_arr[3*i+2];
1271  iC = idx_arr[3*i+1];
1272  break;
1273  case 2:
1274  pA = C;
1275  pB = B;
1276  iA = idx_arr[3*i+2];
1277  iB = idx_arr[3*i+1];
1278  iC = idx_arr[3*i+0];
1279  break;
1280  }
1281  if (iC == -1){
1282  if (iB == -1){
1283  if (iA != -1) {
1284  map = &pA->edge_map[iA];
1285  for (;;){
1286  if (map->type == PHYSICAL_MAP){
1287  if (phys_mapped[map->cdt] == 1){
1288  DPRINTF(3,"failed confirmation here %d\n",iA);
1289  pass = 0;
1290  break;
1291  } else
1292  phys_mapped[map->cdt] = 1;
1293  } else break;
1294  if (map->has_child) map = map->child;
1295  else break;
1296  }
1297  }
1298  } else if (iA == -1){
1299  map = &pB->edge_map[iB];
1300  for (;;){
1301  if (map->type == PHYSICAL_MAP){
1302  if (phys_mapped[map->cdt] == 1){
1303  DPRINTF(3,"failed confirmation here %d\n",iA);
1304  pass = 0;
1305  break;
1306  } else
1307  phys_mapped[map->cdt] = 1;
1308  } else break;
1309  if (map->has_child) map = map->child;
1310  else break;
1311  }
1312  } else {
1313  /* Confirm that the phases of A and B
1314  over which we are contracting are the same */
1315  ph_A = pA->edge_map[iA].calc_phase();
1316  ph_B = pB->edge_map[iB].calc_phase();
1317 
1318  if (ph_A != ph_B){
1319  //if (global_comm.rank == 0)
1320  DPRINTF(3,"failed confirmation here iA=%d iB=%d\n",iA,iB);
1321  pass = 0;
1322  break;
1323  }
1324  /* If the mapping along this dimension is the same, make sure
1325  it is mapped onto a unique free dimension */
1326  if (comp_dim_map(&pB->edge_map[iB], &pA->edge_map[iA])){
1327  map = &pB->edge_map[iB];
1328  if (map->type == PHYSICAL_MAP){
1329  if (phys_mapped[map->cdt] == 1){
1330  DPRINTF(3,"failed confirmation here %d\n",iB);
1331  pass = 0;
1332  } else
1333  phys_mapped[map->cdt] = 1;
1334  }
1335  } else {
1336  /* If the mapping along this dimension is different, make sure
1337  the mismatch is mapped onto unique physical dimensions */
1338  map = &pA->edge_map[iA];
1339  if (map->has_child) {
1340  if (map->child->type == PHYSICAL_MAP){
1341  DPRINTF(3,"failed confirmation here %d, matched and folded physical mapping not allowed\n",iB);
1342  pass = 0;
1343  }
1344  }
1345 
1346 
1347  for (;;){
1348  if (map->type == PHYSICAL_MAP){
1349  if (phys_mismatched[map->cdt] == 1){
1350  DPRINTF(3,"failed confirmation here i=%d iA=%d iB=%d\n",i,iA,iB);
1351  pass = 0;
1352  break;
1353  } else
1354  phys_mismatched[map->cdt] = 1;
1355  if (map->has_child)
1356  map = map->child;
1357  else break;
1358  } else break;
1359  }
1360  map = &pB->edge_map[iB];
1361  for (;;){
1362  if (map->type == PHYSICAL_MAP){
1363  if (phys_mismatched[map->cdt] == 1){
1364  DPRINTF(3,"failed confirmation here i=%d iA=%d iB=%d\n",i,iA,iB);
1365  pass = 0;
1366  break;
1367  } else
1368  phys_mismatched[map->cdt] = 1;
1369  if (map->has_child)
1370  map = map->child;
1371  else break;
1372  } else break;
1373  }
1374  }
1375  }
1376  }
1377  }
1378  }
1379  }
1380  for (i=0; i<topo_order; i++){
1381  if (phys_mismatched[i] == 1 && phys_mapped[i] == 0){
1382  DPRINTF(3,"failed confirmation here i=%d\n",i);
1383  pass = 0;
1384  break;
1385  }
1386  /* if (phys_mismatched[i] == 0 && phys_mapped[i] == 0){
1387  DPRINTF(3,"failed confirmation here i=%d\n",i);
1388  pass = 0;
1389  break;
1390  } */
1391  }
1392 
1393 
1394  CTF_int::cdealloc(idx_arr);
1395  CTF_int::cdealloc(phys_mismatched);
1396  CTF_int::cdealloc(phys_mapped);
1397  return pass;
1398  }
1399 
1410  static int
1411  map_weigh_indices(int const * idx_arr,
1412  int const * idx_weigh,
1413  int num_tot,
1414  int num_weigh,
1415  topology const * topo,
1416  tensor * A,
1417  tensor * B,
1418  tensor * C){
1419  int tsr_order, iweigh, iA, iB, iC, i, j, k, jX, stat, num_sub_phys_dims;
1420  int * tsr_edge_len, * tsr_sym_table, * restricted, * comm_idx;
1421  CommData * sub_phys_comm;
1422  mapping * weigh_map;
1423 
1424  TAU_FSTART(map_weigh_indices);
1425 
1426  tsr_order = num_weigh;
1427 
1428 
1429  for (i=0; i<num_weigh; i++){
1430  iweigh = idx_weigh[i];
1431  iA = idx_arr[iweigh*3+0];
1432  iB = idx_arr[iweigh*3+1];
1433  iC = idx_arr[iweigh*3+2];
1434 
1435  if (A->edge_map[iA].type == PHYSICAL_MAP ||
1436  B->edge_map[iB].type == PHYSICAL_MAP ||
1437  C->edge_map[iC].type == PHYSICAL_MAP)
1438  return NEGATIVE;
1439  }
1440  CTF_int::alloc_ptr(tsr_order*sizeof(int), (void**)&restricted);
1441  CTF_int::alloc_ptr(tsr_order*sizeof(int), (void**)&tsr_edge_len);
1442  CTF_int::alloc_ptr(tsr_order*tsr_order*sizeof(int), (void**)&tsr_sym_table);
1443  CTF_int::alloc_ptr(tsr_order*sizeof(mapping), (void**)&weigh_map);
1444 
1445  memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
1446  memset(restricted, 0, tsr_order*sizeof(int));
1447  extract_free_comms(topo, A->order, A->edge_map,
1448  B->order, B->edge_map,
1449  num_sub_phys_dims, &sub_phys_comm, &comm_idx);
1450 
1451  for (i=0; i<tsr_order; i++){
1452  weigh_map[i].type = VIRTUAL_MAP;
1453  weigh_map[i].has_child = 0;
1454  weigh_map[i].np = 1;
1455  }
1456  for (i=0; i<num_weigh; i++){
1457  iweigh = idx_weigh[i];
1458  iA = idx_arr[iweigh*3+0];
1459  iB = idx_arr[iweigh*3+1];
1460  iC = idx_arr[iweigh*3+2];
1461 
1462 
1463  weigh_map[i].np = lcm(weigh_map[i].np,A->edge_map[iA].np);
1464  weigh_map[i].np = lcm(weigh_map[i].np,B->edge_map[iB].np);
1465  weigh_map[i].np = lcm(weigh_map[i].np,C->edge_map[iC].np);
1466 
1467  tsr_edge_len[i] = A->pad_edge_len[iA];
1468 
1469  for (j=i+1; j<num_weigh; j++){
1470  jX = idx_arr[idx_weigh[j]*3+0];
1471 
1472  for (k=MIN(iA,jX); k<MAX(iA,jX); k++){
1473  if (A->sym[k] == NS)
1474  break;
1475  }
1476  if (k==MAX(iA,jX)){
1477  tsr_sym_table[i*tsr_order+j] = 1;
1478  tsr_sym_table[j*tsr_order+i] = 1;
1479  }
1480 
1481  jX = idx_arr[idx_weigh[j]*3+1];
1482 
1483  for (k=MIN(iB,jX); k<MAX(iB,jX); k++){
1484  if (B->sym[k] == NS)
1485  break;
1486  }
1487  if (k==MAX(iB,jX)){
1488  tsr_sym_table[i*tsr_order+j] = 1;
1489  tsr_sym_table[j*tsr_order+i] = 1;
1490  }
1491 
1492  jX = idx_arr[idx_weigh[j]*3+2];
1493 
1494  for (k=MIN(iC,jX); k<MAX(iC,jX); k++){
1495  if (C->sym[k] == NS)
1496  break;
1497  }
1498  if (k==MAX(iC,jX)){
1499  tsr_sym_table[i*tsr_order+j] = 1;
1500  tsr_sym_table[j*tsr_order+i] = 1;
1501  }
1502  }
1503  }
1504  stat = map_tensor(num_sub_phys_dims, tsr_order,
1505  tsr_edge_len, tsr_sym_table,
1506  restricted, sub_phys_comm,
1507  comm_idx, 0,
1508  weigh_map);
1509 
1510  if (stat == ERROR)
1511  return ERROR;
1512 
1513  /* define mapping of tensors A, B, and C according to the mapping of the weigh dims */
1514  if (stat == SUCCESS){
1515  for (i=0; i<num_weigh; i++){
1516  iweigh = idx_weigh[i];
1517  iA = idx_arr[iweigh*3+0];
1518  iB = idx_arr[iweigh*3+1];
1519  iC = idx_arr[iweigh*3+2];
1520 
1521  copy_mapping(1, &weigh_map[i], &A->edge_map[iA]);
1522  copy_mapping(1, &weigh_map[i], &B->edge_map[iB]);
1523  copy_mapping(1, &weigh_map[i], &C->edge_map[iC]);
1524  }
1525  }
1526  CTF_int::cdealloc(restricted);
1527  CTF_int::cdealloc(tsr_edge_len);
1528  CTF_int::cdealloc(tsr_sym_table);
1529  for (i=0; i<num_weigh; i++){
1530  weigh_map[i].clear();
1531  }
1532  CTF_int::cdealloc(weigh_map);
1533  //if (num_sub_phys_dims > 0)
1534  CTF_int::cdealloc(sub_phys_comm);
1535  CTF_int::cdealloc(comm_idx);
1536 
1537  TAU_FSTOP(map_weigh_indices);
1538  return stat;
1539  }
1551  static int
1552  map_ctr_indices(int const * idx_arr,
1553  int const * idx_ctr,
1554  int num_tot,
1555  int num_ctr,
1556  topology const * topo,
1557  tensor * A,
1558  tensor * B){
1559  int tsr_order, ictr, iA, iB, i, j, jctr, jX, stat, num_sub_phys_dims;
1560  int * tsr_edge_len, * tsr_sym_table, * restricted, * comm_idx;
1561  CommData * sub_phys_comm;
1562  mapping * ctr_map;
1563 
1564  TAU_FSTART(map_ctr_indices);
1565 
1566  tsr_order = num_ctr*2;
1567 
1568  CTF_int::alloc_ptr(tsr_order*sizeof(int), (void**)&restricted);
1569  CTF_int::alloc_ptr(tsr_order*sizeof(int), (void**)&tsr_edge_len);
1570  CTF_int::alloc_ptr(tsr_order*tsr_order*sizeof(int), (void**)&tsr_sym_table);
1571  CTF_int::alloc_ptr(tsr_order*sizeof(mapping), (void**)&ctr_map);
1572 
1573  memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
1574  memset(restricted, 0, tsr_order*sizeof(int));
1575 
1576  for (i=0; i<tsr_order; i++){
1577  ctr_map[i].type = VIRTUAL_MAP;
1578  ctr_map[i].has_child = 0;
1579  ctr_map[i].np = 1;
1580  }
1581  for (i=0; i<num_ctr; i++){
1582  ictr = idx_ctr[i];
1583  iA = idx_arr[ictr*3+0];
1584  iB = idx_arr[ictr*3+1];
1585 
1586  copy_mapping(1, &A->edge_map[iA], &ctr_map[2*i+0]);
1587  copy_mapping(1, &B->edge_map[iB], &ctr_map[2*i+1]);
1588  }
1589  /* for (i=0; i<tsr_order; i++){
1590  if (ctr_map[i].type == PHYSICAL_MAP) is_premapped = 1;
1591  }*/
1592 
1593  extract_free_comms(topo, A->order, A->edge_map,
1594  B->order, B->edge_map,
1595  num_sub_phys_dims, &sub_phys_comm, &comm_idx);
1596 
1597 
1598  /* Map a tensor of dimension 2*num_ctr, with symmetries among each pair.
1599  * Set the edge lengths and symmetries according to those in ctr dims of A and B.
1600  * This gives us a mapping for the contraction dimensions of tensors A and B. */
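 /* Annotation (not part of the original source): e.g. with num_ctr == 2 the
    auxiliary tensor built here has tsr_order == 4; slots (2i, 2i+1) hold the i-th
    contracted index as it appears in A and in B, and tsr_sym_table ties each such
    pair together so that map_tensor assigns the two copies matching phases, which
    check_mapping requires of contracted dimensions (ph_A == ph_B). */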
1601  for (i=0; i<num_ctr; i++){
1602  ictr = idx_ctr[i];
1603  iA = idx_arr[ictr*3+0];
1604  iB = idx_arr[ictr*3+1];
1605 
1606  tsr_edge_len[2*i+0] = A->pad_edge_len[iA];
1607  tsr_edge_len[2*i+1] = A->pad_edge_len[iA];
1608 
1609  tsr_sym_table[2*i*tsr_order+2*i+1] = 1;
1610  tsr_sym_table[(2*i+1)*tsr_order+2*i] = 1;
1611 
1612  /* Check if A has symmetry among the dimensions being contracted over.
1613  * Ignore symmetry with non-contraction dimensions.
1614  * FIXME: this algorithm can be more efficient but should not be a bottleneck */
1615  if (A->sym[iA] != NS){
1616  for (j=0; j<num_ctr; j++){
1617  jctr = idx_ctr[j];
1618  jX = idx_arr[jctr*3+0];
1619  if (jX == iA+1){
1620  tsr_sym_table[2*i*tsr_order+2*j] = 1;
1621  tsr_sym_table[2*i*tsr_order+2*j+1] = 1;
1622  tsr_sym_table[2*j*tsr_order+2*i] = 1;
1623  tsr_sym_table[2*j*tsr_order+2*i+1] = 1;
1624  tsr_sym_table[(2*i+1)*tsr_order+2*j] = 1;
1625  tsr_sym_table[(2*i+1)*tsr_order+2*j+1] = 1;
1626  tsr_sym_table[(2*j+1)*tsr_order+2*i] = 1;
1627  tsr_sym_table[(2*j+1)*tsr_order+2*i+1] = 1;
1628  }
1629  }
1630  }
1631  if (B->sym[iB] != NS){
1632  for (j=0; j<num_ctr; j++){
1633  jctr = idx_ctr[j];
1634  jX = idx_arr[jctr*3+1];
1635  if (jX == iB+1){
1636  tsr_sym_table[2*i*tsr_order+2*j] = 1;
1637  tsr_sym_table[2*i*tsr_order+2*j+1] = 1;
1638  tsr_sym_table[2*j*tsr_order+2*i] = 1;
1639  tsr_sym_table[2*j*tsr_order+2*i+1] = 1;
1640  tsr_sym_table[(2*i+1)*tsr_order+2*j] = 1;
1641  tsr_sym_table[(2*i+1)*tsr_order+2*j+1] = 1;
1642  tsr_sym_table[(2*j+1)*tsr_order+2*i] = 1;
1643  tsr_sym_table[(2*j+1)*tsr_order+2*i+1] = 1;
1644  }
1645  }
1646  }
1647  }
1648  /* Run the mapping algorithm on this construct */
1649  /*if (is_premapped){
1650  stat = map_symtsr(tsr_order, tsr_sym_table, ctr_map);
1651  } else {*/
1652  stat = map_tensor(num_sub_phys_dims, tsr_order,
1653  tsr_edge_len, tsr_sym_table,
1654  restricted, sub_phys_comm,
1655  comm_idx, 0,
1656  ctr_map);
1657 
1658  //}
1659  if (stat == ERROR)
1660  return ERROR;
1661 
1662  /* define mapping of tensors A and B according to the mapping of ctr dims */
1663  if (stat == SUCCESS){
1664  for (i=0; i<num_ctr; i++){
1665  ictr = idx_ctr[i];
1666  iA = idx_arr[ictr*3+0];
1667  iB = idx_arr[ictr*3+1];
1668 
1669  /* A->edge_map[iA] = ctr_map[2*i+0];
1670  B->edge_map[iB] = ctr_map[2*i+1];*/
1671  copy_mapping(1, &ctr_map[2*i+0], &A->edge_map[iA]);
1672  copy_mapping(1, &ctr_map[2*i+1], &B->edge_map[iB]);
1673  }
1674  }
1675  CTF_int::cdealloc(restricted);
1676  CTF_int::cdealloc(tsr_edge_len);
1677  CTF_int::cdealloc(tsr_sym_table);
1678  for (i=0; i<2*num_ctr; i++){
1679  ctr_map[i].clear();
1680  }
1681  CTF_int::cdealloc(ctr_map);
1682  CTF_int::cdealloc(sub_phys_comm);
1683  CTF_int::cdealloc(comm_idx);
1684 
1685  TAU_FSTOP(map_ctr_indices);
1686  return stat;
1687  }
1688 
1700  static int
1701  map_no_ctr_indices(int const * idx_arr,
1702  int const * idx_no_ctr,
1703  int num_tot,
1704  int num_no_ctr,
1705  topology const * topo,
1706  tensor * A,
1707  tensor * B,
1708  tensor * C){
1709  int stat, i, inoctr, iA, iB, iC;
1710 
1711  TAU_FSTART(map_noctr_indices);
1712 
1713  /* for (i=0; i<num_no_ctr; i++){
1714  inoctr = idx_no_ctr[i];
1715  iA = idx_arr[3*inoctr+0];
1716  iB = idx_arr[3*inoctr+1];
1717  iC = idx_arr[3*inoctr+2];
1718 
1719 
1720  if (iC != -1 && iA != -1){
1721  copy_mapping(1, C->edge_map + iC, A->edge_map + iA);
1722  }
1723  if (iB != -1 && iA != -1){
1724  copy_mapping(1, C->edge_map + iB, A->edge_map + iA);
1725  }
1726  }*/
1727  /* Map remainders of A and B to remainders of phys grid */
1728  stat = A->map_tensor_rem(topo->order, topo->dim_comm, 1);
1729  if (stat != SUCCESS){
1730  if (A->order != 0 || B->order != 0 || C->order != 0){
1731  TAU_FSTOP(map_noctr_indices);
1732  return stat;
1733  }
1734  }
1735  for (i=0; i<num_no_ctr; i++){
1736  inoctr = idx_no_ctr[i];
1737  iA = idx_arr[3*inoctr+0];
1738  iB = idx_arr[3*inoctr+1];
1739  iC = idx_arr[3*inoctr+2];
1740 
1741 
1742  if (iA != -1 && iC != -1){
1743  copy_mapping(1, A->edge_map + iA, C->edge_map + iC);
1744  }
1745  if (iB != -1 && iC != -1){
1746  copy_mapping(1, B->edge_map + iB, C->edge_map + iC);
1747  }
1748  }
1749  stat = C->map_tensor_rem(topo->order, topo->dim_comm, 0);
1750  if (stat != SUCCESS){
1751  TAU_FSTOP(map_noctr_indices);
1752  return stat;
1753  }
1754  for (i=0; i<num_no_ctr; i++){
1755  inoctr = idx_no_ctr[i];
1756  iA = idx_arr[3*inoctr+0];
1757  iB = idx_arr[3*inoctr+1];
1758  iC = idx_arr[3*inoctr+2];
1759 
1760 
1761  if (iA != -1 && iC != -1){
1762  copy_mapping(1, C->edge_map + iC, A->edge_map + iA);
1763  }
1764  if (iB != -1 && iC != -1){
1765  copy_mapping(1, C->edge_map + iC, B->edge_map + iB);
1766  }
1767  }
1768  TAU_FSTOP(map_noctr_indices);
1769 
1770  return SUCCESS;
1771  }
1772 
1773 
1783  static int
1784  map_extra_indices(int const * idx_arr,
1785  int const * idx_extra,
1786  int num_extra,
1787  tensor * A,
1788  tensor * B,
1789  tensor * C){
1790  int i, iA, iB, iC, iextra;
1791 
1792 
1793  for (i=0; i<num_extra; i++){
1794  iextra = idx_extra[i];
1795  iA = idx_arr[3*iextra+0];
1796  iB = idx_arr[3*iextra+1];
1797  iC = idx_arr[3*iextra+2];
1798 
1799  if (iA != -1){
1800  //FIXME handle extra indices via reduction
1801  if (A->edge_map[iA].type == PHYSICAL_MAP)
1802  return NEGATIVE;
1803  if (A->edge_map[iA].type == NOT_MAPPED){
1804  A->edge_map[iA].type = VIRTUAL_MAP;
1805  A->edge_map[iA].np = 1;
1806  A->edge_map[iA].has_child = 0;
1807  }
1808  } else {
1809  if (iB != -1) {
1810  if (B->edge_map[iB].type == PHYSICAL_MAP)
1811  return NEGATIVE;
1812  if (B->edge_map[iB].type == NOT_MAPPED){
1813  B->edge_map[iB].type = VIRTUAL_MAP;
1814  B->edge_map[iB].np = 1;
1815  B->edge_map[iB].has_child = 0;
1816  }
1817  } else {
1818  ASSERT(iC != -1);
1819  if (C->edge_map[iC].type == PHYSICAL_MAP)
1820  return NEGATIVE;
1821  if (C->edge_map[iC].type == NOT_MAPPED){
1822  C->edge_map[iC].type = VIRTUAL_MAP;
1823  C->edge_map[iC].np = 1;
1824  C->edge_map[iC].has_child = 0;
1825  }
1826  }
1827  }
1828  }
1829  return SUCCESS;
1830  }
1831 
1832  int contraction::
1833  get_num_map_variants(topology const * topo){
1834  int nAB, nAC, nBC, nmax_ctr_2d;
1835  return get_num_map_variants(topo, nmax_ctr_2d, nAB, nAC, nBC);
1836  }
1837 
1838  int contraction::
1839  get_num_map_variants(topology const * topo,
1840  int & nmax_ctr_2d,
1841  int & nAB,
1842  int & nAC,
1843  int & nBC){
1844  TAU_FSTART(get_num_map_vars);
1845  int num_tot;
1846  int * idx_arr;
1847  inv_idx(A->order, idx_A,
1848  B->order, idx_B,
1849  C->order, idx_C,
1850  &num_tot, &idx_arr);
1851  nAB=0;
1852  nAC=0;
1853  nBC=0;
1854 
1855  for (int i=0; i<num_tot; i++){
1856  if (idx_arr[3*i+0] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] == -1)
1857  nAB++;
1858  if (idx_arr[3*i+0] != -1 && idx_arr[3*i+1] == -1 && idx_arr[3*i+2] != -1)
1859  nAC++;
1860  if (idx_arr[3*i+0] == -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1)
1861  nBC++;
1862  }
1863  nmax_ctr_2d = std::min(topo->order/2,std::min(nAC,std::min(nAB,nBC)));
1864  int nv=0;
1865  for (int nctr_2d=0; nctr_2d<=nmax_ctr_2d; nctr_2d++){
1866  if (topo->order-2*nctr_2d <= num_tot){
1867  nv += std::pow(3,nctr_2d)*choose(num_tot,topo->order-2*nctr_2d)*choose(nAB,nctr_2d)*choose(nAC,nctr_2d)*choose(nBC,nctr_2d);
1868  }
1869  }
1870  cdealloc(idx_arr);
1871  TAU_FSTOP(get_num_map_vars);
1872  return nv;
1873  }
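 /* Annotation (not part of the original source): worked count, assuming a 2-D
    processor grid (topo->order == 2) and a plain matrix product, so num_tot == 3 and
    nAB == nAC == nBC == 1, giving nmax_ctr_2d = 1:

      nctr_2d = 0:  3^0 * choose(3,2) * 1*1*1 = 3
      nctr_2d = 1:  3^1 * choose(3,0) * 1*1*1 = 3

    so get_num_map_variants reports nv = 6 mapping variants for exh_map_to_topo. */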
1874 
1875  bool contraction::switch_topo_perm(){
1876  ASSERT(A->topo == B->topo && B->topo == C->topo);
1877  topology const * topo = A->topo;
1878  int new_order[topo->order];
1879  std::fill(new_order, new_order+topo->order, -1);
1880  int il=0;
1881  for (int i=0; i<A->order; i++){
1882  mapping const * map = A->edge_map+i;
1883  if (map->type == PHYSICAL_MAP && map->has_child && map->child->type == PHYSICAL_MAP){
1884  new_order[map->cdt] = il;
1885  il++;
1886  new_order[map->child->cdt] = il;
1887  il++;
1888  }
1889  }
1890  for (int i=0; i<B->order; i++){
1891  mapping const * map = B->edge_map+i;
1892  if (map->type == PHYSICAL_MAP && map->has_child && map->child->type == PHYSICAL_MAP){
1893  if (new_order[map->cdt] != -1 || new_order[map->child->cdt] != -1){
1894  if (new_order[map->child->cdt] != new_order[map->cdt]+1)
1895  return false;
1896  } else {
1897  new_order[map->cdt] = il;
1898  il++;
1899  new_order[map->child->cdt] = il;
1900  il++;
1901  }
1902  }
1903  }
1904  for (int i=0; i<C->order; i++){
1905  mapping const * map = C->edge_map+i;
1906  if (map->type == PHYSICAL_MAP && map->has_child && map->child->type == PHYSICAL_MAP){
1907  if (new_order[map->cdt] != -1 || new_order[map->child->cdt] != -1){
1908  if (new_order[map->child->cdt] != new_order[map->cdt]+1)
1909  return false;
1910  } else {
1911  new_order[map->cdt] = il;
1912  il++;
1913  new_order[map->child->cdt] = il;
1914  il++;
1915  }
1916  }
1917  }
1918 
1919  for (int i=0; i<topo->order; i++){
1920  if (new_order[i] == -1){
1921  new_order[i] = il;
1922  il++;
1923  }
1924  }
1925  int new_lens[topo->order];
1926  for (int i=0; i<topo->order; i++){
1927  new_lens[new_order[i]] = topo->lens[i];
1928 // printf("new_order[%d/%d] = %d, new_lens[%d] = %d\n", i, topo->order, new_order[i], new_order[i], new_lens[new_order[i]]);
1929  }
1930  topology * new_topo = NULL;
1931  for (int i=0; i<(int)A->wrld->topovec.size(); i++){
1932  if (A->wrld->topovec[i]->order == topo->order){
1933  bool has_same_len = true;
1934  for (int j=0; j<topo->order; j++){
1935  if (A->wrld->topovec[i]->lens[j] != new_lens[j]) has_same_len = false;
1936  }
1937  if (has_same_len){
1938  new_topo = A->wrld->topovec[i];
1939  break;
1940  }
1941  }
1942  }
1943  ASSERT(new_topo != NULL);
1944  A->topo = new_topo;
1945  B->topo = new_topo;
1946  C->topo = new_topo;
1947  for (int i=0; i<A->order; i++){
1948  mapping * map = A->edge_map + i;
1949  while (map != NULL && map->type == PHYSICAL_MAP){
1950  map->cdt = new_order[map->cdt];
1951  if (map->has_child) map = map->child;
1952  else map = NULL;
1953  }
1954  }
1955  for (int i=0; i<B->order; i++){
1956  mapping * map = B->edge_map + i;
1957  while (map != NULL && map->type == PHYSICAL_MAP){
1958  map->cdt = new_order[map->cdt];
1959  if (map->has_child) map = map->child;
1960  else map = NULL;
1961  }
1962  }
1963  for (int i=0; i<C->order; i++){
1964  mapping * map = C->edge_map + i;
1965  while (map != NULL && map->type == PHYSICAL_MAP){
1966  map->cdt = new_order[map->cdt];
1967  if (map->has_child) map = map->child;
1968  else map = NULL;
1969  }
1970  }
1971  return true;
1972 
1973  }
1974 
1975  bool contraction::
1976  exh_map_to_topo(topology const * topo,
1977  int variant){
1978 
1979  int num_tot;
1980  int * idx_arr;
1981  inv_idx(A->order, idx_A,
1982  B->order, idx_B,
1983  C->order, idx_C,
1984  &num_tot, &idx_arr);
1985 
1986  int nAB, nAC, nBC, nmax_ctr_2d;
1987  int nvar_tot = get_num_map_variants(topo, nmax_ctr_2d, nAB, nAC, nBC);
1988  ASSERT(variant<nvar_tot);
1989 
1990  int nv=0;
1991  for (int nctr_2d=0; nctr_2d<=nmax_ctr_2d; nctr_2d++){
1992  int nv0 = nv;
1993  if (topo->order-2*nctr_2d <= num_tot){
1994  nv += std::pow(3,nctr_2d)*choose(num_tot,topo->order-2*nctr_2d)*choose(nAB,nctr_2d)*choose(nAC,nctr_2d)*choose(nBC,nctr_2d);
1995  }
1996  if (nv > variant){
1997  int v = variant - nv0;
1998  int rep_choices = choose(num_tot,topo->order-2*nctr_2d);
1999  int rep_ch = v%rep_choices;
2000  int rep_inds[topo->order-2*nctr_2d];
2001  get_choice(num_tot,topo->order-2*nctr_2d,rep_ch,rep_inds);
2002  for (int i=2*nctr_2d; i<topo->order; i++){
2003  int r = rep_inds[i-2*nctr_2d];
2004  if (idx_arr[3*r+0] != -1)
2005  A->edge_map[idx_arr[3*r+0]].aug_phys(topo, i);
2006  if (idx_arr[3*r+1] != -1)
2007  B->edge_map[idx_arr[3*r+1]].aug_phys(topo, i);
2008  if (idx_arr[3*r+2] != -1)
2009  C->edge_map[idx_arr[3*r+2]].aug_phys(topo, i);
2010  }
2011  int iAB[nctr_2d];
2012  int iAC[nctr_2d];
2013  int iBC[nctr_2d];
2014  int ord[nctr_2d];
2015  v = v/rep_choices;
2016  for (int i=0; i<nctr_2d; i++){
2017  ord[i] = v%3;
2018  v = v/3;
2019  }
2020  get_choice(nAB,nctr_2d,v%choose(nAB,nctr_2d),iAB);
2021  v = v/choose(nAB,nctr_2d);
2022  get_choice(nAC,nctr_2d,v%choose(nAC,nctr_2d),iAC);
2023  v = v/choose(nAC,nctr_2d);
2024  get_choice(nBC,nctr_2d,v%choose(nBC,nctr_2d),iBC);
2025  v = v/choose(nBC,nctr_2d);
2026 
2027  for (int i=0; i<nctr_2d; i++){
2028  // printf("iAB[%d] = %d iAC[%d] = %d iBC[%d] = %d ord[%d] = %d\n", i, iAB[i], i, iAC[i], i, iBC[i], i, ord[i]);
2029  int iiAB=0;
2030  int iiAC=0;
2031  int iiBC=0;
2032  for (int j=0; j<num_tot; j++){
2033  if (idx_arr[3*j+0] != -1 && idx_arr[3*j+1] != -1 && idx_arr[3*j+2] == -1){
2034  if (iAB[i] == iiAB){
2035  switch (ord[i]){
2036  case 0:
2037  A->edge_map[idx_arr[3*j+0]].aug_phys(topo, 2*i);
2038  B->edge_map[idx_arr[3*j+1]].aug_phys(topo, 2*i+1);
2039  break;
2040  case 1:
2041  case 2:
2042  A->edge_map[idx_arr[3*j+0]].aug_phys(topo, 2*i);
2043  B->edge_map[idx_arr[3*j+1]].aug_phys(topo, 2*i);
2044  break;
2045  }
2046  }
2047  iiAB++;
2048  }
2049  if (idx_arr[3*j+0] != -1 && idx_arr[3*j+1] == -1 && idx_arr[3*j+2] != -1){
2050  if (iAC[i] == iiAC){
2051  switch (ord[i]){
2052  case 0:
2053  A->edge_map[idx_arr[3*j+0]].aug_phys(topo, 2*i+1);
2054  C->edge_map[idx_arr[3*j+2]].aug_phys(topo, 2*i+1);
2055  break;
2056  case 1:
2057  A->edge_map[idx_arr[3*j+0]].aug_phys(topo, 2*i+1);
2058  C->edge_map[idx_arr[3*j+2]].aug_phys(topo, 2*i);
2059  break;
2060  case 2:
2061  A->edge_map[idx_arr[3*j+0]].aug_phys(topo, 2*i+1);
2062  C->edge_map[idx_arr[3*j+2]].aug_phys(topo, 2*i+1);
2063  break;
2064  }
2065  }
2066  iiAC++;
2067  }
2068  if (idx_arr[3*j+0] == -1 && idx_arr[3*j+1] != -1 && idx_arr[3*j+2] != -1){
2069  if (iBC[i] == iiBC){
2070  switch (ord[i]){
2071  case 2:
2072  B->edge_map[idx_arr[3*j+1]].aug_phys(topo, 2*i+1);
2073  C->edge_map[idx_arr[3*j+2]].aug_phys(topo, 2*i);
2074  break;
2075  case 0:
2076  B->edge_map[idx_arr[3*j+1]].aug_phys(topo, 2*i);
2077  C->edge_map[idx_arr[3*j+2]].aug_phys(topo, 2*i);
2078  break;
2079  case 1:
2080  B->edge_map[idx_arr[3*j+1]].aug_phys(topo, 2*i+1);
2081  C->edge_map[idx_arr[3*j+2]].aug_phys(topo, 2*i+1);
2082  break;
2083  }
2084  }
2085  iiBC++;
2086  }
2087  }
2088  }
2089  break;
2090  }
2091  }
2092  /* int num_totp = num_tot+1;
2093  int num_choices = num_totp*num_totp;
2094  int nv = variant;
2095  // go in reverse order to make aug_phys have potential to be correct order with multiple phys dims
2096  for (int idim=topo->order-1; idim>=0; idim--){
2097  int i = (nv%num_choices)/num_totp;
2098  int j = (nv%num_choices)%num_totp;
2099  nv = nv/num_choices;
2100  int iA = -1;
2101  int iB = -1;
2102  int iC = -1;
2103  if (i!=num_tot){
2104  iA = idx_arr[3*i+0];
2105  iB = idx_arr[3*i+1];
2106  iC = idx_arr[3*i+2];
2107  if (i==j){
2108  if (iA != -1 || iB != -1 || iC != -1) return false;
2109  else {
2110  A->edge_map[iA].aug_phys(topo, idim);
2111  B->edge_map[iB].aug_phys(topo, idim);
2112  C->edge_map[iC].aug_phys(topo, idim);
2113  }
2114  }
2115  }
2116  int jA = -1;
2117  int jB = -1;
2118  int jC = -1;
2119  if (j!= num_tot && j!= i){
2120  jA = idx_arr[3*j+0];
2121  jB = idx_arr[3*j+1];
2122  jC = idx_arr[3*j+2];
2123  }
2124  if (i!=j){
2125  if (iA != -1) A->edge_map[iA].aug_phys(topo, idim);
2126  else if (jA != -1) A->edge_map[jA].aug_phys(topo, idim);
2127  if (iB != -1) B->edge_map[iB].aug_phys(topo, idim);
2128  else if (jB != -1) B->edge_map[jB].aug_phys(topo, idim);
2129  if (iC != -1) C->edge_map[iC].aug_phys(topo, idim);
2130  else if (jC != -1) C->edge_map[jC].aug_phys(topo, idim);
2131  }
2132  }*/
2133 
2134 
2135  //A->order*B->order*C->order+A->order*B->order+A->order*C->order+B->order*C->order+A->order+B->order+C->order+1;
2136 /* int nv = variant;
2137  for (int idim=0; idim<topo->order; idim++){
2138  int iv = nv%num_choices;
2139  nv = nv/num_choices;
2140  if (iv == 0) continue;
2141  iv--;
2142  if (iv < A->order+B->order+C->order){
2143  if (iv < A->order){
2144  A->edge_map[iv].aug_phys(topo, idim);
2145  continue;
2146  }
2147  iv -= A->order;
2148  if (iv < B->order){
2149  B->edge_map[iv].aug_phys(topo, idim);
2150  continue;
2151  }
2152  iv -= B->order;
2153  if (iv < C->order){
2154  C->edge_map[iv].aug_phys(topo, idim);
2155  continue;
2156  }
2157  }
2158  iv -= A->order+B->order+C->order;
2159  if (iv < A->order*B->order+A->order*C->order+B->order*C->order){
2160  if (iv < A->order*B->order){
2161  A->edge_map[iv%A->order].aug_phys(topo, idim);
2162  B->edge_map[iv/A->order].aug_phys(topo, idim);
2163  continue;
2164  }
2165  iv -= A->order*B->order;
2166  if (iv < A->order*C->order){
2167  A->edge_map[iv%A->order].aug_phys(topo, idim);
2168  C->edge_map[iv/A->order].aug_phys(topo, idim);
2169  continue;
2170  }
2171  iv -= A->order*C->order;
2172  if (iv < B->order*C->order){
2173  B->edge_map[iv%B->order].aug_phys(topo, idim);
2174  C->edge_map[iv/B->order].aug_phys(topo, idim);
2175  continue;
2176  }
2177  }
2178  iv -= A->order*B->order+A->order*C->order+B->order*C->order;
2179  A->edge_map[iv%A->order].aug_phys(topo, idim);
2180  iv = iv/A->order;
2181  B->edge_map[iv%B->order].aug_phys(topo, idim);
2182  iv = iv/B->order;
2183  C->edge_map[iv%C->order].aug_phys(topo, idim);
2184  } */
2185  bool is_mod = true;
2186  while (is_mod){
2187  is_mod = false;
2188  int ret;
2189  ret = map_symtsr(A->order, A->sym_table, A->edge_map);
2190  ASSERT(ret == SUCCESS); //if (ret!=SUCCESS) return ret;
2191  ret = map_symtsr(B->order, B->sym_table, B->edge_map);
2192  ASSERT(ret == SUCCESS); //if (ret!=SUCCESS) return ret;
2193  ret = map_symtsr(C->order, C->sym_table, C->edge_map);
2194  ASSERT(ret == SUCCESS); //if (ret!=SUCCESS) return ret;
2195 
2196  for (int i=0; i<num_tot; i++){
2197  int iA = idx_arr[3*i+0];
2198  int iB = idx_arr[3*i+1];
2199  int iC = idx_arr[3*i+2];
2200  int lcm_phase = 1;
2201  if (iA != -1) lcm_phase = lcm(lcm_phase,A->edge_map[iA].calc_phase());
2202  if (iB != -1) lcm_phase = lcm(lcm_phase,B->edge_map[iB].calc_phase());
2203  if (iC != -1) lcm_phase = lcm(lcm_phase,C->edge_map[iC].calc_phase());
2204  if (iA != -1 && lcm_phase != A->edge_map[iA].calc_phase()){
2205  A->edge_map[iA].aug_virt(lcm_phase);
2206  is_mod = true;
2207  }
2208  if (iB != -1 && lcm_phase != B->edge_map[iB].calc_phase()){
2209  B->edge_map[iB].aug_virt(lcm_phase);
2210  is_mod = true;
2211  }
2212  if (iC != -1 && lcm_phase != C->edge_map[iC].calc_phase()){
2213  C->edge_map[iC].aug_virt(lcm_phase);
2214  is_mod = true;
2215  }
2216  //printf("i=%d lcm_phase=%d\n",i,lcm_phase);
2217  }
2218  }
2219  cdealloc(idx_arr);
2220  return true;
2221  }
2222 
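  // Maps the contraction onto the given topology; 'order' selects one of the six
  // permutations of (A,B,C) used as the mapping order. The tensors are first permuted
  // accordingly, then each index class (weigh, contracted, non-contracted, extra) is
  // mapped in turn.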
2223  int contraction::
2224  map_to_topology(topology * topo,
2225  int order){
2226  /* int * idx_ctr,
2227  int * idx_extra,
2228  int * idx_no_ctr,
2229  int * idx_weigh){*/
2230  int num_tot, num_ctr, num_no_ctr, num_weigh, num_extra, i, ret;
2231  int const * tidx_A, * tidx_B, * tidx_C;
2232  int * idx_arr, * idx_extra, * idx_no_ctr, * idx_weigh, * idx_ctr;
2233 
2234  tensor * tA, * tB, * tC;
2235  get_perm<tensor*>(order, A, B, C, tA, tB, tC);
2236  get_perm<const int*>(order, idx_A, idx_B, idx_C, tidx_A, tidx_B, tidx_C);
2237 
2238  inv_idx(tA->order, tidx_A,
2239  tB->order, tidx_B,
2240  tC->order, tidx_C,
2241  &num_tot, &idx_arr);
2242 
2243  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&idx_no_ctr);
2244  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&idx_extra);
2245  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&idx_weigh);
2246  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&idx_ctr);
2247  num_ctr = 0, num_no_ctr = 0, num_extra = 0, num_weigh = 0;
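  /* Classify each index: present in all three tensors -> weigh index,
   in (permuted) A and B only -> contracted index, in C and one operand ->
   non-contracted (output) index, otherwise -> extra index. */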
2248  for (i=0; i<num_tot; i++){
2249  if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
2250  idx_weigh[num_weigh] = i;
2251  num_weigh++;
2252  } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
2253  idx_ctr[num_ctr] = i;
2254  num_ctr++;
2255  } else if (idx_arr[3*i+2] != -1 &&
2256  ((idx_arr[3*i+0] != -1) || (idx_arr[3*i+1] != -1))){
2257  idx_no_ctr[num_no_ctr] = i;
2258  num_no_ctr++;
2259  } else {
2260  idx_extra[num_extra] = i;
2261  num_extra++;
2262  }
2263  }
2264  tA->topo = topo;
2265  tB->topo = topo;
2266  tC->topo = topo;
2267 
2268  /* Map the weigh indices of A, B, and C*/
2269 
2270 
2271  int stat;
2272  ret = map_weigh_indices(idx_arr, idx_weigh, num_tot, num_weigh, topo, tA, tB, tC);
2273  do {
2274  if (ret == NEGATIVE) {
2275  stat = ret;
2276  break;
2277  }
2278  if (ret == ERROR) {
2279  stat = ret;
2280  break;
2281  }
2282 
2283 
2284  /* Map the contraction indices of A and B */
2285  ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr, topo, tA, tB);
2286  if (ret == NEGATIVE) {
2287  stat = ret;
2288  break;
2289  }
2290  if (ret == ERROR) {
2291  stat = ret;
2292  break;
2293  }
2294 
2295 
2296  /* ret = map_self_indices(tA, tidx_A);
2297  if (ret == NEGATIVE) {
2298  CTF_int::cdealloc(idx_arr);
2299  return NEGATIVE;
2300  }
2301  if (ret == ERROR) {
2302  CTF_int::cdealloc(idx_arr);
2303  return ERROR;
2304  }
2305  ret = map_self_indices(tB, tidx_B);
2306  if (ret == NEGATIVE) {
2307  CTF_int::cdealloc(idx_arr);
2308  return NEGATIVE;
2309  }
2310  if (ret == ERROR) {
2311  CTF_int::cdealloc(idx_arr);
2312  return ERROR;
2313  }
2314  ret = map_self_indices(tC, tidx_C);
2315  if (ret == NEGATIVE) {
2316  CTF_int::cdealloc(idx_arr);
2317  return NEGATIVE;
2318  }
2319  if (ret == ERROR) {
2320  CTF_int::cdealloc(idx_arr);
2321  return ERROR;
2322  }*/
2323  ret = map_extra_indices(idx_arr, idx_extra, num_extra, tA, tB, tC);
2324  if (ret == NEGATIVE) {
2325  stat = ret;
2326  break;
2327  }
2328  if (ret == ERROR) {
2329  stat = ret;
2330  break;
2331  }
2332 
2333 
2334  /* Map C or equivalently, the non-contraction indices of A and B */
2335  ret = map_no_ctr_indices(idx_arr, idx_no_ctr, num_tot, num_no_ctr, topo, tA, tB, tC);
2336  if (ret == NEGATIVE){
2337  stat = ret;
2338  break;
2339  }
2340  if (ret == ERROR) {
2341  return ERROR;
2342  }
2343  ret = map_symtsr(tA->order, tA->sym_table, tA->edge_map);
2344  if (ret!=SUCCESS) return ret;
2345  ret = map_symtsr(tB->order, tB->sym_table, tB->edge_map);
2346  if (ret!=SUCCESS) return ret;
2347  ret = map_symtsr(tC->order, tC->sym_table, tC->edge_map);
2348  if (ret!=SUCCESS) return ret;
2349 
2350  /* Do it again to make sure everything is properly mapped. FIXME: loop */
2351  ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr, topo, tA ,tB);
2352  if (ret == NEGATIVE){
2353  stat = ret;
2354  break;
2355  }
2356  if (ret == ERROR) {
2357  return ERROR;
2358  }
2359  ret = map_no_ctr_indices(idx_arr, idx_no_ctr, num_tot, num_no_ctr, topo, tA, tB, tC);
2360  if (ret == NEGATIVE){
2361  stat = ret;
2362  break;
2363  }
2364  if (ret == ERROR) {
2365  return ERROR;
2366  }
2367 
2368  /*ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr,
2369  tA, tB, topo);*/
2370  /* Map C or equivalently, the non-contraction indices of A and B */
2371  /*ret = map_no_ctr_indices(idx_arr, idx_no_ctr, num_tot, num_no_ctr,
2372  tA, tB, tC, topo);*/
2373  ret = map_symtsr(tA->order, tA->sym_table, tA->edge_map);
2374  if (ret!=SUCCESS) return ret;
2375  ret = map_symtsr(tB->order, tB->sym_table, tB->edge_map);
2376  if (ret!=SUCCESS) return ret;
2377  ret = map_symtsr(tC->order, tC->sym_table, tC->edge_map);
2378  if (ret!=SUCCESS) return ret;
2379 
2380 
2381  stat = SUCCESS;
2382  } while(0);
2383 
2384  cdealloc(idx_arr); cdealloc(idx_ctr); cdealloc(idx_extra); cdealloc(idx_no_ctr); cdealloc(idx_weigh);
2385  return stat;
2386  }
2387 
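  // If A, B, and C ended up on different topologies, keep the tensor whose topology
  // has the highest order and attempt to morph the other two onto it; returns
  // NEGATIVE if either topology cannot be morphed.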
2388  int contraction::try_topo_morph(){
2389  topology * tA, * tB, * tC;
2390  int ret;
2391  tensor * tsr_keep, * tsr_change_A, * tsr_change_B;
2392 
2393  tA = A->topo;
2394  tB = B->topo;
2395  tC = C->topo;
2396 
2397  if (tA == tB && tB == tC){
2398  return SUCCESS;
2399  }
2400 
2401  if (tA->order >= tB->order){
2402  if (tA->order >= tC->order){
2403  tsr_keep = A;
2404  tsr_change_A = B;
2405  tsr_change_B = C;
2406  } else {
2407  tsr_keep = C;
2408  tsr_change_A = A;
2409  tsr_change_B = B;
2410  }
2411  } else {
2412  if (tB->order >= tC->order){
2413  tsr_keep = B;
2414  tsr_change_A = A;
2415  tsr_change_B = C;
2416  } else {
2417  tsr_keep = C;
2418  tsr_change_A = A;
2419  tsr_change_B = B;
2420  }
2421  }
2422 
2423  tA = tsr_change_A->topo;
2424  tB = tsr_change_B->topo;
2425  tC = tsr_keep->topo;
2426 
2427  if (tA != tC){
2428  ret = can_morph(tC, tA);
2429  if (!ret)
2430  return NEGATIVE;
2431  }
2432  if (tB != tC){
2433  ret = can_morph(tC, tB);
2434  if (!ret)
2435  return NEGATIVE;
2436  }
2437 
2438  if (tA != tC){
2439  morph_topo(tC, tA,
2440  tsr_change_A->order, tsr_change_A->edge_map);
2441  tsr_change_A->topo = tC;
2442  }
2443  if (tB != tC){
2444  morph_topo(tC, tB,
2445  tsr_change_B->order, tsr_change_B->edge_map);
2446  tsr_change_B->topo = tC;
2447  }
2448  return SUCCESS;
2449 
2450  }
2451 
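  // Selective mapping search: each rank evaluates a subset of (topology, permutation)
  // candidates, estimating contraction time plus any redistribution cost and skipping
  // candidates that exceed available memory; the globally best estimate and its index
  // are then reduced across all processes.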
2452  void contraction::get_best_sel_map(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, int & idx, double & time){
2453  int ret, j, d;
2454  int need_remap_A, need_remap_B, need_remap_C;
2455  int64_t memuse;//, bmemuse;
2456  double est_time, best_time;
2457  int btopo;
2458  bool is_ctr_sparse = is_sparse();
2459  World * wrld = A->wrld;
2460  CommData global_comm = wrld->cdt;
2461  btopo = -1;
2462  best_time = DBL_MAX;
2463  int num_tot;
2464  int * idx_arr;
2465  inv_idx(A->order, idx_A,
2466  B->order, idx_B,
2467  C->order, idx_C,
2468  &num_tot, &idx_arr);
2469  TAU_FSTART(evaluate_mappings);
2470  int64_t max_memuse = proc_bytes_available();
2471  for (j=0; j<6; j++){
2472  // Attempt to map to all possible permutations of processor topology
2473  #if DEBUG > 2
2474  for (int t=1; t<(int)wrld->topovec.size()+8; t++){
2475  #else
2476  for (int t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=global_comm.np){
2477  #endif
2478  A->clear_mapping();
2479  B->clear_mapping();
2480  C->clear_mapping();
2481  A->set_padding();
2482  B->set_padding();
2483  C->set_padding();
2484 
2485  topology * topo_i = NULL;
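  // For t < 8, bits 1, 2, and 4 of t select reuse of the existing topology and
  // mapping of A, B, and C respectively; for t >= 8, topology t-8 is taken from
  // the world's topology list.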
2486  if (t < 8){
2487  if ((t & 1) > 0){
2488  if (old_topo_A == NULL) continue;
2489  else {
2490  topo_i = old_topo_A;
2491  copy_mapping(A->order, old_map_A, A->edge_map);
2492  }
2493  }
2494  if ((t & 2) > 0){
2495  if (old_topo_B == NULL || (topo_i != NULL && topo_i != old_topo_B)) continue;
2496  else {
2497  topo_i = old_topo_B;
2498  copy_mapping(B->order, old_map_B, B->edge_map);
2499  }
2500  }
2501 
2502  if ((t & 4) > 0){
2503  if (old_topo_C == NULL || (topo_i != NULL && topo_i != old_topo_C)) continue;
2504  else {
2505  topo_i = old_topo_C;
2506  copy_mapping(C->order, old_map_C, C->edge_map);
2507  }
2508  }
2509  } else topo_i = wrld->topovec[t-8];
2510  ASSERT(topo_i != NULL);
2511 
2512  ret = map_to_topology(topo_i, j);
2513 
2514  if (ret == NEGATIVE){
2515  //printf("map_to_topology returned negative\n");
2516  continue;
2517  }
2518 
2519  A->is_mapped = 1;
2520  B->is_mapped = 1;
2521  C->is_mapped = 1;
2522  A->topo = topo_i;
2523  B->topo = topo_i;
2524  C->topo = topo_i;
2525 
2526  if (check_mapping() == 0){
2527  continue;
2528  }
2529  est_time = 0.0;
2530  A->set_padding();
2531  B->set_padding();
2532  C->set_padding();
2533  #if DEBUG >= 3
2534  if (global_comm.rank == 0){
2535  printf("\nTest mappings:\n");
2536  A->print_map(stdout, 0);
2537  B->print_map(stdout, 0);
2538  C->print_map(stdout, 0);
2539  }
2540  #endif
2541  ctr * sctr;
2542  double nnz_frac_A = 1.0;
2543  double nnz_frac_B = 1.0;
2544  double nnz_frac_C = 1.0;
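  // Estimate the fraction of nonzeros per allocated element for each sparse operand;
  // for sparse C, the output density is bounded below by the product of the input
  // densities times the contracted length and capped at 1.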
2545  if (A->is_sparse) nnz_frac_A = std::min(1.,((double)A->nnz_tot)/(A->size*A->calc_npe()));
2546  if (B->is_sparse) nnz_frac_B = std::min(1.,((double)B->nnz_tot)/(B->size*B->calc_npe()));
2547  if (C->is_sparse){
2548  nnz_frac_C = std::min(1.,((double)C->nnz_tot)/(C->size*C->calc_npe()));
2549  int64_t len_ctr = 1;
2550  for (int i=0; i<num_tot; i++){
2551  if (idx_arr[3*i+2]==-1){
2552  int edge_len = idx_arr[3*i+0] != -1 ? A->lens[idx_arr[3*i+0]]
2553  : B->lens[idx_arr[3*i+1]];
2554  len_ctr *= edge_len;
2555  }
2556  }
2557  nnz_frac_C = std::min(1.,std::max(nnz_frac_C,nnz_frac_A*nnz_frac_B*len_ctr));
2558  }
2559  // check this early on to avoid 64-bit integer overflow
2560  double size_memuse = A->size*nnz_frac_A*A->sr->el_size + B->size*nnz_frac_B*B->sr->el_size + C->size*nnz_frac_C*C->sr->el_size;
2561  if (size_memuse >= (double)max_memuse){
2562  if (global_comm.rank == 0)
2563  DPRINTF(1,"Not enough memory available for topo %d with order %d to store tensors %ld/%ld\n", t,j,(int64_t)size_memuse,max_memuse);
2564  continue;
2565  }
2566 
2567  #if FOLD_TSR
2568  if (can_fold()){
2569  est_time = est_time_fold();
2570  iparam prm = map_fold(false);
2571  sctr = construct_ctr(1, &prm);
2572  if (is_ctr_sparse)
2573  est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
2574  else
2575  est_time = sctr->est_time_rec(sctr->num_lyr);
2576  A->remove_fold();
2577  B->remove_fold();
2578  C->remove_fold();
2579  } else
2580  #endif
2581  {
2582  sctr = construct_ctr();
2583  if (is_ctr_sparse){
2584  est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
2585  } else {
2586  est_time = sctr->est_time_rec(sctr->num_lyr);
2587  }
2588  }
2589  #if DEBUG >= 3
2590  if (global_comm.rank == 0){
2591  printf("mapping passed contr est_time = %E sec\n", est_time);
2592  }
2593  #endif
2594  ASSERT(est_time >= 0.0);
2595  memuse = 0;
2596  need_remap_A = 0;
2597  need_remap_B = 0;
2598  need_remap_C = 0;
2599  if (topo_i == old_topo_A){
2600  for (d=0; d<A->order; d++){
2601  if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
2602  need_remap_A = 1;
2603  }
2604  } else
2605  need_remap_A = 1;
2606  if (need_remap_A) {
2607  est_time += A->est_redist_time(*dA, nnz_frac_A);
2608  memuse = A->get_redist_mem(*dA, nnz_frac_A);
2609  } else
2610  memuse = 0;
2611  if (topo_i == old_topo_B){
2612  for (d=0; d<B->order; d++){
2613  if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
2614  need_remap_B = 1;
2615  }
2616  } else
2617  need_remap_B = 1;
2618  if (need_remap_B) {
2619  est_time += B->est_redist_time(*dB, nnz_frac_B);
2620  memuse = std::max(memuse,B->get_redist_mem(*dB, nnz_frac_B));
2621  }
2622  if (topo_i == old_topo_C){
2623  for (d=0; d<C->order; d++){
2624  if (!comp_dim_map(&C->edge_map[d],&old_map_C[d]))
2625  need_remap_C = 1;
2626  }
2627  } else
2628  need_remap_C = 1;
2629  if (need_remap_C) {
2630  est_time += 2.*C->est_redist_time(*dC, nnz_frac_C);
2631  memuse = std::max(1.0*memuse,2.*C->get_redist_mem(*dC, nnz_frac_C));
2632  }
2633  if (is_ctr_sparse)
2634  memuse = MAX(((spctr*)sctr)->spmem_rec(nnz_frac_A,nnz_frac_B,nnz_frac_C), memuse);
2635  else
2636  memuse = MAX((int64_t)sctr->mem_rec(), memuse);
2637  #if DEBUG >= 3
2638  if (global_comm.rank == 0){
2639  printf("total (with redistribution and transp) est_time = %E\n", est_time);
2640  }
2641  #endif
2642  ASSERT(est_time >= 0.0);
2643 
2644  if ((int64_t)memuse >= max_memuse){
2645  if (global_comm.rank == 0)
2646  DPRINTF(1,"Not enough memory available for topo %d with order %d memory %ld/%ld\n", t,j,memuse,max_memuse);
2647  delete sctr;
2648  continue;
2649  }
2650  if ((!A->is_sparse && A->size > INT_MAX) ||(!B->is_sparse && B->size > INT_MAX) || (!C->is_sparse && C->size > INT_MAX)){
2651  DPRINTF(1,"MPI does not handle enough bits for topo %d with order %d\n", t, j);
2652  delete sctr;
2653  continue;
2654  }
2655 
2656  if (est_time < best_time) {
2657  best_time = est_time;
2658  //bmemuse = memuse;
2659  btopo = 6*t+j;
2660  }
2661  delete sctr;
2662  }
2663  }
2664  TAU_FSTOP(evaluate_mappings);
2665  TAU_FSTART(all_select_ctr_map);
2666  double gbest_time;
2667  MPI_Allreduce(&best_time, &gbest_time, 1, MPI_DOUBLE, MPI_MIN, global_comm.cm);
2668  if (best_time != gbest_time){
2669  btopo = INT_MAX;
2670  }
2671  int ttopo;
2672  MPI_Allreduce(&btopo, &ttopo, 1, MPI_INT, MPI_MIN, global_comm.cm);
2673  TAU_FSTOP(all_select_ctr_map);
2674 
2675  idx=ttopo;
2676  time=gbest_time;
2677 
2678  cdealloc(idx_arr);
2679  }
2680 
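  // Exhaustive mapping search: enumerates every mapping variant of every topology
  // (distributed cyclically over ranks), keeping the best time estimate that fits in
  // memory; init_best_time lets the caller prune against the selective search result.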
2681  void contraction::get_best_exh_map(distribution const * dA, distribution const * dB, distribution const * dC, topology * old_topo_A, topology * old_topo_B, topology * old_topo_C, mapping const * old_map_A, mapping const * old_map_B, mapping const * old_map_C, int & idx, double & time, double init_best_time=DBL_MAX){
2682  int d;
2683  int need_remap_A, need_remap_B, need_remap_C;
2684  int64_t memuse;//, bmemuse;
2685  double est_time, best_time;
2686  int btopo;
2687  bool is_ctr_sparse = is_sparse();
2688  World * wrld = A->wrld;
2689  CommData global_comm = wrld->cdt;
2690  btopo = -1;
2691  best_time = init_best_time;
2692  int num_tot;
2693  int * idx_arr;
2694  inv_idx(A->order, idx_A,
2695  B->order, idx_B,
2696  C->order, idx_C,
2697  &num_tot, &idx_arr);
2698  int64_t tot_num_choices = 0;
2699  for (int i=0; i<(int)wrld->topovec.size(); i++){
2700  // tot_num_choices += pow(num_choices,(int)wrld->topovec[i]->order);
2701  tot_num_choices += get_num_map_variants(wrld->topovec[i]);
2702  }
2703  int64_t valid_mappings = 0;
2704  int64_t choice_offset = 0;
2705  int64_t max_memuse = proc_bytes_available();
2706  TAU_FSTOP(init_select_ctr_map);
2707  for (int i=0; i<(int)wrld->topovec.size(); i++){
2708 // int tnum_choices = pow(num_choices,(int) wrld->topovec[i]->order);
2709  int tnum_choices = get_num_map_variants(wrld->topovec[i]);
2710 
2711  int64_t old_off = choice_offset;
2712  choice_offset += tnum_choices;
2713  for (int j=0; j<tnum_choices; j++){
2714  if ((old_off + j)%global_comm.np != global_comm.rank)
2715  continue;
2716  A->clear_mapping();
2717  B->clear_mapping();
2718  C->clear_mapping();
2719  A->set_padding();
2720  B->set_padding();
2721  C->set_padding();
2722  topology * topo_i = wrld->topovec[i];
2723  TAU_FSTART(exh_map);
2724  bool br = exh_map_to_topo(topo_i, j);
2725  TAU_FSTOP(exh_map);
2726  if (!br) DPRINTF(3,"exh_map_to_topo returned false\n");
2727  if (!br) continue;
2728  A->is_mapped = 1;
2729  B->is_mapped = 1;
2730  C->is_mapped = 1;
2731  A->topo = topo_i;
2732  B->topo = topo_i;
2733  C->topo = topo_i;
2734 
2735  TAU_FSTART(switch_topo_perm);
2736  br = switch_topo_perm();
2737  TAU_FSTOP(switch_topo_perm);
2738  if (!br){ DPRINTF(3,"switch topo perm returned false\n"); }
2739  if (!br) continue;
2740  TAU_FSTART(check_ctr_mapping);
2741  if (check_mapping() == 0){
2742  TAU_FSTOP(check_ctr_mapping);
2743  continue;
2744  }
2745  TAU_FSTOP(check_ctr_mapping);
2746  valid_mappings++;
2747  est_time = 0.0;
2748 
2749  TAU_FSTART(est_ctr_map_time);
2750  A->set_padding();
2751  B->set_padding();
2752  C->set_padding();
2753  #if DEBUG >= 4
2754  printf("\nTest mappings:\n");
2755  A->print_map(stdout, 0);
2756  B->print_map(stdout, 0);
2757  C->print_map(stdout, 0);
2758  #endif
2759 
2760 
2761  double nnz_frac_A = 1.0;
2762  double nnz_frac_B = 1.0;
2763  double nnz_frac_C = 1.0;
2764  if (A->is_sparse) nnz_frac_A = std::min(1.,((double)A->nnz_tot)/(A->size*A->calc_npe()));
2765  if (B->is_sparse) nnz_frac_B = std::min(1.,((double)B->nnz_tot)/(B->size*B->calc_npe()));
2766  if (C->is_sparse){
2767  nnz_frac_C = std::min(1.,((double)C->nnz_tot)/(C->size*C->calc_npe()));
2768  nnz_frac_C = std::max(nnz_frac_C,nnz_frac_A);
2769  nnz_frac_C = std::max(nnz_frac_C,nnz_frac_B);
2770  int64_t len_ctr = 1;
2771  for (int i=0; i<num_tot; i++){
2772  if (idx_arr[3*i+2]==-1){
2773  int edge_len = idx_arr[3*i+0] != -1 ? A->lens[idx_arr[3*i+0]]
2774  : B->lens[idx_arr[3*i+1]];
2775  len_ctr *= edge_len;
2776  }
2777  }
2778  nnz_frac_C = std::min(1.,std::max(nnz_frac_C,nnz_frac_A*nnz_frac_B*len_ctr));
2779  }
2780 
2781 
2782  ctr * sctr;
2783  #if FOLD_TSR
2784  if (can_fold()){
2785  est_time = est_time_fold();
2786  iparam prm = map_fold(false);
2787  sctr = construct_ctr(1, &prm);
2788  if (is_ctr_sparse)
2789  est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
2790  else
2791  est_time = sctr->est_time_rec(sctr->num_lyr);
2792  A->remove_fold();
2793  B->remove_fold();
2794  C->remove_fold();
2795  } else
2796  #endif
2797  {
2798  sctr = construct_ctr();
2799  if (is_ctr_sparse){
2800  est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
2801  } else {
2802  est_time = sctr->est_time_rec(sctr->num_lyr);
2803  }
2804  }
2805  #if DEBUG >= 4
2806  printf("mapping passed contr est_time = %E sec\n", est_time);
2807  #endif
2808  ASSERT(est_time >= 0.0);
2809 
2810  memuse = 0;
2811  need_remap_A = 0;
2812  need_remap_B = 0;
2813  need_remap_C = 0;
2814  if (topo_i == old_topo_A){
2815  for (d=0; d<A->order; d++){
2816  if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
2817  need_remap_A = 1;
2818  }
2819  } else
2820  need_remap_A = 1;
2821  if (need_remap_A) {
2822  est_time += A->est_redist_time(*dA, nnz_frac_A);
2823  memuse = A->get_redist_mem(*dA, nnz_frac_A);
2824  } else
2825  memuse = 0;
2826  if (topo_i == old_topo_B){
2827  for (d=0; d<B->order; d++){
2828  if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
2829  need_remap_B = 1;
2830  }
2831  } else
2832  need_remap_B = 1;
2833  if (need_remap_B) {
2834  est_time += B->est_redist_time(*dB, nnz_frac_B);
2835  memuse = std::max(memuse,B->get_redist_mem(*dB, nnz_frac_B));
2836  }
2837  if (topo_i == old_topo_C){
2838  for (d=0; d<C->order; d++){
2839  if (!comp_dim_map(&C->edge_map[d],&old_map_C[d]))
2840  need_remap_C = 1;
2841  }
2842  } else
2843  need_remap_C = 1;
2844  if (need_remap_C) {
2845  est_time += 2.*C->est_redist_time(*dC, nnz_frac_C);
2846  memuse = std::max(1.*memuse,2.*C->get_redist_mem(*dC, nnz_frac_C));
2847  }
2848 
2849  if (est_time >= best_time) continue;
2850 
2851  if (is_ctr_sparse)
2852  memuse = MAX(((spctr*)sctr)->spmem_rec(nnz_frac_A,nnz_frac_B,nnz_frac_C), memuse);
2853  else
2854  memuse = MAX((int64_t)sctr->mem_rec(), memuse);
2855  #if DEBUG >= 4
2856  printf("total (with redistribution and transp) est_time = %E\n", est_time);
2857  #endif
2858  ASSERT(est_time >= 0.0);
2859 
2860  TAU_FSTOP(est_ctr_map_time);
2861  TAU_FSTART(get_avail_res);
2862  if ((int64_t)memuse >= max_memuse){
2863  DPRINTF(1,"[EXH] Not enough memory available for topo %d with order %d memory %ld/%ld\n", i,j,memuse,max_memuse);
2864  TAU_FSTOP(get_avail_res);
2865  delete sctr;
2866  continue;
2867  }
2868  TAU_FSTOP(get_avail_res);
2869  if (((!A->is_sparse) && A->size > INT_MAX) ||
2870  ((!B->is_sparse) && B->size > INT_MAX) ||
2871  ((!C->is_sparse) && C->size > INT_MAX)){
2872  DPRINTF(1,"MPI does not handle enough bits for topo %d with order %d \n", i, j);
2873  delete sctr;
2874  continue;
2875  }
2876  if (est_time < best_time) {
2877  best_time = est_time;
2878  //bmemuse = memuse;
2879  btopo = old_off+j;
2880  }
2881  delete sctr;
2882  }
2883  }
2884 #if DEBUG >= 2
2885  int64_t tot_valid_mappings;
2886  MPI_Allreduce(&valid_mappings, &tot_valid_mappings, 1, MPI_INT64_T, MPI_SUM, global_comm.cm);
2887  if (A->wrld->rank == 0) DPRINTF(2,"number valid mappings was %ld/%ld\n", tot_valid_mappings, tot_num_choices);
2888 #endif
2889  TAU_FSTART(all_select_ctr_map);
2890  double gbest_time;
2891  MPI_Allreduce(&best_time, &gbest_time, 1, MPI_DOUBLE, MPI_MIN, global_comm.cm);
2892  if (best_time != gbest_time){
2893  btopo = INT_MAX;
2894  }
2895  int ttopo;
2896  MPI_Allreduce(&btopo, &ttopo, 1, MPI_INT, MPI_MIN, global_comm.cm);
2897  TAU_FSTOP(all_select_ctr_map);
2898 
2899  idx=ttopo;
2900  time=gbest_time;
2901 
2902  cdealloc(idx_arr);
2903 
2904  }
2905 
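  // Top-level mapping routine: saves the current distributions of A, B, and C, runs
  // the selective search and, if its best estimate is not already under one second,
  // the exhaustive search, applies the winning mapping, constructs the contraction
  // kernel, and redistributes any tensor whose mapping changed.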
2906  int contraction::map(ctr ** ctrf, bool do_remap){
2907  int ret, j, need_remap, d;
2908  int * old_phase_A, * old_phase_B, * old_phase_C;
2909  topology * old_topo_A, * old_topo_B, * old_topo_C;
2910  distribution * dA, * dB, * dC;
2911  old_topo_A = A->topo;
2912  old_topo_B = B->topo;
2913  old_topo_C = C->topo;
2914  mapping * old_map_A = new mapping[A->order];
2915  mapping * old_map_B = new mapping[B->order];
2916  mapping * old_map_C = new mapping[C->order];
2917  copy_mapping(A->order, A->edge_map, old_map_A);
2918  copy_mapping(B->order, B->edge_map, old_map_B);
2919  copy_mapping(C->order, C->edge_map, old_map_C);
2920 
2921  ASSERT(A->wrld->comm == B->wrld->comm && B->wrld->comm == C->wrld->comm);
2922  World * wrld = A->wrld;
2923  CommData global_comm = wrld->cdt;
2924 
2925  TAU_FSTART(init_select_ctr_map);
2926  #if BEST_VOL
2927  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&virt_blk_len_A);
2928  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&virt_blk_len_B);
2929  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&virt_blk_len_C);
2930  #endif
2931 
2932  ASSERT(A->is_mapped);
2933  ASSERT(B->is_mapped);
2934  ASSERT(C->is_mapped);
2935  if (do_remap){
2936  #if DEBUG >= 2
2937  if (global_comm.rank == 0)
2938  printf("Initial mappings:\n");
2939  A->print_map();
2940  B->print_map();
2941  C->print_map();
2942  #endif
2943  }
2944  A->unfold();
2945  B->unfold();
2946  C->unfold();
2947  A->set_padding();
2948  B->set_padding();
2949  C->set_padding();
2950  /* Save the current mappings of A, B, C */
2951  dA = new distribution(A);
2952  dB = new distribution(B);
2953  dC = new distribution(C);
2954  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&old_phase_A);
2955  for (j=0; j<A->order; j++){
2956  old_phase_A[j] = A->edge_map[j].calc_phase();
2957  }
2958  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&old_phase_B);
2959  for (j=0; j<B->order; j++){
2960  old_phase_B[j] = B->edge_map[j].calc_phase();
2961  }
2962  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&old_phase_C);
2963  for (j=0; j<C->order; j++){
2964  old_phase_C[j] = C->edge_map[j].calc_phase();
2965  }
2966 
2967  //bmemuse = UINT64_MAX;
2968  int ttopo, ttopo_sel, ttopo_exh;
2969  double gbest_time_sel, gbest_time_exh;
2970 
2971  TAU_FSTART(get_best_sel_map);
2972  get_best_sel_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, ttopo_sel, gbest_time_sel);
2973  TAU_FSTOP(get_best_sel_map);
2974  if (gbest_time_sel < 1.){
2975  gbest_time_exh = gbest_time_sel+1.;
2976  ttopo_exh = ttopo_sel;
2977  } else {
2978  TAU_FSTART(get_best_exh_map);
2979  get_best_exh_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, ttopo_exh, gbest_time_exh, gbest_time_sel);
2980  TAU_FSTOP(get_best_exh_map);
2981  }
2982  if (gbest_time_sel <= gbest_time_exh){
2983  ttopo = ttopo_sel;
2984  } else {
2985  ttopo = ttopo_exh;
2986  }
2987 
2988  A->clear_mapping();
2989  B->clear_mapping();
2990  C->clear_mapping();
2991  A->set_padding();
2992  B->set_padding();
2993  C->set_padding();
2994 
2995  if (!do_remap || ttopo == INT_MAX || ttopo == -1){
2996  CTF_int::cdealloc(old_phase_A);
2997  CTF_int::cdealloc(old_phase_B);
2998  CTF_int::cdealloc(old_phase_C);
2999  delete [] old_map_A;
3000  delete [] old_map_B;
3001  delete [] old_map_C;
3002  delete dA;
3003  delete dB;
3004  delete dC;
3005 
3006  if (ttopo == INT_MAX || ttopo == -1){
3007  printf("ERROR: Failed to map contraction!\n");
3008  //ABORT;
3009  return ERROR;
3010  }
3011  return SUCCESS;
3012  }
3013  topology * topo_g = NULL;
3014  int j_g;
3015  if (gbest_time_sel <= gbest_time_exh){
3016  j_g = ttopo%6;
3017  if (ttopo < 48){
3018  if (((ttopo/6) & 1) > 0){
3019  topo_g = old_topo_A;
3020  copy_mapping(A->order, old_map_A, A->edge_map);
3021  }
3022  if (((ttopo/6) & 2) > 0){
3023  topo_g = old_topo_B;
3024  copy_mapping(B->order, old_map_B, B->edge_map);
3025  }
3026 
3027  if (((ttopo/6) & 4) > 0){
3028  topo_g = old_topo_C;
3029  copy_mapping(C->order, old_map_C, C->edge_map);
3030  }
3031  assert(topo_g != NULL);
3032 
3033  } else topo_g = wrld->topovec[(ttopo-48)/6];
3034  } else {
3035  int64_t choice_offset = 0;
3036  int i=0;
3037  int64_t old_off = 0;
3038  for (i=0; i<(int)wrld->topovec.size(); i++){
3039  //int tnum_choices = pow(num_choices,(int) wrld->topovec[i]->order);
3040  int tnum_choices = get_num_map_variants(wrld->topovec[i]);
3041  old_off = choice_offset;
3042  choice_offset += tnum_choices;
3043  if (choice_offset > ttopo) break;
3044  }
3045  topo_g = wrld->topovec[i];
3046  j_g = ttopo-old_off;
3047  }
3048 
3049  A->topo = topo_g;
3050  B->topo = topo_g;
3051  C->topo = topo_g;
3052  A->is_mapped = 1;
3053  B->is_mapped = 1;
3054  C->is_mapped = 1;
3055 
3056  if (gbest_time_sel <= gbest_time_exh){
3057  ret = map_to_topology(topo_g, j_g);
3058  if (ret == NEGATIVE || ret == ERROR) {
3059  printf("ERROR ON FINAL MAP ATTEMPT, THIS SHOULD NOT HAPPEN\n");
3060  return ERROR;
3061  }
3062  } else {
3063  exh_map_to_topo(topo_g, j_g);
3064  switch_topo_perm();
3065  }
3066  #if DEBUG > 2
3067  if (!check_mapping())
3068  printf("ERROR ON FINAL MAP ATTEMPT, THIS SHOULD NOT HAPPEN\n");
3069  // else if (global_comm.rank == 0) printf("Mapping successful estimated execution time = %lf sec\n",best_time);
3070  #endif
3071  ASSERT(check_mapping());
3072  A->set_padding();
3073  B->set_padding();
3074  C->set_padding();
3075  if (can_fold()){
3076  iparam prm = map_fold(false);
3077  *ctrf = construct_ctr(1, &prm);
3078  A->remove_fold();
3079  B->remove_fold();
3080  C->remove_fold();
3081  } else
3082  *ctrf = construct_ctr();
3083  #if DEBUG > 2
3084  if (global_comm.rank == 0)
3085  printf("New mappings:\n");
3086  A->print_map(stdout);
3087  B->print_map(stdout);
3088  C->print_map(stdout);
3089  MPI_Barrier(global_comm.cm);
3090  #endif
3091 
3092 #ifdef VERBOSE
3093  double nnz_frac_A = 1.0;
3094  double nnz_frac_B = 1.0;
3095  double nnz_frac_C = 1.0;
3096  int num_tot;
3097  int * idx_arr;
3098  inv_idx(A->order, idx_A,
3099  B->order, idx_B,
3100  C->order, idx_C,
3101  &num_tot, &idx_arr);
3102  if (A->is_sparse) nnz_frac_A = std::min(1.,((double)A->nnz_tot)/(A->size*A->calc_npe()));
3103  if (B->is_sparse) nnz_frac_B = std::min(1.,((double)B->nnz_tot)/(B->size*B->calc_npe()));
3104  if (C->is_sparse){
3105  nnz_frac_C = std::min(1.,((double)C->nnz_tot)/(C->size*C->calc_npe()));
3106  int64_t len_ctr = 1;
3107  for (int i=0; i<num_tot; i++){
3108  if (idx_arr[3*i+2]==-1){
3109  int edge_len = idx_arr[3*i+0] != -1 ? A->lens[idx_arr[3*i+0]]
3110  : B->lens[idx_arr[3*i+1]];
3111  len_ctr *= edge_len;
3112  }
3113  }
3114  nnz_frac_C = std::min(1.,std::max(nnz_frac_C,nnz_frac_A*nnz_frac_B*len_ctr));
3115  }
3116  cdealloc(idx_arr);
3117  int64_t memuse = 0;
3118  if (is_sparse())
3119  memuse = MAX(((spctr*)*ctrf)->spmem_rec(nnz_frac_A,nnz_frac_B,nnz_frac_C), memuse);
3120  else
3121  memuse = MAX((int64_t)(*ctrf)->mem_rec(), memuse);
3122 
3123  if (global_comm.rank == 0){
3124  VPRINTF(1,"Contraction will use %E bytes per processor out of %E available memory and take an estimated %E sec\n",
3125  (double)memuse,(double)proc_bytes_available(),std::min(gbest_time_sel,gbest_time_exh));
3126 #if DEBUG >= 3
3127  (*ctrf)->print();
3128 #endif
3129  }
3130 #endif
3131 
3132  if (A->is_cyclic == 0 &&
3133  B->is_cyclic == 0 &&
3134  C->is_cyclic == 0){
3135  A->is_cyclic = 0;
3136  B->is_cyclic = 0;
3137  C->is_cyclic = 0;
3138  } else {
3139  A->is_cyclic = 1;
3140  B->is_cyclic = 1;
3141  C->is_cyclic = 1;
3142  }
3143  /* redistribute tensor data */
3144  TAU_FSTART(redistribute_for_contraction);
3145  need_remap = 0;
3146  if (A->topo == old_topo_A){
3147  for (d=0; d<A->order; d++){
3148  if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
3149  need_remap = 1;
3150  }
3151  } else
3152  need_remap = 1;
3153  if (need_remap)
3154  A->redistribute(*dA);
3155  need_remap = 0;
3156  if (B->topo == old_topo_B){
3157  for (d=0; d<B->order; d++){
3158  if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
3159  need_remap = 1;
3160  }
3161  } else
3162  need_remap = 1;
3163  if (need_remap)
3164  B->redistribute(*dB);
3165  need_remap = 0;
3166  if (C->topo == old_topo_C){
3167  for (d=0; d<C->order; d++){
3168  if (!comp_dim_map(&C->edge_map[d],&old_map_C[d]))
3169  need_remap = 1;
3170  }
3171  } else
3172  need_remap = 1;
3173  if (need_remap)
3174  C->redistribute(*dC);
3175 
3176  TAU_FSTOP(redistribute_for_contraction);
3177 
3178  CTF_int::cdealloc( old_phase_A );
3179  CTF_int::cdealloc( old_phase_B );
3180  CTF_int::cdealloc( old_phase_C );
3181 
3182  delete [] old_map_A;
3183  delete [] old_map_B;
3184  delete [] old_map_C;
3185 
3186 
3187  delete dA;
3188  delete dB;
3189  delete dC;
3190 
3191  return SUCCESS;
3192  }
3193 
3194 
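  // Builds the recursive dense contraction object: an optional replication step for
  // physical dimensions not used by all three tensors, a ctr_2d_general level per
  // index shared by exactly two tensors, an optional offload layer (when compiled
  // with OFFLOAD) and a virtualization layer, with a sequential kernel
  // (seq_tsr_ctr) at the bottom.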
3195  ctr * contraction::construct_dense_ctr(int is_inner,
3196  iparam const * inner_params,
3197  int * nvirt_all,
3198  int is_used,
3199  int const * phys_mapped){
3200  int num_tot, i, i_A, i_B, i_C, is_top, nphys_dim;
3201  int64_t nvirt;
3202  int64_t blk_sz_A, blk_sz_B, blk_sz_C;
3203  int64_t vrt_sz_A, vrt_sz_B, vrt_sz_C;
3204  //int sA, sB, sC,
3205  bool need_rep;
3206  int * blk_len_A, * virt_blk_len_A, * blk_len_B;
3207  int * virt_blk_len_B, * blk_len_C, * virt_blk_len_C;
3208  int * idx_arr, * virt_dim;
3209  //strp_tsr * str_A, * str_B, * str_C;
3210  mapping * map;
3211  ctr * hctr = NULL;
3212  ctr ** rec_ctr = NULL;
3213  ASSERT(A->wrld->comm == B->wrld->comm && B->wrld->comm == C->wrld->comm);
3214  World * wrld = A->wrld;
3215  CommData global_comm = wrld->cdt;
3216 
3217  is_top = 1;
3218  nphys_dim = A->topo->order;
3219 
3220  inv_idx(A->order, idx_A,
3221  B->order, idx_B,
3222  C->order, idx_C,
3223  &num_tot, &idx_arr);
3224 
3225 
3226  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&virt_blk_len_A);
3227  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&virt_blk_len_B);
3228  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&virt_blk_len_C);
3229 
3230  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&blk_len_A);
3231  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&blk_len_B);
3232  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&blk_len_C);
3233  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&virt_dim);
3234 
3235  /* Determine the block dimensions of each local subtensor */
3236  blk_sz_A = A->size;
3237  blk_sz_B = B->size;
3238  blk_sz_C = C->size;
3239  calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
3240  &vrt_sz_A, virt_blk_len_A, blk_len_A);
3241  calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
3242  &vrt_sz_B, virt_blk_len_B, blk_len_B);
3243  calc_dim(C->order, blk_sz_C, C->pad_edge_len, C->edge_map,
3244  &vrt_sz_C, virt_blk_len_C, blk_len_C);
3245 
3246  /* Strip out the relevant part of the tensor if we are contracting over diagonal */
3247 /* sA = strip_diag( A->order, num_tot, idx_A, vrt_sz_A,
3248  A->edge_map, A->topo, A->sr,
3249  blk_len_A, &blk_sz_A, &str_A);
3250  sB = strip_diag( B->order, num_tot, idx_B, vrt_sz_B,
3251  B->edge_map, B->topo, B->sr,
3252  blk_len_B, &blk_sz_B, &str_B);
3253  sC = strip_diag( C->order, num_tot, idx_C, vrt_sz_C,
3254  C->edge_map, C->topo, C->sr,
3255  blk_len_C, &blk_sz_C, &str_C);
3256 
3257  if (sA || sB || sC){
3258  ASSERT(0);//this is always done via sum now
3259  if (global_comm.rank == 0)
3260  DPRINTF(1,"Stripping tensor\n");
3261  strp_ctr * sctr = new strp_ctr;
3262  hctr = sctr;
3263  is_top = 0;
3264  rec_ctr = &sctr->rec_ctr;
3265 
3266  sctr->rec_strp_A = str_A;
3267  sctr->rec_strp_B = str_B;
3268  sctr->rec_strp_C = str_C;
3269  sctr->strip_A = sA;
3270  sctr->strip_B = sB;
3271  sctr->strip_C = sC;
3272  }*/
3273 
3274  need_rep = 0;
3275  for (i=0; i<nphys_dim; i++){
3276  if (phys_mapped[3*i+0] == 0 ||
3277  phys_mapped[3*i+1] == 0 ||
3278  phys_mapped[3*i+2] == 0){
3279  /*ASSERT((phys_mapped[3*i+0] == 0 && phys_mapped[3*i+1] == 0) ||
3280  (phys_mapped[3*i+0] == 0 && phys_mapped[3*i+2] == 0) ||
3281  (phys_mapped[3*i+1] == 0 && phys_mapped[3*i+2] == 0));*/
3282  need_rep = 1;
3283  break;
3284  }
3285  }
3286  if (need_rep){
3287  if (global_comm.rank == 0)
3288  DPRINTF(2,"Replicating tensor\n");
3289 
3290  ctr_replicate * rctr = new ctr_replicate(this, phys_mapped, blk_sz_A, blk_sz_B, blk_sz_C);
3291  if (is_top){
3292  hctr = rctr;
3293  is_top = 0;
3294  } else {
3295  *rec_ctr = rctr;
3296  }
3297  rec_ctr = &rctr->rec_ctr;
3298  }
3299 
3300  //#ifdef OFFLOAD
3301  int total_iter = 1;
3302  int upload_phase_A = 1;
3303  int upload_phase_B = 1;
3304  int download_phase_C = 1;
3305  //#endif
3306  nvirt = 1;
3307 
3308  ctr_2d_general * bottom_ctr_gen = NULL;
3309  /* if (nvirt_all != NULL)
3310  *nvirt_all = 1;*/
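  /* For each index shared by exactly two tensors build a ctr_2d_general level;
   for indices mapped only virtually on a single tensor record the virtualization
   factor, which is accumulated into nvirt. */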
3311  for (i=0; i<num_tot; i++){
3312  virt_dim[i] = 1;
3313  i_A = idx_arr[3*i+0];
3314  i_B = idx_arr[3*i+1];
3315  i_C = idx_arr[3*i+2];
3316  /* If this index belongs to exactly two tensors */
3317  if ((i_A != -1 && i_B != -1 && i_C == -1) ||
3318  (i_A != -1 && i_B == -1 && i_C != -1) ||
3319  (i_A == -1 && i_B != -1 && i_C != -1)) {
3320  ctr_2d_general * ctr_gen = new ctr_2d_general(this);
3321  #ifdef OFFLOAD
3322  ctr_gen->alloc_host_buf = false;
3323  #endif
3324  int is_built = 0;
3325  if (i_A == -1){
3326  is_built = ctr_2d_gen_build(is_used,
3327  global_comm,
3328  i,
3329  virt_dim,
3330  ctr_gen->edge_len,
3331  total_iter,
3332  A,
3333  i_A,
3334  ctr_gen->cdt_A,
3335  ctr_gen->ctr_lda_A,
3336  ctr_gen->ctr_sub_lda_A,
3337  ctr_gen->move_A,
3338  blk_len_A,
3339  blk_sz_A,
3340  virt_blk_len_A,
3341  upload_phase_A,
3342  B,
3343  i_B,
3344  ctr_gen->cdt_B,
3345  ctr_gen->ctr_lda_B,
3346  ctr_gen->ctr_sub_lda_B,
3347  ctr_gen->move_B,
3348  blk_len_B,
3349  blk_sz_B,
3350  virt_blk_len_B,
3351  upload_phase_B,
3352  C,
3353  i_C,
3354  ctr_gen->cdt_C,
3355  ctr_gen->ctr_lda_C,
3356  ctr_gen->ctr_sub_lda_C,
3357  ctr_gen->move_C,
3358  blk_len_C,
3359  blk_sz_C,
3360  virt_blk_len_C,
3361  download_phase_C);
3362  }
3363  if (i_B == -1){
3364  is_built = ctr_2d_gen_build(is_used,
3365  global_comm,
3366  i,
3367  virt_dim,
3368  ctr_gen->edge_len,
3369  total_iter,
3370  B,
3371  i_B,
3372  ctr_gen->cdt_B,
3373  ctr_gen->ctr_lda_B,
3374  ctr_gen->ctr_sub_lda_B,
3375  ctr_gen->move_B,
3376  blk_len_B,
3377  blk_sz_B,
3378  virt_blk_len_B,
3379  upload_phase_B,
3380  C,
3381  i_C,
3382  ctr_gen->cdt_C,
3383  ctr_gen->ctr_lda_C,
3384  ctr_gen->ctr_sub_lda_C,
3385  ctr_gen->move_C,
3386  blk_len_C,
3387  blk_sz_C,
3388  virt_blk_len_C,
3389  download_phase_C,
3390  A,
3391  i_A,
3392  ctr_gen->cdt_A,
3393  ctr_gen->ctr_lda_A,
3394  ctr_gen->ctr_sub_lda_A,
3395  ctr_gen->move_A,
3396  blk_len_A,
3397  blk_sz_A,
3398  virt_blk_len_A,
3399  upload_phase_A);
3400  }
3401  if (i_C == -1){
3402  is_built = ctr_2d_gen_build(is_used,
3403  global_comm,
3404  i,
3405  virt_dim,
3406  ctr_gen->edge_len,
3407  total_iter,
3408  C,
3409  i_C,
3410  ctr_gen->cdt_C,
3411  ctr_gen->ctr_lda_C,
3412  ctr_gen->ctr_sub_lda_C,
3413  ctr_gen->move_C,
3414  blk_len_C,
3415  blk_sz_C,
3416  virt_blk_len_C,
3417  download_phase_C,
3418  A,
3419  i_A,
3420  ctr_gen->cdt_A,
3421  ctr_gen->ctr_lda_A,
3422  ctr_gen->ctr_sub_lda_A,
3423  ctr_gen->move_A,
3424  blk_len_A,
3425  blk_sz_A,
3426  virt_blk_len_A,
3427  upload_phase_A,
3428  B,
3429  i_B,
3430  ctr_gen->cdt_B,
3431  ctr_gen->ctr_lda_B,
3432  ctr_gen->ctr_sub_lda_B,
3433  ctr_gen->move_B,
3434  blk_len_B,
3435  blk_sz_B,
3436  virt_blk_len_B,
3437  upload_phase_B);
3438  }
3439  if (is_built){
3440  if (is_top){
3441  hctr = ctr_gen;
3442  is_top = 0;
3443  } else {
3444  *rec_ctr = ctr_gen;
3445  }
3446  if (bottom_ctr_gen == NULL)
3447  bottom_ctr_gen = ctr_gen;
3448  rec_ctr = &ctr_gen->rec_ctr;
3449  } else {
3450  ctr_gen->rec_ctr = NULL;
3451  delete ctr_gen;
3452  }
3453  } else {
3454  if (i_A != -1){
3455  map = &A->edge_map[i_A];
3456  while (map->has_child) map = map->child;
3457  if (map->type == VIRTUAL_MAP)
3458  virt_dim[i] = map->np;
3459  } else if (i_B != -1){
3460  map = &B->edge_map[i_B];
3461  while (map->has_child) map = map->child;
3462  if (map->type == VIRTUAL_MAP)
3463  virt_dim[i] = map->np;
3464  } else if (i_C != -1){
3465  map = &C->edge_map[i_C];
3466  while (map->has_child) map = map->child;
3467  if (map->type == VIRTUAL_MAP)
3468  virt_dim[i] = map->np;
3469  }
3470  }
3471  /*if (sA && i_A != -1){
3472  nvirt = virt_dim[i]/str_A->strip_dim[i_A];
3473  } else if (sB && i_B != -1){
3474  nvirt = virt_dim[i]/str_B->strip_dim[i_B];
3475  } else if (sC && i_C != -1){
3476  nvirt = virt_dim[i]/str_C->strip_dim[i_C];
3477  }*/
3478 
3479  nvirt = nvirt * virt_dim[i];
3480  }
3481  if (nvirt_all != NULL)
3482  *nvirt_all = nvirt;
3483  bool do_offload = false;
3484  #ifdef OFFLOAD
3485  if ((!is_custom || func->has_off_gemm) && is_inner > 0 && (is_custom || C->sr->is_offloadable())){
3486  do_offload = true;
3487  if (bottom_ctr_gen != NULL)
3488  bottom_ctr_gen->alloc_host_buf = true;
3489  ctr_offload * ctroff = new ctr_offload(this, blk_sz_A, blk_sz_B, blk_sz_C, total_iter, upload_phase_A, upload_phase_B, download_phase_C);
3490  if (is_top){
3491  hctr = ctroff;
3492  is_top = 0;
3493  } else {
3494  *rec_ctr = ctroff;
3495  }
3496  rec_ctr = &ctroff->rec_ctr;
3497  }
3498  #endif
3499 
3500  if (blk_sz_A < vrt_sz_A){
3501  printf("blk_sz_A = %ld, vrt_sz_A = %ld\n", blk_sz_A, vrt_sz_A);
3502  printf("sizes are %ld %ld %ld\n",A->size,B->size,C->size);
3503  A->print_map(stdout, 0);
3504  B->print_map(stdout, 0);
3505  C->print_map(stdout, 0);
3506  }
3507  if (blk_sz_B < vrt_sz_B){
3508  printf("blk_sz_B = %ld, vrt_sz_B = %ld\n", blk_sz_B, vrt_sz_B);
3509  printf("sizes are %ld %ld %ld\n",A->size,B->size,C->size);
3510  A->print_map(stdout, 0);
3511  B->print_map(stdout, 0);
3512  C->print_map(stdout, 0);
3513  }
3514  if (blk_sz_C < vrt_sz_C){
3515  printf("blk_sz_C = %ld, vrt_sz_C = %ld\n", blk_sz_C, vrt_sz_C);
3516  printf("sizes are %ld %ld %ld\n",A->size,B->size,C->size);
3517  A->print_map(stdout, 0);
3518  B->print_map(stdout, 0);
3519  C->print_map(stdout, 0);
3520  }
3521  ASSERT(blk_sz_A >= vrt_sz_A);
3522  ASSERT(blk_sz_B >= vrt_sz_B);
3523  ASSERT(blk_sz_C >= vrt_sz_C);
3524  /* Multiply over virtual sub-blocks */
3525  if (nvirt > 1){
3526  ctr_virt * ctrv = new ctr_virt(this, num_tot, virt_dim, vrt_sz_A, vrt_sz_B, vrt_sz_C);
3527  if (is_top) {
3528  hctr = ctrv;
3529  is_top = 0;
3530  } else {
3531  *rec_ctr = ctrv;
3532  }
3533  rec_ctr = &ctrv->rec_ctr;
3534  } else
3535  CTF_int::cdealloc(virt_dim);
3536 
3537 
3538  iparam inp_cpy;
3539  if (inner_params != NULL)
3540  inp_cpy = *inner_params;
3541  inp_cpy.offload = do_offload;
3542 
3543  seq_tsr_ctr * ctrseq = new seq_tsr_ctr(this, is_inner, &inp_cpy, virt_blk_len_A, virt_blk_len_B, virt_blk_len_C, vrt_sz_C);
3544  if (is_top) {
3545  hctr = ctrseq;
3546  is_top = 0;
3547  } else {
3548  *rec_ctr = ctrseq;
3549  }
3550 
3551 /* hctr->beta = this->beta;
3552  hctr->A = A->data;
3553  hctr->B = B->data;
3554  hctr->C = C->data;*/
3555  /* if (global_comm.rank == 0){
3556  int64_t n,m,k;
3557  dtype old_flops;
3558  dtype new_flops;
3559  ggg_sym_nmk(A->order, A->pad_edge_len, idx_A, A->sym,
3560  B->order, B->pad_edge_len, idx_B, B->sym,
3561  C->order, &n, &m, &k);
3562  old_flops = 2.0*(dtype)n*(dtype)m*(dtype)k;
3563  new_flops = A->calc_nvirt();
3564  new_flops *= B->calc_nvirt();
3565  new_flops *= C->calc_nvirt();
3566  new_flops *= global_comm.np;
3567  new_flops = sqrt(new_flops);
3568  new_flops *= global_comm.np;
3569  ggg_sym_nmk(A->order, virt_blk_len_A, idx_A, A->sym,
3570  B->order, virt_blk_len_B, idx_B, B->sym,
3571  C->order, &n, &m, &k);
3572  printf("Each subcontraction is a " PRId64 " by " PRId64 " by " PRId64 " DGEMM performing %E flops\n",n,m,k,
3573  2.0*(dtype)n*(dtype)m*(dtype)k);
3574  new_flops *= 2.0*(dtype)n*(dtype)m*(dtype)k;
3575  printf("Contraction performing %E flops rather than %E, a factor of %lf more flops due to padding\n",
3576  new_flops, old_flops, new_flops/old_flops);
3577 
3578  }*/
3579 
3580  CTF_int::cdealloc(idx_arr);
3581  CTF_int::cdealloc(blk_len_A);
3582  CTF_int::cdealloc(blk_len_B);
3583  CTF_int::cdealloc(blk_len_C);
3584 
3585  return hctr;
3586  }
3587 
3588 
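  // Sparse analogue of construct_dense_ctr: additionally pins sparse tensor keys when
  // not using the inner kernel, uses virtual-block counts in place of element counts
  // for sparse operands, and selects a sparse sequential kernel type based on which
  // of A, B, and C are sparse.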
3589  ctr * contraction::construct_sparse_ctr(int is_inner,
3590  iparam const * inner_params,
3591  int * nvirt_all,
3592  int is_used,
3593  int const * phys_mapped){
3594  int num_tot, i, i_A, i_B, i_C, nphys_dim, is_top, nvirt;
3595  int * idx_arr, * virt_dim;
3596  int64_t blk_sz_A, blk_sz_B, blk_sz_C;
3597  int64_t vrt_sz_A, vrt_sz_B, vrt_sz_C;
3598  int * blk_len_A, * virt_blk_len_A, * blk_len_B;
3599  int * virt_blk_len_B, * blk_len_C, * virt_blk_len_C;
3600  mapping * map;
3601  spctr * hctr = NULL;
3602  spctr ** rec_ctr = NULL;
3603  ASSERT(A->wrld->cdt.cm == B->wrld->cdt.cm && B->wrld->cdt.cm == C->wrld->cdt.cm);
3604  World * wrld = A->wrld;
3605  CommData global_comm = wrld->cdt;
3606 
3607  is_top = 1;
3608  nphys_dim = A->topo->order;
3609 
3610 
3611  inv_idx(A->order, idx_A,
3612  B->order, idx_B,
3613  C->order, idx_C,
3614  &num_tot, &idx_arr);
3615 
3616  nphys_dim = A->topo->order;
3617 
3618  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&virt_blk_len_A);
3619  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&virt_blk_len_B);
3620  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&virt_blk_len_C);
3621 
3622  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&blk_len_A);
3623  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&blk_len_B);
3624  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&blk_len_C);
3625  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&virt_dim);
3626 
3627  /* Determine the block dimensions of each local subtensor */
3628  blk_sz_A = A->size;
3629  blk_sz_B = B->size;
3630  blk_sz_C = C->size;
3631  calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
3632  &vrt_sz_A, virt_blk_len_A, blk_len_A);
3633  calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
3634  &vrt_sz_B, virt_blk_len_B, blk_len_B);
3635  calc_dim(C->order, blk_sz_C, C->pad_edge_len, C->edge_map,
3636  &vrt_sz_C, virt_blk_len_C, blk_len_C);
3637 
3638  if (!is_inner){
3639  if (A->is_sparse && A->wrld->np > 1){
3640  spctr_pin_keys * skctr = new spctr_pin_keys(this, 0);
3641  if (is_top){
3642  hctr = skctr;
3643  is_top = 0;
3644  } else {
3645  *rec_ctr = skctr;
3646  }
3647  rec_ctr = &skctr->rec_ctr;
3648  }
3649 
3650  if (B->is_sparse && B->wrld->np > 1){
3651  spctr_pin_keys * skctr = new spctr_pin_keys(this, 1);
3652  if (is_top){
3653  hctr = skctr;
3654  is_top = 0;
3655  } else {
3656  *rec_ctr = skctr;
3657  }
3658  rec_ctr = &skctr->rec_ctr;
3659  }
3660 
3661  if (C->is_sparse && C->wrld->np > 1){
3662  spctr_pin_keys * skctr = new spctr_pin_keys(this, 2);
3663  if (is_top){
3664  hctr = skctr;
3665  is_top = 0;
3666  } else {
3667  *rec_ctr = skctr;
3668  }
3669  rec_ctr = &skctr->rec_ctr;
3670  }
3671  }
3672 
3673 
3674  bool need_rep = 0;
3675  for (i=0; i<nphys_dim; i++){
3676  if (phys_mapped[3*i+0] == 0 ||
3677  phys_mapped[3*i+1] == 0 ||
3678  phys_mapped[3*i+2] == 0){
3679  /*ASSERT((phys_mapped[3*i+0] == 0 && phys_mapped[3*i+1] == 0) ||
3680  (phys_mapped[3*i+0] == 0 && phys_mapped[3*i+2] == 0) ||
3681  (phys_mapped[3*i+1] == 0 && phys_mapped[3*i+2] == 0));*/
3682  need_rep = 1;
3683  break;
3684  }
3685  }
3686  if (need_rep){
3687  if (global_comm.rank == 0)
3688  DPRINTF(2,"Replicating tensor\n");
3689 
3690  spctr_replicate * rctr = new spctr_replicate(this, phys_mapped, blk_sz_A, blk_sz_B, blk_sz_C);
3691  if (is_top){
3692  hctr = rctr;
3693  is_top = 0;
3694  } else {
3695  *rec_ctr = rctr;
3696  }
3697  rec_ctr = &rctr->rec_ctr;
3698  }
3699 
3700 
3701  //#ifdef OFFLOAD
3702  int total_iter = 1;
3703  int upload_phase_A = 1;
3704  int upload_phase_B = 1;
3705  int download_phase_C = 1;
3706  //#endif
3707  nvirt = 1;
3708 
3709 
3710  spctr_2d_general * bottom_ctr_gen = NULL;
3711 
3712  int spvirt_blk_len_A[A->order];
3713  if (A->is_sparse){
3714  blk_sz_A = A->calc_nvirt();
3715  for (int a=0; a<A->order; a++){
3716  blk_len_A[a] = A->edge_map[a].calc_phase()/A->edge_map[a].calc_phys_phase();
3717  }
3718  std::fill(spvirt_blk_len_A, spvirt_blk_len_A+A->order, 1);
3719  } else
3720  memcpy(spvirt_blk_len_A, virt_blk_len_A, sizeof(int)*A->order);
3721  int spvirt_blk_len_B[B->order];
3722  if (B->is_sparse){
3723  blk_sz_B = B->calc_nvirt();
3724  for (int a=0; a<B->order; a++){
3725  blk_len_B[a] = B->edge_map[a].calc_phase()/B->edge_map[a].calc_phys_phase();
3726  }
3727  std::fill(spvirt_blk_len_B, spvirt_blk_len_B+B->order, 1);
3728  } else
3729  memcpy(spvirt_blk_len_B, virt_blk_len_B, sizeof(int)*B->order);
3730  int spvirt_blk_len_C[C->order];
3731  if (C->is_sparse){
3732  blk_sz_C = C->calc_nvirt();
3733  for (int a=0; a<C->order; a++){
3734  blk_len_C[a] = C->edge_map[a].calc_phase()/C->edge_map[a].calc_phys_phase();
3735  }
3736  std::fill(spvirt_blk_len_C, spvirt_blk_len_C+C->order, 1);
3737  } else
3738  memcpy(spvirt_blk_len_C, virt_blk_len_C, sizeof(int)*C->order);
3739 
3740  for (i=0; i<num_tot; i++){
3741  virt_dim[i] = 1;
3742  i_A = idx_arr[3*i+0];
3743  i_B = idx_arr[3*i+1];
3744  i_C = idx_arr[3*i+2];
3745  /* If this index belongs to exactly two tensors */
3746  if ((i_A != -1 && i_B != -1 && i_C == -1) ||
3747  (i_A != -1 && i_B == -1 && i_C != -1) ||
3748  (i_A == -1 && i_B != -1 && i_C != -1)) {
3749  spctr_2d_general * ctr_gen = new spctr_2d_general(this);
3750  #ifdef OFFLOAD
3751  ctr_gen->alloc_host_buf = false;
3752  #endif
3753  int is_built = 0;
3754  if (i_A == -1){
3755  is_built = ctr_2d_gen_build(is_used,
3756  global_comm,
3757  i,
3758  virt_dim,
3759  ctr_gen->edge_len,
3760  total_iter,
3761  A,
3762  i_A,
3763  ctr_gen->cdt_A,
3764  ctr_gen->ctr_lda_A,
3765  ctr_gen->ctr_sub_lda_A,
3766  ctr_gen->move_A,
3767  blk_len_A,
3768  blk_sz_A,
3769  spvirt_blk_len_A,
3770  upload_phase_A,
3771  B,
3772  i_B,
3773  ctr_gen->cdt_B,
3774  ctr_gen->ctr_lda_B,
3775  ctr_gen->ctr_sub_lda_B,
3776  ctr_gen->move_B,
3777  blk_len_B,
3778  blk_sz_B,
3779  spvirt_blk_len_B,
3780  upload_phase_B,
3781  C,
3782  i_C,
3783  ctr_gen->cdt_C,
3784  ctr_gen->ctr_lda_C,
3785  ctr_gen->ctr_sub_lda_C,
3786  ctr_gen->move_C,
3787  blk_len_C,
3788  blk_sz_C,
3789  spvirt_blk_len_C,
3790  download_phase_C);
3791  }
3792  if (i_B == -1){
3793  is_built = ctr_2d_gen_build(is_used,
3794  global_comm,
3795  i,
3796  virt_dim,
3797  ctr_gen->edge_len,
3798  total_iter,
3799  B,
3800  i_B,
3801  ctr_gen->cdt_B,
3802  ctr_gen->ctr_lda_B,
3803  ctr_gen->ctr_sub_lda_B,
3804  ctr_gen->move_B,
3805  blk_len_B,
3806  blk_sz_B,
3807  spvirt_blk_len_B,
3808  upload_phase_B,
3809  C,
3810  i_C,
3811  ctr_gen->cdt_C,
3812  ctr_gen->ctr_lda_C,
3813  ctr_gen->ctr_sub_lda_C,
3814  ctr_gen->move_C,
3815  blk_len_C,
3816  blk_sz_C,
3817  spvirt_blk_len_C,
3818  download_phase_C,
3819  A,
3820  i_A,
3821  ctr_gen->cdt_A,
3822  ctr_gen->ctr_lda_A,
3823  ctr_gen->ctr_sub_lda_A,
3824  ctr_gen->move_A,
3825  blk_len_A,
3826  blk_sz_A,
3827  spvirt_blk_len_A,
3828  upload_phase_A);
3829  }
3830  if (i_C == -1){
3831  is_built = ctr_2d_gen_build(is_used,
3832  global_comm,
3833  i,
3834  virt_dim,
3835  ctr_gen->edge_len,
3836  total_iter,
3837  C,
3838  i_C,
3839  ctr_gen->cdt_C,
3840  ctr_gen->ctr_lda_C,
3841  ctr_gen->ctr_sub_lda_C,
3842  ctr_gen->move_C,
3843  blk_len_C,
3844  blk_sz_C,
3845  spvirt_blk_len_C,
3846  download_phase_C,
3847  A,
3848  i_A,
3849  ctr_gen->cdt_A,
3850  ctr_gen->ctr_lda_A,
3851  ctr_gen->ctr_sub_lda_A,
3852  ctr_gen->move_A,
3853  blk_len_A,
3854  blk_sz_A,
3855  spvirt_blk_len_A,
3856  upload_phase_A,
3857  B,
3858  i_B,
3859  ctr_gen->cdt_B,
3860  ctr_gen->ctr_lda_B,
3861  ctr_gen->ctr_sub_lda_B,
3862  ctr_gen->move_B,
3863  blk_len_B,
3864  blk_sz_B,
3865  spvirt_blk_len_B,
3866  upload_phase_B);
3867  }
3868  ctr_gen->dns_vrt_sz_A = vrt_sz_A;
3869  ctr_gen->dns_vrt_sz_B = vrt_sz_B;
3870  ctr_gen->dns_vrt_sz_C = vrt_sz_C;
3871  if (is_built){
3872  if (is_top){
3873  hctr = ctr_gen;
3874  is_top = 0;
3875  } else {
3876  *rec_ctr = ctr_gen;
3877  }
3878  if (bottom_ctr_gen == NULL)
3879  bottom_ctr_gen = ctr_gen;
3880  rec_ctr = &ctr_gen->rec_ctr;
3881  } else {
3882  ctr_gen->rec_ctr = NULL;
3883  delete ctr_gen;
3884  }
3885  } else {
3886  if (i_A != -1){
3887  map = &A->edge_map[i_A];
3888  while (map->has_child) map = map->child;
3889  if (map->type == VIRTUAL_MAP)
3890  virt_dim[i] = map->np;
3891  } else if (i_B != -1){
3892  map = &B->edge_map[i_B];
3893  while (map->has_child) map = map->child;
3894  if (map->type == VIRTUAL_MAP)
3895  virt_dim[i] = map->np;
3896  } else if (i_C != -1){
3897  map = &C->edge_map[i_C];
3898  while (map->has_child) map = map->child;
3899  if (map->type == VIRTUAL_MAP)
3900  virt_dim[i] = map->np;
3901  }
3902  }
3903  /*if (sA && i_A != -1){
3904  nvirt = virt_dim[i]/str_A->strip_dim[i_A];
3905  } else if (sB && i_B != -1){
3906  nvirt = virt_dim[i]/str_B->strip_dim[i_B];
3907  } else if (sC && i_C != -1){
3908  nvirt = virt_dim[i]/str_C->strip_dim[i_C];
3909  }*/
3910 
3911  nvirt = nvirt * virt_dim[i];
3912  }
3913 
3914  if (nvirt_all != NULL)
3915  *nvirt_all = nvirt;
3916 
3917  ASSERT(blk_sz_A >= 1);
3918  ASSERT(blk_sz_B >= 1);
3919  ASSERT(blk_sz_C >= 1);
3920 
3921  bool do_offload = false;
3922  #ifdef OFFLOAD
3923  //if ((!is_custom || func->has_off_gemm) && is_inner > 0 && (is_custom || C->sr->is_offloadable())){
3924  if (is_custom && func->has_off_gemm){
3925  do_offload = true;
3926  if (bottom_ctr_gen != NULL)
3927  bottom_ctr_gen->alloc_host_buf = true;
3928  spctr_offload * ctroff = new spctr_offload(this, blk_sz_A, blk_sz_B, blk_sz_C, total_iter, upload_phase_A, upload_phase_B, download_phase_C);
3929  if (is_top){
3930  hctr = ctroff;
3931  is_top = 0;
3932  } else {
3933  *rec_ctr = ctroff;
3934  }
3935  rec_ctr = &ctroff->rec_ctr;
3936  }
3937  #endif
3938 
3939 
3940  /* Multiply over virtual sub-blocks */
3941  if (nvirt > 1){
3942  spctr_virt * ctrv = new spctr_virt(this, num_tot, virt_dim, vrt_sz_A, vrt_sz_B, vrt_sz_C);
3943  if (is_top) {
3944  hctr = ctrv;
3945  is_top = 0;
3946  } else {
3947  *rec_ctr = ctrv;
3948  }
3949  rec_ctr = &ctrv->rec_ctr;
3950  } else
3951  CTF_int::cdealloc(virt_dim);
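  // If more than one virtual sub-block is assigned to each process (nvirt > 1), an spctr_virt
  // level is added that essentially iterates the recursive contraction over the virtual
  // sub-blocks of A, B, and C; otherwise virt_dim is no longer needed and is freed here.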
3952 
3953  int krnl_type = -1;
3954  if (is_inner){
3955  ASSERT(!(!A->is_sparse && (B->is_sparse || C->is_sparse)));
3956  ASSERT(!(C->is_sparse && (!B->is_sparse || !A->is_sparse)));
3957  if (A->is_sparse && !B->is_sparse && !C->is_sparse){
3958  if (is_custom || !A->sr->has_coo_ker) krnl_type = 2;
3959  else krnl_type = 1;
3960  }
3961  if (A->is_sparse && B->is_sparse && !C->is_sparse){
3962  krnl_type = 3;
3963  }
3964  if (A->is_sparse && B->is_sparse && C->is_sparse){
3965  krnl_type = 4;
3966  }
3967  } else {
3968  // Shouldn't be here ever anymore
3969  //printf("%d %d %d\n", A->is_sparse, B->is_sparse, C->is_sparse);
3970  ASSERT(!B->is_sparse && !C->is_sparse);
3971  assert(!B->is_sparse && !C->is_sparse);
3972 
3973  krnl_type = 0;
3974  }
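  // krnl_type selects the sequential kernel variant used by seq_tsr_spctr below:
  //   0: contraction not folded into an inner (matrix-like) form; B and C must be dense
  //   1: only A sparse, the algebraic structure provides a COO kernel, no custom function
  //   2: only A sparse, otherwise (CSR-based path)
  //   3: A and B sparse, C dense
  //   4: A, B, and C all sparse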
3975 
3976 
3977  iparam inp_cpy;
3978  if (inner_params != NULL)
3979  inp_cpy = *inner_params;
3980  inp_cpy.offload = do_offload;
3981 
3982  seq_tsr_spctr * ctrseq;
3983  if (C->is_sparse)
3984  ctrseq = new seq_tsr_spctr(this, krnl_type, &inp_cpy, virt_blk_len_A, virt_blk_len_B, virt_blk_len_C, 0);
3985  else
3986  ctrseq = new seq_tsr_spctr(this, krnl_type, &inp_cpy, virt_blk_len_A, virt_blk_len_B, virt_blk_len_C, vrt_sz_C);
3987  if (is_top) {
3988  hctr = ctrseq;
3989  is_top = 0;
3990  } else {
3991  *rec_ctr = ctrseq;
3992  }
3993 
3994  CTF_int::cdealloc(blk_len_A);
3995  CTF_int::cdealloc(blk_len_B);
3996  CTF_int::cdealloc(blk_len_C);
3997  CTF_int::cdealloc(idx_arr);
3998 
3999  return hctr;
4000  }
4001 
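  // construct_ctr (below) dispatches to construct_sparse_ctr/construct_dense_ctr, which
  // assemble the contraction algorithm as a chain of ctr objects: optional 2D-SUMMA-style
  // communication levels (ctr_2d_general / spctr_2d_general), an optional offload stage,
  // an optional virtualization loop, and finally a sequential per-block kernel. Each
  // level's rec_ctr pointer links to the next level in the chain.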
4002  ctr * contraction::construct_ctr(int is_inner,
4003  iparam const * inner_params,
4004  int * nvirt_all,
4005  int is_used){
4006 
4007  TAU_FSTART(construct_contraction);
4008  int i;
4009  mapping * map;
4010  int * phys_mapped;
4011 
4012  int nphys_dim = A->topo->order;
4013 
4014  CTF_int::alloc_ptr(sizeof(int)*nphys_dim*3, (void**)&phys_mapped);
4015  memset(phys_mapped, 0, sizeof(int)*nphys_dim*3);
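  // phys_mapped holds 3 flags per physical topology dimension: phys_mapped[3*dim+0/1/2]
  // is set to 1 if some index of A/B/C (respectively) is mapped onto that physical dimension.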
4016 
4017  for (i=0; i<A->order; i++){
4018  map = &A->edge_map[i];
4019  if (map->type == PHYSICAL_MAP){
4020  phys_mapped[3*map->cdt+0] = 1;
4021  }
4022  while (map->has_child) {
4023  map = map->child;
4024  if (map->type == PHYSICAL_MAP){
4025  phys_mapped[3*map->cdt+0] = 1;
4026  }
4027  }
4028  }
4029  for (i=0; i<B->order; i++){
4030  map = &B->edge_map[i];
4031  if (map->type == PHYSICAL_MAP){
4032  phys_mapped[3*map->cdt+1] = 1;
4033  }
4034  while (map->has_child) {
4035  map = map->child;
4036  if (map->type == PHYSICAL_MAP){
4037  phys_mapped[3*map->cdt+1] = 1;
4038  }
4039  }
4040  }
4041  for (i=0; i<C->order; i++){
4042  map = &C->edge_map[i];
4043  if (map->type == PHYSICAL_MAP){
4044  phys_mapped[3*map->cdt+2] = 1;
4045  }
4046  while (map->has_child) {
4047  map = map->child;
4048  if (map->type == PHYSICAL_MAP){
4049  phys_mapped[3*map->cdt+2] = 1;
4050  }
4051  }
4052  }
4053  ctr * hctr;
4054  if (is_sparse()){
4055  hctr = construct_sparse_ctr(is_inner, inner_params, nvirt_all, is_used, phys_mapped);
4056  } else {
4057  hctr = construct_dense_ctr(is_inner, inner_params, nvirt_all, is_used, phys_mapped);
4058  }
4059  CTF_int::cdealloc(phys_mapped);
4060  TAU_FSTOP(construct_contraction);
4061  return hctr;
4062  }
4063 
4064  int contraction::contract(){
4065  int stat;
4066  ctr * ctrf;
4067  CommData global_comm = C->wrld->cdt;
4068 
 4069  if (A->has_zero_edge_len || B->has_zero_edge_len
 4070  || C->has_zero_edge_len){
4071  if (!C->sr->isequal(beta,C->sr->mulid()) && !C->has_zero_edge_len){
4072  int * new_idx_C;
4073  int num_diag = 0;
4074  new_idx_C = (int*)CTF_int::alloc(sizeof(int)*C->order);
4075  for (int i=0; i<C->order; i++){
4076  new_idx_C[i]=i-num_diag;
4077  for (int j=0; j<i; j++){
4078  if (idx_C[i] == idx_C[j]){
4079  new_idx_C[i]=j-num_diag;
4080  num_diag++;
4081  break;
4082  }
4083  }
4084  }
4085  scaling scl = scaling(C, new_idx_C, beta);
4086  scl.execute();
4087  CTF_int::cdealloc(new_idx_C);
4088  }
4089  return SUCCESS;
4090  }
4091  //FIXME: create these tensors without home
4092  if (A == B || A == C){
4093  tensor * new_tsr = new tensor(A);
4094  contraction new_ctr = contraction(*this);
4095  new_ctr.A = new_tsr;
4096  stat = new_ctr.contract();
4097  delete new_tsr;
4098  return stat;
4099  }
4100  if (B == C){
4101  tensor * new_tsr = new tensor(B);
4102  contraction new_ctr = contraction(*this);
4103  new_ctr.B = new_tsr;
4104  stat = new_ctr.contract();
4105  delete new_tsr;
4106  return stat;
4107  }
4108 // ASSERT(!C->is_sparse);
4109  if (B->is_sparse && !A->is_sparse){
4110 // ASSERT(!A->is_sparse);
 4111  //FIXME: ASSERT that the operation is commutative
4112  contraction CBA(B,idx_B,A,idx_A,alpha,C,idx_C,beta,func);
4113  CBA.contract();
4114  return SUCCESS;
4115  }
4116  //FIXME: was putting indices of A at the end here before
4117 /* if (A->is_sparse){
4118  //bool A_inds_ordered = true;
4119  //for (int i=1; i<A->order; i++){
4120  // if (idx_A[i] < idx_A[i-1]) A_inds_ordered = false;
4121  //}
4122  int new_idx_A[A->order];
4123  int new_idx_B[B->order];
4124  int new_idx_C[C->order];
4125 
4126  int num_tot, * idx_arr;
4127  inv_idx(A->order, idx_A,
4128  B->order, idx_B,
4129  C->order, idx_C,
4130  &num_tot, &idx_arr);
4131  bool used[num_tot];
4132  memset(used,0,sizeof(bool)*num_tot);
4133  for (int i=0; i<num_tot; i++){
4134  int la=-2;
4135  int ila=-1;
4136  for (int j=num_tot-1; j>=0; j--){
4137  if (used[j] == 0 && idx_arr[3*j]>la){
4138  ila = j;
4139  la = idx_arr[3*j];
4140  }
4141  }
4142  ASSERT(ila>=0);
4143  used[ila] = 1;
4144  if (idx_arr[3*ila] != -1)
4145  new_idx_A[idx_arr[3*ila]]=num_tot-i-1;
4146  if (idx_arr[3*ila+1] != -1)
4147  new_idx_B[idx_arr[3*ila+1]]=num_tot-i-1;
4148  if (idx_arr[3*ila+2] != -1)
4149  new_idx_C[idx_arr[3*ila+2]]=num_tot-i-1;
4150  }
4151  cdealloc(idx_arr);
4152  bool is_chngd = false;
4153  for (int i=0; i<A->order; i++){
4154  if (idx_A[i] != new_idx_A[i]) is_chngd=true;
4155  }
4156  for (int i=0; i<B->order; i++){
4157  if (idx_B[i] != new_idx_B[i]) is_chngd=true;
4158  }
4159  for (int i=0; i<C->order; i++){
4160  if (idx_C[i] != new_idx_C[i]) is_chngd=true;
4161  }
4162  if (is_chngd){
4163  contraction CBA(A,new_idx_A,B,new_idx_B,alpha,C,new_idx_C,beta,func);
4164  CBA.contract();
4165  return SUCCESS;
4166  }
4167 
4168  }*/
4169 
4170 
4171  #if DEBUG >= 1 //|| VERBOSE >= 1)
4172 // if (global_comm.rank == 0)
4173  // printf("Contraction permutation:\n");
4174  print();
4175  #endif
4176 
4177  TAU_FSTART(contract);
4178 
4179  TAU_FSTART(prescale_operands);
4180  prescale_operands();
4181  TAU_FSTOP(prescale_operands);
4182  #if 0 //VERIFY
4183  int64_t nsA, nsB;
4184  int64_t nA, nB, nC, up_nC;
4185  dtype * sA, * sB, * ans_C;
4186  dtype * uA, * uB, * uC;
4187  dtype * up_C, * up_ans_C, * pup_C;
4188  int order_A, order_B, order_C, i, pass;
4189  int * edge_len_A, * edge_len_B, * edge_len_C;
4190  int * sym_A, * sym_B, * sym_C;
4191  int * sym_tmp;
4192  stat = allread_tsr(type->tid_A, &nsA, &sA);
4193  assert(stat == SUCCESS);
4194 
4195  stat = allread_tsr(type->tid_B, &nsB, &sB);
4196  assert(stat == SUCCESS);
4197 
4198  stat = allread_tsr(type->tid_C, &nC, &ans_C);
4199  assert(stat == SUCCESS);
4200  #endif
4201  /* Check if the current tensor mappings can be contracted on */
4202  /*fseq_tsr_ctr fftsr=ftsr;
4203  if (ftsr.func_ptr == NULL){
4204  fftsr.func_ptr = &sym_seq_ctr_ref;
4205  #ifdef OFFLOAD
4206  fftsr.is_offloadable = 0;
4207  #endif
4208  }*/
4209 
4210  #ifdef PROFILE
4211  TAU_FSTART(pre_map_barrier);
4212  MPI_Barrier(global_comm.cm);
4213  TAU_FSTOP(pre_map_barrier);
4214  #endif
4215  #if REDIST
4216  //stat = map_tensors(type, fftsr, felm, alpha, beta, &ctrf);
4217  stat = map(&ctrf);
4218  if (stat == ERROR) {
4219  printf("Failed to map tensors to physical grid\n");
4220  return ERROR;
4221  }
4222  #else
4223  if (check_mapping() != 0) {
4224  /* Construct the tensor algorithm we would like to use */
4225  #if DEBUG >= 1
4226  if (global_comm.rank == 0)
4227  printf("Keeping mappings:\n");
4228  #endif
4229  } else {
4230  #if DEBUG >= 1
4231  if (global_comm.rank == 0){
4232  printf("Initial mappings are unsuitable mappings:\n");
4233  A->print_map();
4234  B->print_map();
4235  C->print_map();
4236  }
4237  #endif
4238  }
4239  stat = map(&ctrf);
4240  if (stat == ERROR) {
4241  printf("Failed to map tensors to physical grid\n");
4242  return ERROR;
4243  }
4244 #if (VERBOSE >= 1 || DEBUG >= 1)
4245  if (global_comm.rank == 0){
4246  /* int64_t memuse=0;
4247  if (is_sparse())
4248  memuse = MAX(((spctr*)ctrf)->spmem_rec(nnz_frac_A,nnz_frac_B,nnz_frac_C), memuse);
4249  else
4250  memuse = MAX((int64_t)ctrf->mem_rec(), memuse);
4251  if (keep_map)
 4252  printf("CTF: Contraction does not require redistribution, will use %E bytes per processor out of %E available memory and take an estimated %E sec\n",
4253  (double)memuse,(double)proc_bytes_available(),ctrf->est_time_rec(1));
4254  else
 4255  printf("CTF: Contraction will use new mapping, will use %E bytes per processor out of %E available memory and take an estimated %E sec\n",
4256  (double)memuse,(double)proc_bytes_available(),ctrf->est_time_rec(1));*/
4257  A->print_map();
4258  B->print_map();
4259  C->print_map();
4260  }
4261 #endif
4262 
4263  #ifdef PROFILE
4264  TAU_FSTART(pre_fold_barrier);
4265  MPI_Barrier(global_comm.cm);
4266  TAU_FSTOP(pre_fold_barrier);
4267  #endif
4268  #endif
4269  //ASSERT(check_mapping());
4270  bool is_inner = false;
4271  #if FOLD_TSR
4272  is_inner = can_fold();
4273  if (is_inner){
4274  iparam prm;
4275  TAU_FSTART(map_fold);
4276  prm = map_fold();
4277  TAU_FSTOP(map_fold);
4278  delete ctrf;
4279  ctrf = construct_ctr(1, &prm);
4280  }
4281  #endif
4282  #if DEBUG >=2
4283  if (global_comm.rank == 0){
4284  ctrf->print();
4285  }
4286  #endif
4287  #if VERBOSE >= 2
4288  double dtt = MPI_Wtime();
4289  #endif
4290  #ifdef DEBUG
4291 // if (global_comm.rank == 0){
4292 // //DPRINTF(1,"[%d] performing contraction\n",
4293 // // global_comm.rank);
4294 // // DPRINTF(1,"%E bytes of buffer space will be needed for this contraction\n",
4295 // // (double)ctrf->mem_rec());
4296 // printf("%E bytes needed, System memory = %E bytes total, %E bytes used, %E bytes available.\n",
4297 // (double)ctrf->mem_rec(),
4298 // (double)proc_bytes_total(),
4299 // (double)proc_bytes_used(),
4300 // (double)proc_bytes_available());
4301 // }
4302  #endif
4303  #if DEBUG >=2
4304  A->print_map();
4305  B->print_map();
4306  C->print_map();
4307  #endif
4308  // stat = zero_out_padding(type->tid_A);
4309  // stat = zero_out_padding(type->tid_B);
4310  #ifdef PROFILE
4311  TAU_FSTART(pre_ctr_func_barrier);
4312  MPI_Barrier(global_comm.cm);
4313  TAU_FSTOP(pre_ctr_func_barrier);
4314  #endif
4315  TAU_FSTART(ctr_func);
4316  /* Invoke the contraction algorithm */
4317  A->topo->activate();
4318  if (is_sparse()){
4319  int64_t * size_blk_A = NULL;
4320  int64_t * size_blk_B = NULL;
4321  int64_t * size_blk_C = NULL;
4322  char * data_A;
4323  char * data_B;
4324  char * data_C;
4325  if (!is_inner){
4326  data_A = A->data;
4327  data_B = B->data;
4328  data_C = C->data;
4329  if (A->nnz_blk != NULL){
4330  alloc_ptr(A->calc_nvirt()*sizeof(int64_t), (void**)&size_blk_A);
4331  for (int i=0; i<A->calc_nvirt(); i++){
4332  size_blk_A[i] = A->nnz_blk[i]*A->sr->pair_size();
4333  }
4334  }
4335  if (B->nnz_blk != NULL){
4336  alloc_ptr(B->calc_nvirt()*sizeof(int64_t), (void**)&size_blk_B);
4337  for (int i=0; i<B->calc_nvirt(); i++){
4338  size_blk_B[i] = B->nnz_blk[i]*B->sr->pair_size();
4339  }
4340  }
4341  if (C->nnz_blk != NULL){
4342  alloc_ptr(C->calc_nvirt()*sizeof(int64_t), (void**)&size_blk_C);
4343  for (int i=0; i<C->calc_nvirt(); i++){
4344  size_blk_C[i] = C->nnz_blk[i]*C->sr->pair_size();
4345  }
4346  }
4347  } else {
4348  if (A->is_sparse) data_A = A->rec_tsr->data;
4349  else data_A = A->data;
4350  if (B->is_sparse) data_B = B->rec_tsr->data;
4351  else data_B = B->data;
4352  if (C->is_sparse) data_C = C->rec_tsr->data;
4353  else data_C = C->data;
4354  size_blk_A = A->rec_tsr->nnz_blk;
4355  size_blk_B = B->rec_tsr->nnz_blk;
4356  size_blk_C = C->rec_tsr->nnz_blk;
4357  }
4358 
4359  ((spctr*)ctrf)->run(data_A, A->calc_nvirt(), size_blk_A,
4360  data_B, B->calc_nvirt(), size_blk_B,
4361  data_C, C->calc_nvirt(), size_blk_C,
4362  data_C);
4363  if (C->is_sparse){
4364  if (!is_inner){
4365  if (data_C != C->data) {
4366  cdealloc(C->data);
4367  C->data = data_C;
4368  }
4369  for (int i=0; i<C->calc_nvirt(); i++){
4370  C->nnz_blk[i] = size_blk_C[i]/C->sr->pair_size();
4371  }
4372  } else {
4373  if (C->rec_tsr->data != data_C){
4374  cdealloc(C->rec_tsr->data);
4375  C->rec_tsr->data = data_C;
4376  }
4377  }
4378  }
4379 
4380  if (!is_inner){
4381  if (size_blk_A != NULL) cdealloc(size_blk_A);
4382  if (size_blk_B != NULL) cdealloc(size_blk_B);
4383  if (size_blk_C != NULL) cdealloc(size_blk_C);
4384  }
4385  } else
4386  ctrf->run(A->data, B->data, C->data);
4387  A->topo->deactivate();
4388 
4389  #ifdef PROFILE
4390  TAU_FSTART(post_ctr_func_barrier);
4391  MPI_Barrier(global_comm.cm);
4392  TAU_FSTOP(post_ctr_func_barrier);
4393  #endif
4394  TAU_FSTOP(ctr_func);
4395  C->unfold(1);
4396  #ifndef SEQ
4397  if (C->is_cyclic)
4398  stat = C->zero_out_padding();
4399  #endif
4400  A->unfold();
4401  B->unfold();
4402  #if VERBOSE >= 2
4403  if (A->wrld->rank == 0){
4404  VPRINTF(2, "Contraction permutation completed in %lf sec.\n",MPI_Wtime()-dtt);
4405  }
4406  #endif
4407 
4408 
4409  #if 0 //VERIFY
4410  stat = allread_tsr(type->tid_A, &nA, &uA);
4411  assert(stat == SUCCESS);
4412  stat = get_tsr_info(type->tid_A, &order_A, &edge_len_A, &sym_A);
4413  assert(stat == SUCCESS);
4414 
4415  stat = allread_tsr(type->tid_B, &nB, &uB);
4416  assert(stat == SUCCESS);
4417  stat = get_tsr_info(type->tid_B, &order_B, &edge_len_B, &sym_B);
4418  assert(stat == SUCCESS);
4419 
4420  if (nsA != nA) { printf("nsA = " PRId64 ", nA = " PRId64 "\n",nsA,nA); ABORT; }
4421  if (nsB != nB) { printf("nsB = " PRId64 ", nB = " PRId64 "\n",nsB,nB); ABORT; }
4422  for (i=0; (int64_t)i<nA; i++){
4423  if (fabs(uA[i] - sA[i]) > 1.E-6){
4424  printf("A[i] = %lf, sA[i] = %lf\n", uA[i], sA[i]);
4425  }
4426  }
4427  for (i=0; (int64_t)i<nB; i++){
4428  if (fabs(uB[i] - sB[i]) > 1.E-6){
4429  printf("B[%d] = %lf, sB[%d] = %lf\n", i, uB[i], i, sB[i]);
4430  }
4431  }
4432 
4433  stat = allread_tsr(type->tid_C, &nC, &uC);
4434  assert(stat == SUCCESS);
4435  stat = get_tsr_info(type->tid_C, &order_C, &edge_len_C, &sym_C);
4436  assert(stat == SUCCESS);
4437  DEBUG_PRINTF("packed size of C is " PRId64 " (should be " PRId64 ")\n", nC,
4438  sy_packed_size(order_C, edge_len_C, sym_C));
4439 
4440  pup_C = (dtype*)CTF_int::alloc(nC*sizeof(dtype));
4441 
4442  cpy_sym_ctr(alpha,
4443  uA, order_A, edge_len_A, edge_len_A, sym_A, idx_A,
4444  uB, order_B, edge_len_B, edge_len_B, sym_B, idx_B,
4445  beta,
4446  ans_C, order_C, edge_len_C, edge_len_C, sym_C, idx_C);
4447  assert(stat == SUCCESS);
4448 
4449  #if ( DEBUG>=5)
4450  for (i=0; i<nC; i++){
4451  // if (fabs(C[i]-ans_C[i]) > 1.E-6){
4452  printf("PACKED: C[%d] = %lf, ans_C[%d] = %lf\n",
4453  i, C[i], i, ans_C[i]);
4454  // }
4455  }
4456  #endif
4457 
4458  punpack_tsr(uC, order_C, edge_len_C,
4459  sym_C, 1, &sym_tmp, &up_C);
4460  punpack_tsr(ans_C, order_C, edge_len_C,
4461  sym_C, 1, &sym_tmp, &up_ans_C);
4462  punpack_tsr(up_ans_C, order_C, edge_len_C,
4463  sym_C, 0, &sym_tmp, &pup_C);
4464  for (i=0; (int64_t)i<nC; i++){
4465  assert(fabs(pup_C[i] - ans_C[i]) < 1.E-6);
4466  }
4467  pass = 1;
4468  up_nC = 1;
4469  for (i=0; i<order_C; i++){ up_nC *= edge_len_C[i]; };
4470 
4471  for (i=0; i<(int)up_nC; i++){
4472  if (fabs((up_C[i]-up_ans_C[i])/up_ans_C[i]) > 1.E-6 &&
4473  fabs((up_C[i]-up_ans_C[i])) > 1.E-6){
4474  printf("C[%d] = %lf, ans_C[%d] = %lf\n",
4475  i, up_C[i], i, up_ans_C[i]);
4476  pass = 0;
4477  }
4478  }
4479  if (!pass) ABORT;
4480 
4481  #endif
4482 
4483  delete ctrf;
4484 
4485  TAU_FSTOP(contract);
4486  return SUCCESS;
4487  }
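  // For orientation: contract() is normally reached through the high-level CTF interface;
  // a minimal sketch (assuming the public CTF::World/CTF::Matrix API and einsum-style
  // index strings) looks roughly like:
  //
  //   CTF::World w(MPI_COMM_WORLD);
  //   CTF::Matrix<double> A(n, n, NS, w), B(n, n, NS, w), C(n, n, NS, w);
  //   C["ij"] += A["ik"] * B["kj"];  // builds a CTF_int::contraction and runs execute()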
4488 
4489 
4490  int contraction::sym_contract(){
4491  int i;
4492  //int ** scl_idxs_C;
4493  //dtype * scl_alpha_C;
4494  int stat = SUCCESS;
4495  int * new_idx;
4496  int * map_A, * map_B, * map_C;
4497  tensor ** dstack_tsr_C;
4498  int ** dstack_map_C;
4499  int nst_C;
4500  std::vector<contraction> perm_types;
4501  std::vector<int> signs;
4502  char const * dbeta;
4503  ctr * ctrf;
4504  tensor * tnsr_A, * tnsr_B, * tnsr_C;
4505 
4506  bool is_cons = this->check_consistency();
4507  if (!is_cons) return ERROR;
4508 
4509  CommData global_comm = A->wrld->cdt;
4510 
4511  A->unfold();
4512  B->unfold();
4513  C->unfold();
 4514  if (A->has_zero_edge_len || B->has_zero_edge_len
 4515  || C->has_zero_edge_len){
4516  if (!C->sr->isequal(beta,C->sr->mulid()) && !C->has_zero_edge_len){
4517  int * new_idx_C;
4518  int num_diag = 0;
4519  new_idx_C = (int*)CTF_int::alloc(sizeof(int)*C->order);
4520  for (int i=0; i<C->order; i++){
4521  new_idx_C[i]=i-num_diag;
4522  for (int j=0; j<i; j++){
4523  if (idx_C[i] == idx_C[j]){
4524  new_idx_C[i]=j-num_diag;
4525  num_diag++;
4526  break;
4527  }
4528  }
4529  }
4530  scaling scl = scaling(C, new_idx_C, beta);
4531  scl.execute();
4532  CTF_int::cdealloc(new_idx_C);
4533  }
4534  return SUCCESS;
4535  }
4536 
4537  int * new_idx_A, * new_idx_B, * new_idx_C;
4538  if (!is_custom || func->left_distributive){
4539  tensor * new_tsr_A = A->self_reduce(idx_A, &new_idx_A, B->order, idx_B, &new_idx_B, C->order, idx_C, &new_idx_C);
4540  if (new_tsr_A != A) {
4541  contraction ctr(new_tsr_A, new_idx_A, B, new_idx_B, alpha, C, new_idx_C, beta, func);
4542  ctr.execute();
4543  delete new_tsr_A;
4544  cdealloc(new_idx_A);
4545  cdealloc(new_idx_B);
4546  if (C->order > 0)
4547  cdealloc(new_idx_C);
4548  return SUCCESS;
4549  }
4550  }
4551 
4552  if (!is_custom || func->right_distributive){
4553  tensor * new_tsr_B = B->self_reduce(idx_B, &new_idx_B, A->order, idx_A, &new_idx_A, C->order, idx_C, &new_idx_C);
4554  if (new_tsr_B != B) {
4555  contraction ctr(A, new_idx_A, new_tsr_B, new_idx_B, alpha, C, new_idx_C, beta, func);
4556  ctr.execute();
4557  delete new_tsr_B;
4558  cdealloc(new_idx_A);
4559  cdealloc(new_idx_B);
4560  cdealloc(new_idx_C);
4561  return SUCCESS;
4562  }
4563  }
4564 
4565 
4566  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&map_A);
4567  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&map_B);
4568  CTF_int::alloc_ptr(sizeof(int)*C->order, (void**)&map_C);
4569  CTF_int::alloc_ptr(sizeof(int*)*C->order, (void**)&dstack_map_C);
4570  CTF_int::alloc_ptr(sizeof(tensor*)*C->order, (void**)&dstack_tsr_C);
4571  memcpy(map_A, idx_A, A->order*sizeof(int));
4572  memcpy(map_B, idx_B, B->order*sizeof(int));
4573  memcpy(map_C, idx_C, C->order*sizeof(int));
4574 
4575  tnsr_A = A;
4576  tnsr_B = B;
4577  tnsr_C = C;
4578 
4579  tensor * new_tsr;
4580  while (tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx) == SUCCESS){
4581  if (tnsr_A != A) delete tnsr_A;
4582  CTF_int::cdealloc(map_A);
4583  tnsr_A = new_tsr;
4584  map_A = new_idx;
4585  }
4586  while (tnsr_B->extract_diag(map_B, 1, new_tsr, &new_idx) == SUCCESS){
4587  if (tnsr_B != B) delete tnsr_B;
4588  CTF_int::cdealloc(map_B);
4589  tnsr_B = new_tsr;
4590  map_B = new_idx;
4591  }
4592  nst_C = 0;
4593  while (tnsr_C->extract_diag(map_C, 1, new_tsr, &new_idx) == SUCCESS){
4594  dstack_map_C[nst_C] = map_C;
4595  dstack_tsr_C[nst_C] = tnsr_C;
4596  nst_C++;
4597  tnsr_C = new_tsr;
4598  map_C = new_idx;
4599  }
4600 
4601  bivar_function const * fptr;
4602  if (is_custom) fptr = func;
4603  else fptr = NULL;
4604 
4605  contraction new_ctr = contraction(tnsr_A, map_A, tnsr_B, map_B, alpha, tnsr_C, map_C, beta, fptr);
4606  tnsr_A->unfold();
4607  tnsr_B->unfold();
4608  tnsr_C->unfold();
4609  /*if (ntid_A == ntid_B || ntid_A == ntid_C){*/
4610  if (tnsr_A == tnsr_C){
4611  tensor * nnew_tsr = new tensor(tnsr_A);
4612  contraction nnew_ctr = contraction(new_ctr);
4613  nnew_ctr.A = nnew_tsr;
4614  stat = nnew_ctr.sym_contract();
4615  delete nnew_tsr;
4616  } else if (tnsr_B == tnsr_C){
4617  tensor * nnew_tsr = new tensor(tnsr_B);
4618  contraction nnew_ctr = contraction(new_ctr);
4619  nnew_ctr.B = nnew_tsr;
4620  stat = nnew_ctr.sym_contract();
4621  delete nnew_tsr;
4622  } else {
4623 
4624  int sign = align_symmetric_indices(tnsr_A->order,
4625  new_ctr.idx_A,
4626  tnsr_A->sym,
4627  tnsr_B->order,
4628  new_ctr.idx_B,
4629  tnsr_B->sym,
4630  tnsr_C->order,
4631  new_ctr.idx_C,
4632  tnsr_C->sym);
4633 
4634  /*
4635  * Apply a factor of n! for each set of n symmetric indices which are contracted over
4636  */
4637  int ocfact = overcounting_factor(tnsr_A->order,
4638  new_ctr.idx_A,
4639  tnsr_A->sym,
4640  tnsr_B->order,
4641  new_ctr.idx_B,
4642  tnsr_B->sym,
4643  tnsr_C->order,
4644  new_ctr.idx_C,
4645  tnsr_C->sym);
4646  char const * align_alpha = alpha;
4647  if (sign != 1){
4648  char * u_align_alpha = (char*)alloc(tnsr_C->sr->el_size);
4649  if (sign == -1){
4650  tnsr_C->sr->addinv(alpha, u_align_alpha);
4651 // alpha = new_alpha;
4652  }
4653  align_alpha = u_align_alpha;
4654  //FIXME free new_alpha
4655  }
4656  char * oc_align_alpha = (char*)alloc(tnsr_C->sr->el_size);
4657  tnsr_C->sr->safecopy(oc_align_alpha, align_alpha);
4658  if (ocfact != 1){
4659  if (ocfact != 1){
4660  tnsr_C->sr->safecopy(oc_align_alpha, tnsr_C->sr->addid());
4661 
4662  for (int i=0; i<ocfact; i++){
4663  tnsr_C->sr->add(oc_align_alpha, align_alpha, oc_align_alpha);
4664  }
4665 // alpha = new_alpha;
4666  }
4667  }
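  // Worked example of the overcounting factor: if a set of 2 SY-symmetric indices is
  // contracted over, overcounting_factor returns 2! = 2 and the loop above forms
  // oc_align_alpha = align_alpha + align_alpha, i.e. alpha is multiplied by the number of
  // equivalent index orderings that the packed symmetric layout stores only once.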
4668  //new_ctr.alpha = alpha;
4669 
4670 
4671  //std::cout << alpha << ' ' << alignfact << ' ' << ocfact << std::endl;
4672 
4673  if (new_ctr.unfold_broken_sym(NULL) != -1){
4674  if (global_comm.rank == 0)
4675  DPRINTF(2,"Contraction index is broken\n");
4676 
4677  contraction * unfold_ctr;
4678  new_ctr.unfold_broken_sym(&unfold_ctr);
4679  if (unfold_ctr->map(&ctrf, 0) == SUCCESS){
4680 /* #else
4681  int sy = 0;
4682  for (i=0; i<A->order; i++){
4683  if (A->sym[i] == SY) sy = 1;
4684  }
4685  for (i=0; i<B->order; i++){
4686  if (B->sym[i] == SY) sy = 1;
4687  }
4688  for (i=0; i<C->order; i++){
4689  if (C->sym[i] == SY) sy = 1;
4690  }
4691  if (sy && unfold_ctr->map(&ctrf, 0) == SUCCESS)
 4692  #endif*/
4693  if (tnsr_A == tnsr_B){
4694  tnsr_A = new tensor(tnsr_B);
4695  }
4696  desymmetrize(tnsr_A, unfold_ctr->A, 0);
4697  desymmetrize(tnsr_B, unfold_ctr->B, 0);
4698  desymmetrize(tnsr_C, unfold_ctr->C, 1);
4699  if (global_comm.rank == 0)
4700  DPRINTF(1,"%d Performing index desymmetrization\n",tnsr_A->wrld->rank);
4701  unfold_ctr->alpha = align_alpha;
4702  stat = unfold_ctr->sym_contract();
4703  if (!unfold_ctr->C->is_data_aliased && !tnsr_C->sr->isequal(tnsr_C->sr->mulid(), unfold_ctr->beta)){
4704  int sidx_C[tnsr_C->order];
4705  for (int iis=0; iis<tnsr_C->order; iis++){
4706  sidx_C[iis] = iis;
4707  }
4708  scaling sscl = scaling(tnsr_C, sidx_C, unfold_ctr->beta);
4709  sscl.execute();
4710  }
4711  symmetrize(tnsr_C, unfold_ctr->C);
4712  if (tnsr_A != unfold_ctr->A){
4713  unfold_ctr->A->unfold();
4714  tnsr_A->pull_alias(unfold_ctr->A);
4715  delete unfold_ctr->A;
4716  }
4717  if (tnsr_B != unfold_ctr->B){
4718  unfold_ctr->B->unfold();
4719  tnsr_B->pull_alias(unfold_ctr->B);
4720  delete unfold_ctr->B;
4721  }
4722  if (tnsr_C != unfold_ctr->C){
4723  unfold_ctr->C->unfold();
4724  tnsr_C->pull_alias(unfold_ctr->C);
4725  delete unfold_ctr->C;
4726  }
4727  } else {
4728  DPRINTF(1,"%d Not Performing index desymmetrization\n",tnsr_A->wrld->rank);
4729  get_sym_perms(new_ctr, perm_types, signs);
4730  //&nscl_C, &scl_maps_C, &scl_alpha_C);
4731  dbeta = beta;
4732  char * new_alpha = (char*)alloc(tnsr_B->sr->el_size);
4733  for (i=0; i<(int)perm_types.size(); i++){
4734  if (signs[i] == 1)
4735  C->sr->copy(new_alpha, oc_align_alpha);
4736  else {
4737  ASSERT(signs[i]==-1);
4738  tnsr_C->sr->addinv(oc_align_alpha, new_alpha);
4739  }
4740  perm_types[i].alpha = new_alpha;
4741  perm_types[i].beta = dbeta;
4742  stat = perm_types[i].contract();
4743  dbeta = new_ctr.C->sr->mulid();
4744  }
4745  perm_types.clear();
4746  signs.clear();
4747  }
4748  delete unfold_ctr;
4749  } else {
4750  new_ctr.alpha = oc_align_alpha;
4751  stat = new_ctr.contract();
4752  }
4753  if (tnsr_A != A) delete tnsr_A;
4754  if (tnsr_B != B) delete tnsr_B;
4755  for (int i=nst_C-1; i>=0; i--){
4756  dstack_tsr_C[i]->extract_diag(dstack_map_C[i], 0, tnsr_C, &new_idx);
4757  delete tnsr_C;
4758  tnsr_C = dstack_tsr_C[i];
4759  }
4760  ASSERT(tnsr_C == C);
4761  CTF_int::cdealloc(oc_align_alpha);
4762  }
4763 
4764  CTF_int::cdealloc(map_A);
4765  CTF_int::cdealloc(map_B);
4766  CTF_int::cdealloc(map_C);
4767  CTF_int::cdealloc(dstack_map_C);
4768  CTF_int::cdealloc(dstack_tsr_C);
4769 
4770  return stat;
4771  }
4772 
4773  int contraction::home_contract(){
4774  #ifndef HOME_CONTRACT
4775  return sym_contract();
4776  #else
4777  int ret;
4778  int was_home_A, was_home_B, was_home_C;
4779  A->unfold();
4780  B->unfold();
4781  C->unfold();
4782 
4783  if (C->is_sparse && (C->nnz_tot > 0 || C->has_home)){
4784  if (C->sr->isequal(beta,C->sr->addid())){
4785  C->set_zero();
4786  }
4787  int64_t * nnz_blk_C = (int64_t*)alloc(sizeof(int64_t)*C->calc_nvirt());
4788  memcpy(nnz_blk_C, C->nnz_blk, sizeof(int64_t)*C->calc_nvirt());
4789  int64_t * nnz_blk_zero = (int64_t*)alloc(sizeof(int64_t)*C->calc_nvirt());
4790  std::fill(nnz_blk_zero, nnz_blk_zero+C->calc_nvirt(), 0);
4791  C->set_new_nnz_glb(nnz_blk_zero);
4792 
4793  tensor * C_buf = new tensor(C, 1, 1);
4794  C->set_new_nnz_glb(nnz_blk_C);
4795  cdealloc(nnz_blk_C);
4796  cdealloc(nnz_blk_zero);
4797  C_buf->has_home = 0;
4798  C_buf->is_home = 0;
4799  contraction new_ctr(*this);
4800  new_ctr.C = C_buf;
4801  new_ctr.beta = C->sr->mulid();
4802  new_ctr.execute();
4803  char idx[C->order];
4804  for (int i=0; i<C->order; i++){ idx[i] = 'a'+i; }
4805  summation s(C_buf, idx, C->sr->mulid(), C, idx, beta);
4806  s.execute();
4807  delete C_buf;
4808  return SUCCESS;
4809 
4810  }
4811 
4812  if (A->has_zero_edge_len ||
4813  B->has_zero_edge_len ||
4814  C->has_zero_edge_len){
4815  if (!C->sr->isequal(beta,C->sr->mulid()) && !C->has_zero_edge_len){
4816  int * new_idx_C;
4817  int num_diag = 0;
4818  new_idx_C = (int*)CTF_int::alloc(sizeof(int)*C->order);
4819  for (int i=0; i<C->order; i++){
4820  new_idx_C[i]=i-num_diag;
4821  for (int j=0; j<i; j++){
4822  if (idx_C[i] == idx_C[j]){
4823  new_idx_C[i]=new_idx_C[j];
4824  num_diag++;
4825  break;
4826  }
4827  }
4828  }
4829  scaling scl = scaling(C, new_idx_C, beta);
4830  scl.execute();
4831  CTF_int::cdealloc(new_idx_C);
4832  }
4833  return SUCCESS;
4834  }
4835 
4836 
4838 
4839  //if (stype->tid_A == stype->tid_B || stype->tid_A == stype->tid_C){
4840  /*if (stype->tid_A == stype->tid_C){
4841  clone_tensor(stype->tid_A, 1, &new_tid);
4842  CTF_ctr_type_t new_type = *stype;
4843  new_type.tid_A = new_tid;
4844  ret = home_contract(&new_type, ftsr, felm, alpha, beta);
4845  del_tsr(new_tid);
4846  return ret;
4847  } else if (stype->tid_B == stype->tid_C){
4848  clone_tensor(stype->tid_B, 1, &new_tid);
4849  CTF_ctr_type_t new_type = *stype;
4850  new_type.tid_B = new_tid;
4851  ret = home_contract(&new_type, ftsr, felm, alpha, beta);
4852  del_tsr(new_tid);
4853  return ret;
4854  }*/
4855 
4856  //CTF_ctr_type_t ntype = *stype;
4857 
4858  if (C->is_sparse && !A->is_sparse){
4859  contraction pre_new_ctr = contraction(*this);
4860  pre_new_ctr.A = new tensor(A, 1, 1);
4861  pre_new_ctr.A->sparsify([](char const *){ return true; });
4862  pre_new_ctr.execute();
4863  delete pre_new_ctr.A;
4864  return SUCCESS;
4865  }
4866 
4867  if (C->is_sparse && !B->is_sparse){
4868  contraction pre_new_ctr = contraction(*this);
4869  pre_new_ctr.B = new tensor(B, 1, 1);
4870  pre_new_ctr.B->sparsify([](char const *){ return true; });
4871  pre_new_ctr.execute();
4872  delete pre_new_ctr.B;
4873  return SUCCESS;
4874  }
4875 
4876  /*if (!C->is_sparse && (A->is_sparse && B->is_sparse)){
4877  pre_new_ctr.C = new tensor(C, 0, 1);
4878  pre_new_ctr.C->sparsify();
4879  pre_new_ctr.C->has_home = 0;
4880  pre_new_ctr.C->is_home = 0;
4881  }*/
4882 
 4883  //if any of the tensors are sparse and there is a Hadamard (weigh) index, take the smaller of the two operand tensors and add a new index to it to eliminate the Hadamard index
4884  if (this->is_sparse()){
4885  int num_tot, * idx_arr;
4886  inv_idx(A->order, idx_A,
4887  B->order, idx_B,
4888  C->order, idx_C,
4889  &num_tot, &idx_arr);
4890  int iA = -1, iB = -1;
4891  int has_weigh = false;
 4892  //consider the number of rows/cols in the matrix that will be formed during the contraction, taking into account the enlarging of A/B, since the CSR format will require storage proportional to that
4893  int64_t szA1=1, szA2=1, szB1=1, szB2=1;
4894  for (int i=0; i<num_tot; i++){
4895  if (idx_arr[3*i+0] != -1 &&
4896  idx_arr[3*i+1] != -1 &&
4897  idx_arr[3*i+2] != -1){
4898  has_weigh = true;
4899  iA = idx_arr[3*i+0];
4900  iB = idx_arr[3*i+1];
4901  szA1 *= A->lens[iA];
4902  szA2 *= A->lens[iA];
4903  szB1 *= B->lens[iB];
4904  szB2 *= B->lens[iB];
4905  } else if (idx_arr[3*i+0] != -1 &&
4906  idx_arr[3*i+1] != -1 &&
4907  idx_arr[3*i+2] == -1){
4908  szA1 *= A->lens[idx_arr[3*i+0]];
4909  szB1 *= B->lens[idx_arr[3*i+1]];
4910  } else if (idx_arr[3*i+0] != -1 &&
4911  idx_arr[3*i+1] == -1 &&
4912  idx_arr[3*i+2] != -1){
4913  szA1 *= A->lens[idx_arr[3*i+0]];
4914  } else if (idx_arr[3*i+0] == -1 &&
4915  idx_arr[3*i+1] != -1 &&
4916  idx_arr[3*i+2] != -1){
4917  szB1 *= B->lens[idx_arr[3*i+1]];
4918  }
4919  }
4920  cdealloc(idx_arr);
4921 
4922  if (has_weigh){
4923  int64_t A_sz = A->is_sparse ? A->nnz_tot : A->size;
4924  A_sz = std::max(A_sz, std::min(szA1, szA2));
4925  int64_t B_sz = B->is_sparse ? B->nnz_tot : B->size;
4926  B_sz = std::max(B_sz, std::min(szB1, szB2));
4927  int iX;
4928  tensor * X;
4929  int const * idx_X;
4930  if (A_sz < B_sz){
4931  iX = iA;
4932  X = A;
4933  idx_X = idx_A;
4934  } else {
4935  iX = iB;
4936  X = B;
4937  idx_X = idx_B;
4938  }
4939  int * lensX = (int*)alloc(sizeof(int)*(X->order+1));
4940  int * symX = (int*)alloc(sizeof(int)*(X->order+1));
4941  int * nidxX = (int*)alloc(sizeof(int)*(X->order));
4942  int * sidxX = (int*)alloc(sizeof(int)*(X->order+1));
4943  int * cidxX = (int*)alloc(sizeof(int)*(X->order+1));
4944  for (int i=0; i<X->order; i++){
4945  if (i < iX){
4946  lensX[i] = X->lens[i];
4947  symX[i] = X->sym[i];
4948  sidxX[i] = i;
4949  nidxX[i] = i;
4950  cidxX[i] = idx_X[i];
4951  } else {
4952  lensX[i+1] = X->lens[i];
4953  symX[i+1] = X->sym[i];
4954  nidxX[i] = i;
4955  sidxX[i+1] = i;
4956  cidxX[i+1] = idx_X[i];
4957  }
4958  }
4959  lensX[iX] = lensX[iX+1];
4960  symX[iX] = NS;
4961  sidxX[iX] = sidxX[iX+1];
4962  //introduce new contraction index 'num_tot'
4963  cidxX[iX] = num_tot;
4964  char * nname = (char*)alloc(strlen(X->name) + 2);
4965  char d[] = "d";
4966  strcpy(nname, X->name);
4967  strcat(nname, d);
4968  tensor * X2 = new tensor(X->sr, X->order+1, lensX, symX, X->wrld, 1, nname, X->profile, 1);
4969  free(nname);
4970  summation s(X, nidxX, X->sr->mulid(), X2, sidxX, X->sr->mulid());
4971  s.execute();
4972  contraction * nc;
4973  if (A_sz < B_sz){
4974  nc = new contraction(X2, cidxX, B, idx_B, alpha, C, idx_C, beta, func);
4975  nc->idx_B[iB] = num_tot;
4976  } else {
4977  nc = new contraction(A, idx_A, X2, cidxX, alpha, C, idx_C, beta, func);
4978  nc->idx_A[iA] = num_tot;
4979  }
4980  nc->execute();
4981  delete nc;
4982  delete X2;
4983  free(lensX);
4984  free(symX);
4985  free(sidxX);
4986  free(nidxX);
4987  free(cidxX);
4988  return SUCCESS;
4989  }
4990  }
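  // Illustration of the weigh-index elimination above: in C["i"] += A["i"]*B["i"] the index
  // i appears in A, B, and C (a Hadamard index). Assuming A is the smaller operand, it is
  // copied onto the diagonal of a new order-2 tensor D (D["di"] = A["i"] when d == i, zero
  // elsewhere), and the contraction is re-issued as C["i"] += D["di"]*B["d"], so no index
  // remains shared by all three tensors.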
4991 
4992  contraction new_ctr = contraction(*this);
4993 
4994  was_home_A = A->is_home;
4995  was_home_B = B->is_home;
4996  was_home_C = C->is_home;
4997 
4998  if (was_home_A){
4999 // clone_tensor(stype->tid_A, 0, &ntype.tid_A, 0);
5000  new_ctr.A = new tensor(A, 0, 0); //tensors[ntype.tid_A];
5001  new_ctr.A->data = A->data;
5002  new_ctr.A->home_buffer = A->home_buffer;
5003  new_ctr.A->is_home = 1;
5004  new_ctr.A->has_home = 1;
5005  new_ctr.A->home_size = A->home_size;
5006  new_ctr.A->is_mapped = 1;
5007  new_ctr.A->topo = A->topo;
5008  copy_mapping(A->order, A->edge_map, new_ctr.A->edge_map);
5009  new_ctr.A->set_padding();
5010  if (new_ctr.A->is_sparse){
5011  CTF_int::alloc_ptr(new_ctr.A->calc_nvirt()*sizeof(int64_t), (void**)&new_ctr.A->nnz_blk);
5012  new_ctr.A->set_new_nnz_glb(A->nnz_blk);
5013  }
5014  }
5015  if (was_home_B){
5016  if (A == B){ //stype->tid_A == stype->tid_B){
5017  new_ctr.B = new_ctr.A; //tensors[ntype.tid_B];
5018  } else {
5019  new_ctr.B = new tensor(B, 0, 0); //tensors[ntype.tid_A];
5020 /* clone_tensor(stype->tid_B, 0, &ntype.tid_B, 0);
5021  new_ctr.B = tensors[ntype.tid_B];*/
5022  new_ctr.B->data = B->data;
5023  new_ctr.B->home_buffer = B->home_buffer;
5024  new_ctr.B->is_home = 1;
5025  new_ctr.B->has_home = 1;
5026  new_ctr.B->home_size = B->home_size;
5027  new_ctr.B->is_mapped = 1;
5028  new_ctr.B->topo = B->topo;
5029  copy_mapping(B->order, B->edge_map, new_ctr.B->edge_map);
5030  new_ctr.B->set_padding();
5031  if (B->is_sparse){
5032  CTF_int::alloc_ptr(new_ctr.B->calc_nvirt()*sizeof(int64_t), (void**)&new_ctr.B->nnz_blk);
5033  new_ctr.B->set_new_nnz_glb(B->nnz_blk);
5034  }
5035  }
5036  }
5037  if (was_home_C){
5038  ASSERT(!C->is_sparse);
5039  ASSERT(!C->is_sparse);
5040  if (C == A){ //stype->tid_C == stype->tid_A){
5041  new_ctr.C = new_ctr.A; //tensors[ntype.tid_C];
5042  } else if (C == B){ //stype->tid_C == stype->tid_B){
5043  new_ctr.C = new_ctr.B; //tensors[ntype.tid_C];
5044  } else {
5045  new_ctr.C = new tensor(C, 0, 0); //tensors[ntype.tid_C];
5046  /*clone_tensor(stype->tid_C, 0, &ntype.tid_C, 0);
5047  new_ctr.C = tensors[ntype.tid_C];*/
5048  new_ctr.C->data = C->data;
5049  new_ctr.C->home_buffer = C->home_buffer;
5050  new_ctr.C->is_home = 1;
5051  new_ctr.C->has_home = 1;
5052  new_ctr.C->home_size = C->home_size;
5053  new_ctr.C->is_mapped = 1;
5054  new_ctr.C->topo = C->topo;
5055  copy_mapping(C->order, C->edge_map, new_ctr.C->edge_map);
5056  new_ctr.C->set_padding();
5057  }
5058  }
5059 
5060  ret = new_ctr.sym_contract();//&ntype, ftsr, felm, alpha, beta);
5061  if (ret!= SUCCESS) return ret;
5062  if (was_home_A) new_ctr.A->unfold();
5063  if (was_home_B && A != B) new_ctr.B->unfold();
5064  if (was_home_C) new_ctr.C->unfold();
5065 
5066  if (was_home_C && !new_ctr.C->is_home){
5067  if (C->wrld->rank == 0)
5068  DPRINTF(2,"Migrating tensor %s back to home\n", C->name);
5069  distribution dC = distribution(new_ctr.C);
5070 /* save_mapping(new_ctr.C,
5071  &old_phase_C, &old_rank_C,
5072  &old_virt_dim_C, &old_pe_lda_C,
5073  &old_size_C,
5074  &was_cyclic_C, &old_padding_C,
5075  &old_edge_len_C, &topovec[new_ctr.C->itopo]);*/
5076  C->data = new_ctr.C->data;
5077  C->is_home = 0;
5078  TAU_FSTART(redistribute_for_ctr_home);
5079  C->redistribute(dC);
5080 /* remap_tensor(stype->tid_C, C, C->topo, old_size_C,
5081  old_phase_C, old_rank_C, old_virt_dim_C,
5082  old_pe_lda_C, was_cyclic_C,
5083  old_padding_C, old_edge_len_C, global_comm);*/
5084  TAU_FSTOP(redistribute_for_ctr_home);
5085  C->sr->copy(C->home_buffer, C->data, C->size);
5086  C->sr->dealloc(C->data);
5087  C->data = C->home_buffer;
5088  C->is_home = 1;
5089  C->has_home = 1;
5090  C->home_size = C->size;
5091  new_ctr.C->is_data_aliased = 1;
5092  delete new_ctr.C;
5093  } else if (was_home_C) {
5094  /* C->itopo = new_ctr.C->itopo;
5095  copy_mapping(C->order, new_ctr.C->edge_map, C->edge_map);
5096  C->set_padding();*/
5097  ASSERT(new_ctr.C->data == C->data);
5098  new_ctr.C->is_data_aliased = 1;
5099  delete new_ctr.C;
5100  }
5101  if (new_ctr.A != new_ctr.C){ //ntype.tid_A != ntype.tid_C){
5102  if (was_home_A && !new_ctr.A->is_home){
5103  new_ctr.A->has_home = 0;
5104  new_ctr.A->is_home = 0;
5105  if (A->is_sparse){
5106  A->data = new_ctr.A->home_buffer;
5107  new_ctr.A->home_buffer = NULL;
5108  }
5109  delete new_ctr.A;
5110  } else if (was_home_A) {
5111  A->data = new_ctr.A->data;
5112  new_ctr.A->is_data_aliased = 1;
5113  delete new_ctr.A;
5114  }
5115  }
5116  if (new_ctr.B != new_ctr.A && new_ctr.B != new_ctr.C){
5117  if (was_home_B && A != B && !new_ctr.B->is_home){
5118  new_ctr.B->has_home = 0;
5119  new_ctr.B->is_home = 0;
5120  if (B->is_sparse){
5121  B->data = new_ctr.B->home_buffer;
5122  new_ctr.B->home_buffer = NULL;
5123  }
5124  delete new_ctr.B;
5125  } else if (was_home_B && A != B) {
5126  B->data = new_ctr.B->data;
5127  new_ctr.B->is_data_aliased = 1;
5128  delete new_ctr.B;
5129  }
5130  }
5131 
5132  return SUCCESS;
5133  #endif
5134  }
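  // Summary of the home-buffer handling above: operands that are "home" (resident in their
  // original distribution buffer) are wrapped in lightweight clones so the contraction may
  // remap them freely; if the output clone ends up in a different mapping, C is redistributed
  // back into its home buffer before the clones are discarded.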
5135 
5136 
5137 
5138  bool contraction::need_prescale_operands(){
5139  int num_tot, * idx_arr;
5140  inv_idx(A->order, idx_A,
5141  B->order, idx_B,
5142  C->order, idx_C,
5143  &num_tot, &idx_arr);
5144  tensor * T, * V;
5145  int fT, fV;
5146  int * idx_T, * idx_V;
5147  if (A->size <= B->size){
5148  T = A;
5149  V = B;
5150  fT = 0;
5151  fV = 1;
5152  idx_T = idx_A;
5153  idx_V = idx_B;
5154  } else {
5155  T = B;
5156  V = A;
5157  fT = 1;
5158  fV = 0;
5159  idx_T = idx_B;
5160  idx_V = idx_A;
5161  }
5162  for (int iT=0; iT<T->order; iT++){
5163  int i = idx_T[iT];
5164  int iV = idx_arr[3*i+fV];
5165  int iiV = iV;
5166  int iiC = idx_arr[3*i+2];
5167  ASSERT(iT == idx_arr[3*i+fT]);
5168  int npres = 0;
5169  while (T->sym[iT+npres] == SY && iiC == -1 &&
5170  ( (iV == -1 && iiV == -1) ||
5171  (iV != -1 && iiV != -1 && V->sym[iiV] == SY)
5172  )
5173  ){
5174  npres++;
5175  int ii = idx_T[iT+npres];
5176  iiV = idx_arr[3*ii+fV];
5177  iiC = idx_arr[3*ii+2];
5178  }
5179  if (T->sym[iT+npres] == NS){
5180  if (iiC == -1 &&
5181  ( (iV == -1 && iiV == -1) ||
5182  (iV != -1 && iiV != -1 && V->sym[iiV] == NS)
5183  ) )
5184  npres++;
5185  }
5186 
5187  if (npres > 1)
5188  return true;
5189  }
5190 
5191  for (int iV=0; iV<V->order; iV++){
5192  int i = idx_V[iV];
5193  int iiT = idx_arr[3*i+fT];
5194  int iiC = idx_arr[3*i+2];
5195  ASSERT(iV == idx_arr[3*i+fV]);
5196  int npres = 0;
5197  while (V->sym[iV+npres] == SY && iiC == -1 && iiT == -1){
5198  npres++;
5199  int ii = idx_V[iV+npres];
5200  iiT = idx_arr[3*ii+fT];
5201  iiC = idx_arr[3*i+2];
5202  }
5203  if (V->sym[iV+npres] == NS && iiC == -1 && iiT == -1){
5204  npres++;
5205  }
5206  if (npres > 1)
5207  return true;
5208  }
5209  return false;
5210  }
5211 
5212  void contraction::prescale_operands(){
5213  int num_tot, * idx_arr;
5214  inv_idx(A->order, idx_A,
5215  B->order, idx_B,
5216  C->order, idx_C,
5217  &num_tot, &idx_arr);
5218  tensor * T, * V;
5219  int fT, fV;
5220  int * idx_T, * idx_V;
5221  if (A->size <= B->size){
5222  T = A;
5223  V = B;
5224  fT = 0;
5225  fV = 1;
5226  idx_T = idx_A;
5227  idx_V = idx_B;
5228  } else {
5229  T = B;
5230  V = A;
5231  fT = 1;
5232  fV = 0;
5233  idx_T = idx_B;
5234  idx_V = idx_A;
5235  }
5236  for (int iT=0; iT<T->order; iT++){
5237  int i = idx_T[iT];
5238  int iV = idx_arr[3*i+fV];
5239  int iiV = iV;
5240  int iiC = idx_arr[3*i+2];
5241  ASSERT(iT == idx_arr[3*i+fT]);
5242  int npres = 0;
5243  while (T->sym[iT+npres] == SY && iiC == -1 &&
5244  ( (iV == -1 && iiV == -1) ||
5245  (iV != -1 && iiV != -1 && V->sym[iiV] == SY)
5246  )
5247  ){
5248  npres++;
5249  int ii = idx_T[iT+npres];
5250  iiV = idx_arr[3*ii+fV];
5251  iiC = idx_arr[3*ii+2];
5252  }
5253  if (T->sym[iT+npres] == NS){
5254  if (iiC == -1 &&
5255  ( (iV == -1 && iiV == -1) ||
5256  (iV != -1 && iiV != -1 && V->sym[iiV] == NS)
5257  ) )
5258  npres++;
5259  }
5260 
5261  if (npres > 1){
5262  int sym_mask[T->order];
5263  std::fill(sym_mask, sym_mask+T->order, 0);
5264  std::fill(sym_mask+iT, sym_mask+iT+npres, 1);
5265  /*for (int k=0; k<T->order; k++){
5266  printf("sym_mask[%d]=%d\n",k,sym_mask[k]);
5267  }*/
5268 
5269  if (T->is_home){
5270  if (T->wrld->cdt.rank == 0)
5271  DPRINTF(2,"Tensor %s leaving home\n", T->name);
5272  T->data = T->sr->alloc(T->size);
5273  memcpy(T->data, T->home_buffer, T->size*T->sr->el_size);
5274  T->is_home = 0;
5275  }
5276  T->scale_diagonals(sym_mask);
5277  }
5278  iT += std::max(npres-1, 0);
5279  }
5280 
5281  for (int iV=0; iV<V->order; iV++){
5282  int i = idx_V[iV];
5283  int iiT = idx_arr[3*i+fT];
5284  int iiC = idx_arr[3*i+2];
5285  ASSERT(iV == idx_arr[3*i+fV]);
5286  int npres = 0;
5287  while (V->sym[iV+npres] == SY && iiC == -1 && iiT == -1){
5288  npres++;
5289  int ii = idx_V[iV+npres];
5290  iiT = idx_arr[3*ii+fT];
5291  iiC = idx_arr[3*i+2];
5292  }
5293  if (V->sym[iV+npres] == NS && iiC == -1 && iiT == -1){
5294  npres++;
5295  }
5296  if (npres > 1){
5297  if (V->is_home){
5298  if (V->wrld->cdt.rank == 0)
5299  DPRINTF(2,"Tensor %s leaving home\n", V->name);
5300  V->data = (char*)CTF_int::mst_alloc(V->size*V->sr->el_size);
5301  memcpy(V->data, V->home_buffer, V->size*V->sr->el_size);
5302  V->is_home = 0;
5303  }
5304 
5305  int sym_mask[V->order];
5306  std::fill(sym_mask, sym_mask+V->order, 0);
5307  std::fill(sym_mask+iV, sym_mask+iV+npres, 1);
5308  V->scale_diagonals(sym_mask);
5309  }
5310  iV += std::max(npres-1, 0);
5311  }
5312  cdealloc(idx_arr);
5313  }
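  // prescale_operands rescales diagonal entries of runs of SY-symmetric indices that are
  // summed over (absent from C) in the smaller operand T, and likewise in V; per
  // tensor::scale_diagonals, each such element is divided by the number of index
  // permutations equivalent to it, presumably so that packed-symmetric data can be handed
  // to kernels that treat those indices as independent without double counting.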
5314 
5315  void contraction::print(){
5316  int i;
5317  //max = A->order+B->order+C->order;
5318  CommData global_comm = A->wrld->cdt;
5319  MPI_Barrier(global_comm.cm);
5320  if (global_comm.rank == 0){
5321 // printf("Contracting Tensor %s with %s into %s\n", A->name, B->name, C->name);
5322  char cname[200];
5323  cname[0] = '\0';
5324  sprintf(cname, "%s", C->name);
5325  sprintf(cname+strlen(cname),"[");
5326  for (i=0; i<C->order; i++){
5327  if (i>0)
5328  sprintf(cname+strlen(cname)," %d",idx_C[i]);
5329  else
5330  sprintf(cname+strlen(cname),"%d",idx_C[i]);
5331  }
5332  sprintf(cname+strlen(cname),"] <- ");
5333  sprintf(cname+strlen(cname), "%s", A->name);
5334  sprintf(cname+strlen(cname),"[");
5335  for (i=0; i<A->order; i++){
5336  if (i>0)
5337  sprintf(cname+strlen(cname)," %d",idx_A[i]);
5338  else
5339  sprintf(cname+strlen(cname),"%d",idx_A[i]);
5340  }
5341  sprintf(cname+strlen(cname),"]*");
5342  sprintf(cname+strlen(cname), "%s", B->name);
5343  sprintf(cname+strlen(cname),"[");
5344  for (i=0; i<B->order; i++){
5345  if (i>0)
5346  sprintf(cname+strlen(cname)," %d",idx_B[i]);
5347  else
5348  sprintf(cname+strlen(cname),"%d",idx_B[i]);
5349  }
5350  sprintf(cname+strlen(cname),"]");
5351  printf("CTF: Contraction %s\n",cname);
5352 
5353 /*
5354  if (alpha != NULL){
5355  printf("alpha is ");
5356  A->sr->print(alpha);
5357  printf("\nbeta is ");
5358  B->sr->print(beta);
5359  printf("\n");
5360  }
5361 
5362  printf("Contraction index table:\n");
5363  printf(" A B C\n");
5364  for (i=0; i<max; i++){
5365  ex_A=0;
5366  ex_B=0;
5367  ex_C=0;
5368  printf("%d: ",i);
5369  for (j=0; j<A->order; j++){
5370  if (idx_A[j] == i){
5371  ex_A++;
5372  if (A->sym[j] == SY)
5373  printf("%dSY ",j);
5374  else if (A->sym[j] == SH)
5375  printf("%dSH ",j);
5376  else if (A->sym[j] == AS)
5377  printf("%dAS ",j);
5378  else
5379  printf("%d ",j);
5380  }
5381  }
5382  if (ex_A == 0)
5383  printf(" ");
5384  if (ex_A == 1)
5385  printf(" ");
5386  for (j=0; j<B->order; j++){
5387  if (idx_B[j] == i){
5388  ex_B=1;
5389  if (B->sym[j] == SY)
5390  printf("%dSY ",j);
5391  else if (B->sym[j] == SH)
5392  printf("%dSH ",j);
5393  else if (B->sym[j] == AS)
5394  printf("%dAS ",j);
5395  else
5396  printf("%d ",j);
5397  }
5398  }
5399  if (ex_B == 0)
5400  printf(" ");
5401  if (ex_B == 1)
5402  printf(" ");
5403  for (j=0; j<C->order; j++){
5404  if (idx_C[j] == i){
5405  ex_C=1;
5406  if (C->sym[j] == SY)
5407  printf("%dSY ",j);
5408  else if (C->sym[j] == SH)
5409  printf("%dSH ",j);
5410  else if (C->sym[j] == AS)
5411  printf("%dAS ",j);
5412  else
5413  printf("%d ",j);
5414  }
5415  }
5416  printf("\n");
5417  if (ex_A + ex_B + ex_C == 0) break;
5418  }*/
5419  }
5420  }
5421 }