Cyclops Tensor Framework: parallel arithmetic on multidimensional arrays
summation.cxx
1 #include "summation.h"
2 #include "../scaling/strp_tsr.h"
3 #include "../mapping/mapping.h"
4 #include "../mapping/distribution.h"
5 #include "../tensor/untyped_tensor.h"
6 #include "../shared/util.h"
7 #include "../shared/memcontrol.h"
8 #include "sym_seq_sum.h"
9 #include "../symmetry/sym_indices.h"
10 #include "../symmetry/symmetrization.h"
11 #include "../redistribution/nosym_transp.h"
12 #include "../redistribution/redist.h"
13 #include "../scaling/scaling.h"
14 
15 namespace CTF_int {
16 
17  using namespace CTF;
18 
19  summation::~summation(){
20  if (idx_A != NULL) cdealloc(idx_A);
21  if (idx_B != NULL) cdealloc(idx_B);
22  }
23 
24  summation::summation(summation const & other){
25  A = other.A;
26  idx_A = (int*)alloc(sizeof(int)*other.A->order);
27  memcpy(idx_A, other.idx_A, sizeof(int)*other.A->order);
28  B = other.B;
29  idx_B = (int*)alloc(sizeof(int)*other.B->order);
30  memcpy(idx_B, other.idx_B, sizeof(int)*other.B->order);
31  func = other.func;
32  is_custom = other.is_custom;
33  alpha = other.alpha;
34  beta = other.beta;
35  }
36 
37  summation::summation(tensor * A_,
38  int const * idx_A_,
39  char const * alpha_,
40  tensor * B_,
41  int const * idx_B_,
42  char const * beta_){
43  A = A_;
44  alpha = alpha_;
45  B = B_;
46  beta = beta_;
47  is_custom = 0;
48  func = NULL;
49 
50  idx_A = (int*)alloc(sizeof(int)*A->order);
51  idx_B = (int*)alloc(sizeof(int)*B->order);
52 
53  memcpy(idx_A, idx_A_, sizeof(int)*A->order);
54  memcpy(idx_B, idx_B_, sizeof(int)*B->order);
55  }
56 
57  summation::summation(tensor * A_,
58  char const * cidx_A,
59  char const * alpha_,
60  tensor * B_,
61  char const * cidx_B,
62  char const * beta_){
63  A = A_;
64  alpha = alpha_;
65  B = B_;
66  beta = beta_;
67  is_custom = 0;
68  func = NULL;
69 
70  conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
71  }
72 
73 
74  summation::summation(tensor * A_,
75  int const * idx_A_,
76  char const * alpha_,
77  tensor * B_,
78  int const * idx_B_,
79  char const * beta_,
80  univar_function const * func_){
81  A = A_;
82  alpha = alpha_;
83  B = B_;
84  beta = beta_;
85  func = func_;
86  if (func == NULL)
87  is_custom = 0;
88  else
89  is_custom = 1;
90 
91  idx_A = (int*)alloc(sizeof(int)*A->order);
92  idx_B = (int*)alloc(sizeof(int)*B->order);
93 
94  memcpy(idx_A, idx_A_, sizeof(int)*A->order);
95  memcpy(idx_B, idx_B_, sizeof(int)*B->order);
96  }
97 
98 
99  summation::summation(tensor * A_,
100  char const * cidx_A,
101  char const * alpha_,
102  tensor * B_,
103  char const * cidx_B,
104  char const * beta_,
105  univar_function const * func_){
106  A = A_;
107  alpha = alpha_;
108  B = B_;
109  beta = beta_;
110  func = func_;
111  if (func == NULL)
112  is_custom = 0;
113  else
114  is_custom = 1;
115 
116  conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
117  }
118 
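  /* execute() is the top-level entry point for a tensor summation
   * B[idx_B] = beta*B[idx_B] + alpha*A[idx_A]; it dispatches to home_sum_tsr().
   * In practice a summation is usually constructed through the high-level CTF
   * API rather than directly (sketch): with CTF::Tensor<> objects A and B,
   *   B["ij"] += 2.0*A["ij"];
   * builds a CTF_int::summation over the two tensors and invokes execute(). */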
119  void summation::execute(bool run_diag){
120 #if (DEBUG >= 2 || VERBOSE >= 1)
121  #if DEBUG >= 2
122  if (A->wrld->cdt.rank == 0) printf("Summation::execute (head):\n");
123  #endif
124  print();
125 #endif
126  //update_all_models(A->wrld->cdt.cm);
127  int stat = home_sum_tsr(run_diag);
128  if (stat != SUCCESS)
129  printf("CTF ERROR: Failed to perform summation\n");
130  }
131 
132  double summation::estimate_time(){
133  assert(0); //FIXME
134  return 0.0;
135  }
136 
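  /* get_fold_indices() determines which summation indices can be folded
   * (linearized into a single dense block index): an index survives only if
   * every member of its symmetric group appears in both A and B at the same
   * relative offset and with the same symmetry relation; all indices of a
   * broken group are excluded. */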
137  void summation::get_fold_indices(int * num_fold,
138  int ** fold_idx){
139  int i, in, num_tot, nfold, broken;
140  int iA, iB, inA, inB, iiA, iiB;
141  int * idx_arr, * idx;
142  inv_idx(A->order, idx_A,
143  B->order, idx_B,
144  &num_tot, &idx_arr);
145  CTF_int::alloc_ptr(num_tot*sizeof(int), (void**)&idx);
146 
147  for (i=0; i<num_tot; i++){
148  idx[i] = 1;
149  }
150 
151  for (iA=0; iA<A->order; iA++){
152  i = idx_A[iA];
153  iB = idx_arr[2*i+1];
154  broken = 0;
155  inA = iA;
156  do {
157  in = idx_A[inA];
158  inB = idx_arr[2*in+1];
159  if (((inA>=0) + (inB>=0) != 2) ||
160  (iB != -1 && inB - iB != in-i) ||
161  (iB != -1 && A->sym[inA] != B->sym[inB])){
162  broken = 1;
163  // printf("index in = %d inA = %d inB = %d is broken symA = %d symB = %d\n",in, inA, inB, A->sym[inA], B->sym[inB]);
164  }
165  inA++;
166  } while (A->sym[inA-1] != NS);
167  if (broken){
168  for (iiA=iA;iiA<inA;iiA++){
169  idx[idx_A[iiA]] = 0;
170  }
171  }
172  }
173 
174  for (iB=0; iB<B->order; iB++){
175  i = idx_B[iB];
176  iA = idx_arr[2*i+0];
177  broken = 0;
178  inB = iB;
179  do {
180  in = idx_B[inB];
181  inA = idx_arr[2*in+0];
182  if (((inB>=0) + (inA>=0) != 2) ||
183  (iA != -1 && inA - iA != in-i) ||
184  (iA != -1 && B->sym[inB] != A->sym[inA])){
185  broken = 1;
186  }
187  inB++;
188  } while (B->sym[inB-1] != NS);
189  if (broken){
190  for (iiB=iB;iiB<inB;iiB++){
191  idx[idx_B[iiB]] = 0;
192  }
193  }
194  }
195 
196 
197  nfold = 0;
198  for (i=0; i<num_tot; i++){
199  if (idx[i] == 1){
200  idx[nfold] = i;
201  nfold++;
202  }
203  }
204  *num_fold = nfold;
205  *fold_idx = idx;
206  CTF_int::cdealloc(idx_arr);
207  }
208 
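  /* can_fold() decides whether folding is applicable: only for dense operands
   * with no index repeated within A or within B, and only if at least one
   * foldable index exists. */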
209  int summation::can_fold(){
210  int i, j, nfold, * fold_idx;
211  //FIXME: fold sparse tensors into CSR form
212  if (A->is_sparse || B->is_sparse) return 0;
213 
214  for (i=0; i<A->order; i++){
215  for (j=i+1; j<A->order; j++){
216  if (idx_A[i] == idx_A[j]) return 0;
217  }
218  }
219  for (i=0; i<B->order; i++){
220  for (j=i+1; j<B->order; j++){
221  if (idx_B[i] == idx_B[j]) return 0;
222  }
223  }
224  get_fold_indices(&nfold, &fold_idx);
225  CTF_int::cdealloc(fold_idx);
226  /* FIXME: 1 folded index is good enough for now, in the future model */
227  return nfold > 0;
228  }
229 
230  void summation::get_fold_sum(summation *& fold_sum,
231  int & all_fdim_A,
232  int & all_fdim_B,
233  int *& all_flen_A,
234  int *& all_flen_B){
235  int i, j, nfold, nf;
236  int * fold_idx, * fidx_map_A, * fidx_map_B;
237  tensor * ftsr_A, * ftsr_B;
238 
239  get_fold_indices(&nfold, &fold_idx);
240  if (nfold == 0){
241  CTF_int::cdealloc(fold_idx);
242  assert(0);
243  }
244 
245  /* overestimate this space to not bother with it later */
246  CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_map_A);
247  CTF_int::alloc_ptr(nfold*sizeof(int), (void**)&fidx_map_B);
248 
249 
250  A->fold(nfold, fold_idx, idx_A,
251  &all_fdim_A, &all_flen_A);
252  B->fold(nfold, fold_idx, idx_B,
253  &all_fdim_B, &all_flen_B);
254 
255  nf = 0;
256  for (i=0; i<A->order; i++){
257  for (j=0; j<nfold; j++){
258  if (A->sym[i] == NS && idx_A[i] == fold_idx[j]){
259  fidx_map_A[nf] = j;
260  nf++;
261  }
262  }
263  }
264  nf = 0;
265  for (i=0; i<B->order; i++){
266  for (j=0; j<nfold; j++){
267  if (B->sym[i] == NS && idx_B[i] == fold_idx[j]){
268  fidx_map_B[nf] = j;
269  nf++;
270  }
271  }
272  }
273 
274  ftsr_A = A->rec_tsr;
275  ftsr_B = B->rec_tsr;
276 
277  int * sidx_A, * sidx_B;
278  CTF_int::conv_idx<int>(ftsr_A->order, fidx_map_A, &sidx_A,
279  ftsr_B->order, fidx_map_B, &sidx_B);
280 
281  cdealloc(fidx_map_A);
282  cdealloc(fidx_map_B);
283  cdealloc(fold_idx);
284 
285  fold_sum = new summation(A->rec_tsr, sidx_A, alpha, B->rec_tsr, sidx_B, beta);
286  cdealloc(sidx_A);
287  cdealloc(sidx_B);
288  }
289 
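  /* map_fold() creates the folded (nonsymmetric, linearized) view of the
   * summation, permutes the local blocks of A and B into the folded ordering
   * via nosym_transpose, and returns the inner stride, i.e. the number of
   * elements in one folded block of A. */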
290  int summation::map_fold(){
291  int i, all_fdim_A, all_fdim_B;
292  int * fnew_ord_A, * fnew_ord_B;
293  int * all_flen_A, * all_flen_B;
294  int inr_stride;
295 
296  summation * fold_sum;
297  get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
298  #if DEBUG>=2
299  if (A->wrld->rank == 0){
300  printf("Folded summation type:\n");
301  }
302  fold_sum->print();//print_sum(&fold_type,0.0,0.0);
303  #endif
304 
305  //for type order 1 to 3
306  fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B);
307  permute_target(fold_sum->A->order, fnew_ord_A, A->inner_ordering);
308  permute_target(fold_sum->B->order, fnew_ord_B, B->inner_ordering);
309 
310 
311  nosym_transpose(A, all_fdim_A, all_flen_A, A->inner_ordering, 1);
312  /*for (i=0; i<nvirt_A; i++){
313  nosym_transpose(all_fdim_A, A->inner_ordering, all_flen_A,
314  A->data + A->sr->el_size*i*(A->size/nvirt_A), 1, A->sr);
315  }*/
316  nosym_transpose(B, all_fdim_B, all_flen_B, B->inner_ordering, 1);
317  /*for (i=0; i<nvirt_B; i++){
318  nosym_transpose(all_fdim_B, B->inner_ordering, all_flen_B,
319  B->data + B->sr->el_size*i*(B->size/nvirt_B), 1, B->sr);
320  }*/
321 
322  inr_stride = 1;
323  for (i=0; i<fold_sum->A->order; i++){
324  inr_stride *= fold_sum->A->pad_edge_len[i];
325  }
326 
327  CTF_int::cdealloc(fnew_ord_A);
328  CTF_int::cdealloc(fnew_ord_B);
329  CTF_int::cdealloc(all_flen_A);
330  CTF_int::cdealloc(all_flen_B);
331 
332  delete fold_sum;
333 
334  return inr_stride;
335  }
336 
337  double summation::est_time_fold(){
338  int all_fdim_A, all_fdim_B;
339  int * fnew_ord_A, * fnew_ord_B;
340  int * all_flen_A, * all_flen_B;
341  int * tAiord, * tBiord;
342 
343  summation * fold_sum;
344  get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
345  fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B);
346  CTF_int::alloc_ptr(all_fdim_A*sizeof(int), (void**)&tAiord);
347  CTF_int::alloc_ptr(all_fdim_B*sizeof(int), (void**)&tBiord);
348  memcpy(tAiord, A->inner_ordering, all_fdim_A*sizeof(int));
349  memcpy(tBiord, B->inner_ordering, all_fdim_B*sizeof(int));
350 
351  permute_target(fold_sum->A->order, fnew_ord_A, tAiord);
352  permute_target(fold_sum->B->order, fnew_ord_B, tBiord);
353 
354  A->is_folded = 0;
355  delete A->rec_tsr;
356  cdealloc(A->inner_ordering);
357  B->is_folded = 0;
358  delete B->rec_tsr;
359  cdealloc(B->inner_ordering);
360 
361  double esttime = 0.0;
362 
363  esttime += A->calc_nvirt()*est_time_transp(all_fdim_A, tAiord, all_flen_A, 1, A->sr);
364  esttime += 2.*B->calc_nvirt()*est_time_transp(all_fdim_B, tBiord, all_flen_B, 1, B->sr);
365 
366  delete fold_sum;
367 
368  cdealloc(all_flen_A);
369  cdealloc(all_flen_B);
370  cdealloc(tAiord);
371  cdealloc(tBiord);
372  cdealloc(fnew_ord_A);
373  cdealloc(fnew_ord_B);
374  return esttime;
375  }
376 
377 
378 
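  /* get_len_ordering() picks the index ordering used for folding: B's folded
   * indices are kept in sequence and A's are permuted so that matching indices
   * line up with B. */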
379  void summation::get_len_ordering(int ** new_ordering_A,
380  int ** new_ordering_B){
381  int num_tot;
382  int * ordering_A, * ordering_B, * idx_arr;
383 
384  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&ordering_A);
385  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&ordering_B);
386 
387  inv_idx(A->order, idx_A,
388  B->order, idx_B,
389  &num_tot, &idx_arr);
390  ASSERT(num_tot == A->order);
391  ASSERT(num_tot == B->order);
392  /*for (i=0; i<num_tot; i++){
393  ordering_A[i] = idx_arr[2*i];
394  ordering_B[i] = idx_arr[2*i+1];
395  }*/
396  for (int i=0; i<num_tot; i++){
397  ordering_B[i] = i;
398  for (int j=0; j<num_tot; j++){
399  if (idx_A[j] == idx_B[i])
400  ordering_A[i] = j;
401  }
402  }
403  CTF_int::cdealloc(idx_arr);
404  *new_ordering_A = ordering_A;
405  *new_ordering_B = ordering_B;
406  }
407 
408 
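  /* construct_sparse_sum() assembles the recursive sparse-sum object from the
   * outside in: key pinning for each sparse operand (when run on more than one
   * process), a permutation of the sparse data into the destination block
   * layout, replication when an operand has no mode mapped to some processor
   * grid dimension, iteration over virtual blocks, and finally the sequential
   * sparse kernel (seq_tsr_spsum). */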
409  tspsum * summation::construct_sparse_sum(int const * phys_mapped){
410  int nvirt, i, iA, iB, order_tot, is_top, need_rep;
411  int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
412  int nphys_dim;
413  int * virt_dim;
414  int * idx_arr;
415  int * virt_blk_len_A, * virt_blk_len_B;
416  int * blk_len_A, * blk_len_B;
417  mapping * map;
418  tspsum * htsum = NULL , ** rec_tsum = NULL;
419 
420  is_top = 1;
421  inv_idx(A->order, idx_A,
422  B->order, idx_B,
423  &order_tot, &idx_arr);
424 
425  nphys_dim = A->topo->order;
426 
427  CTF_int::alloc_ptr(sizeof(int)*order_tot, (void**)&virt_dim);
428  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&blk_len_A);
429  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&blk_len_B);
430  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&virt_blk_len_A);
431  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&virt_blk_len_B);
432 
433  /* Determine the block dimensions of each local subtensor */
434  blk_sz_A = A->size;
435  blk_sz_B = B->size;
436  calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
437  &vrt_sz_A, virt_blk_len_A, blk_len_A);
438  calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
439  &vrt_sz_B, virt_blk_len_B, blk_len_B);
440 
441  nvirt = 1;
442  for (i=0; i<order_tot; i++){
443  iA = idx_arr[2*i];
444  iB = idx_arr[2*i+1];
445  if (iA != -1){
446  map = &A->edge_map[iA];
447  while (map->has_child) map = map->child;
448  if (map->type == VIRTUAL_MAP){
449  virt_dim[i] = map->np;
450  }
451  else virt_dim[i] = 1;
452  } else {
453  ASSERT(iB!=-1);
454  map = &B->edge_map[iB];
455  while (map->has_child) map = map->child;
456  if (map->type == VIRTUAL_MAP){
457  virt_dim[i] = map->np;
458  }
459  else virt_dim[i] = 1;
460  }
461  nvirt *= virt_dim[i];
462  }
463 
464 
465 
466  if (A->is_sparse){
467  if (A->wrld->np > 1){
468  tspsum_pin_keys * sksum = new tspsum_pin_keys(this, 1);
469  if (is_top){
470  htsum = sksum;
471  is_top = 0;
472  } else {
473  *rec_tsum = sksum;
474  }
475  rec_tsum = &sksum->rec_tsum;
476  }
477 
478  tspsum_permute * pmsum = new tspsum_permute(this, 1, virt_blk_len_A);
479  if (is_top){
480  htsum = pmsum;
481  is_top = 0;
482  } else {
483  *rec_tsum = pmsum;
484  }
485  rec_tsum = &pmsum->rec_tsum;
486  }
487  if (B->is_sparse){
488  if (B->wrld->np > 1){
489  tspsum_pin_keys * sksum = new tspsum_pin_keys(this, 0);
490  if (is_top){
491  htsum = sksum;
492  is_top = 0;
493  } else {
494  *rec_tsum = sksum;
495  }
496  rec_tsum = &sksum->rec_tsum;
497  }
498 
499  tspsum_permute * pmsum = new tspsum_permute(this, 0, virt_blk_len_B);
500  if (is_top){
501  htsum = pmsum;
502  is_top = 0;
503  } else {
504  *rec_tsum = pmsum;
505  }
506  rec_tsum = &pmsum->rec_tsum;
507  }
508 
509 /* bool need_sp_map = false;
510  if (A->is_sparse || B->is_sparse){
511  for (int i=0; i<B->order; i++){
512  bool found_match = false;
513  for (int j=0; j<A->order; j++){
514  if (idx_B[i] == idx_A[j]) found_match = true;
515  }
516  if (!found_match) need_sp_map = true;
517  }
518  }
519 
520  if (need_sp_map){
521  tspsum_map * smsum = new tspsum_map(this);
522  if (is_top){
523  htsum = smsum;
524  is_top = 0;
525  } else {
526  *rec_tsum = smsum;
527  }
528  rec_tsum = &smsum->rec_tsum;
529  }*/
530 
531 
532  need_rep = 0;
533  for (i=0; i<nphys_dim; i++){
534  if (phys_mapped[2*i+0] == 0 ||
535  phys_mapped[2*i+1] == 0){
536  need_rep = 1;
537  break;
538  }
539  }
540 
541  if (need_rep){
542 /* if (A->wrld->cdt.rank == 0)
543  DPRINTF(1,"Replicating tensor\n");*/
544 
545  tspsum_replicate * rtsum = new tspsum_replicate(this, phys_mapped, blk_sz_A, blk_sz_B);
546 
547  if (is_top){
548  htsum = rtsum;
549  is_top = 0;
550  } else {
551  *rec_tsum = rtsum;
552  }
553  rec_tsum = &rtsum->rec_tsum;
554  }
555 
556  /* Sum over virtual sub-blocks */
557  tspsum_virt * tsumv = new tspsum_virt(this);
558  if (is_top) {
559  htsum = tsumv;
560  is_top = 0;
561  } else {
562  *rec_tsum = tsumv;
563  }
564  rec_tsum = &tsumv->rec_tsum;
565 
566  tsumv->num_dim = order_tot;
567  tsumv->virt_dim = virt_dim;
568  tsumv->blk_sz_A = vrt_sz_A;
569  tsumv->blk_sz_B = vrt_sz_B;
570 
571  int * new_sym_A, * new_sym_B;
572  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&new_sym_A);
573  memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
574  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&new_sym_B);
575  memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
576 
577  seq_tsr_spsum * tsumseq = new seq_tsr_spsum(this);
578  tsumseq->is_inner = 0;
579  tsumseq->edge_len_A = virt_blk_len_A;
580  tsumseq->sym_A = new_sym_A;
581  tsumseq->edge_len_B = virt_blk_len_B;
582  tsumseq->sym_B = new_sym_B;
583  tsumseq->is_custom = is_custom;
584  if (is_custom){
585  tsumseq->is_inner = 0;
586  tsumseq->func = func;
587  } else tsumseq->func = NULL;
588 
589  if (is_top) {
590  htsum = tsumseq;
591  is_top = 0;
592  } else {
593  *rec_tsum = tsumseq;
594  }
595 
596  CTF_int::cdealloc(idx_arr);
597  CTF_int::cdealloc(blk_len_A);
598  CTF_int::cdealloc(blk_len_B);
599  return htsum;
600  }
601 
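  /* construct_dense_sum() builds the analogous chain for dense operands:
   * optional stripping when summing over a diagonal slice, replication when a
   * processor grid dimension is unused by one operand, iteration over virtual
   * blocks (if there is more than one), and the sequential kernel
   * (seq_tsr_sum), with folded edge lengths collapsed when an inner stride is
   * supplied. */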
602  tsum * summation::construct_dense_sum(int inner_stride,
603  int const * phys_mapped){
604  int i, iA, iB, order_tot, is_top, sA, sB, need_rep, i_A, i_B, j, k;
605  int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
606  int nphys_dim, nvirt;
607  int * idx_arr, * virt_dim;
608  int * virt_blk_len_A, * virt_blk_len_B;
609  int * blk_len_A, * blk_len_B;
610  tsum * htsum = NULL , ** rec_tsum = NULL;
611  mapping * map;
612  strp_tsr * str_A, * str_B;
613 
614  is_top = 1;
615  inv_idx(A->order, idx_A,
616  B->order, idx_B,
617  &order_tot, &idx_arr);
618 
619  nphys_dim = A->topo->order;
620 
621  CTF_int::alloc_ptr(sizeof(int)*order_tot, (void**)&virt_dim);
622  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&blk_len_A);
623  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&blk_len_B);
624  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&virt_blk_len_A);
625  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&virt_blk_len_B);
626 
627  /* Determine the block dimensions of each local subtensor */
628  blk_sz_A = A->size;
629  blk_sz_B = B->size;
630  calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
631  &vrt_sz_A, virt_blk_len_A, blk_len_A);
632  calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
633  &vrt_sz_B, virt_blk_len_B, blk_len_B);
634  /* Strip out the relevant part of the tensor if we are summing over a diagonal */
635  sA = strip_diag(A->order, order_tot, idx_A, vrt_sz_A,
636  A->edge_map, A->topo, A->sr,
637  blk_len_A, &blk_sz_A, &str_A);
638  sB = strip_diag(B->order, order_tot, idx_B, vrt_sz_B,
639  B->edge_map, B->topo, B->sr,
640  blk_len_B, &blk_sz_B, &str_B);
641  if (sA || sB){
642  if (A->wrld->cdt.rank == 0)
643  DPRINTF(1,"Stripping tensor\n");
644  strp_sum * ssum = new strp_sum(this);
645  ssum->sr_A = A->sr;
646  ssum->sr_B = B->sr;
647  htsum = ssum;
648  is_top = 0;
649  rec_tsum = &ssum->rec_tsum;
650 
651  ssum->rec_strp_A = str_A;
652  ssum->rec_strp_B = str_B;
653  ssum->strip_A = sA;
654  ssum->strip_B = sB;
655  }
656 
657 
658  nvirt = 1;
659  for (i=0; i<order_tot; i++){
660  iA = idx_arr[2*i];
661  iB = idx_arr[2*i+1];
662  if (iA != -1){
663  map = &A->edge_map[iA];
664  while (map->has_child) map = map->child;
665  if (map->type == VIRTUAL_MAP){
666  virt_dim[i] = map->np;
667  if (sA) virt_dim[i] = virt_dim[i]/str_A->strip_dim[iA];
668  }
669  else virt_dim[i] = 1;
670  } else {
671  ASSERT(iB!=-1);
672  map = &B->edge_map[iB];
673  while (map->has_child) map = map->child;
674  if (map->type == VIRTUAL_MAP){
675  virt_dim[i] = map->np;
676  if (sB) virt_dim[i] = virt_dim[i]/str_B->strip_dim[iB]; // iA is -1 in this branch, so index B's stripped modes
677  }
678  else virt_dim[i] = 1;
679  }
680  nvirt *= virt_dim[i];
681  }
682 
683  need_rep = 0;
684  for (i=0; i<nphys_dim; i++){
685  if (phys_mapped[2*i+0] == 0 ||
686  phys_mapped[2*i+1] == 0){
687  need_rep = 1;
688  break;
689  }
690  }
691 
692  if (need_rep){
693 /* if (A->wrld->cdt.rank == 0)
694  DPRINTF(1,"Replicating tensor\n");*/
695 
696  tsum_replicate * rtsum = new tsum_replicate(this, phys_mapped, blk_sz_A, blk_sz_B);
697 
698  if (is_top){
699  htsum = rtsum;
700  is_top = 0;
701  } else {
702  *rec_tsum = rtsum;
703  }
704  rec_tsum = &rtsum->rec_tsum;
705  }
706 
707  /* Sum over virtual sub-blocks */
708  if (nvirt > 1){
709  tsum_virt * tsumv = new tsum_virt(this);
710  if (is_top) {
711  htsum = tsumv;
712  is_top = 0;
713  } else {
714  *rec_tsum = tsumv;
715  }
716  rec_tsum = &tsumv->rec_tsum;
717 
718  tsumv->num_dim = order_tot;
719  tsumv->virt_dim = virt_dim;
720  tsumv->blk_sz_A = vrt_sz_A;
721  tsumv->blk_sz_B = vrt_sz_B;
722  } else CTF_int::cdealloc(virt_dim);
723  int * new_sym_A, * new_sym_B;
724  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&new_sym_A);
725  memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
726  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&new_sym_B);
727  memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
728 
729  seq_tsr_sum * tsumseq = new seq_tsr_sum(this);
730  if (inner_stride == -1){
731  tsumseq->is_inner = 0;
732  } else {
733  tsumseq->is_inner = 1;
734  tsumseq->inr_stride = inner_stride;
735  tensor * itsr;
736  itsr = A->rec_tsr;
737  i_A = 0;
738  for (i=0; i<A->order; i++){
739  if (A->sym[i] == NS){
740  for (j=0; j<itsr->order; j++){
741  if (A->inner_ordering[j] == i_A){
742  j=i;
743  do {
744  j--;
745  } while (j>=0 && A->sym[j] != NS);
746  for (k=j+1; k<=i; k++){
747  virt_blk_len_A[k] = 1;
748  new_sym_A[k] = NS;
749  }
750  break;
751  }
752  }
753  i_A++;
754  }
755  }
756  itsr = B->rec_tsr;
757  i_B = 0;
758  for (i=0; i<B->order; i++){
759  if (B->sym[i] == NS){
760  for (j=0; j<itsr->order; j++){
761  if (B->inner_ordering[j] == i_B){
762  j=i;
763  do {
764  j--;
765  } while (j>=0 && B->sym[j] != NS);
766  for (k=j+1; k<=i; k++){
767  virt_blk_len_B[k] = 1;
768  new_sym_B[k] = NS;
769  }
770  break;
771  }
772  }
773  i_B++;
774  }
775  }
776  }
777  tsumseq->edge_len_A = virt_blk_len_A;
778  tsumseq->sym_A = new_sym_A;
779  tsumseq->edge_len_B = virt_blk_len_B;
780  tsumseq->sym_B = new_sym_B;
781  tsumseq->is_custom = is_custom;
782  if (is_custom){
783  tsumseq->is_inner = 0;
784  tsumseq->func = func;
785  } else tsumseq->func = NULL;
786  if (is_top) {
787  htsum = tsumseq;
788  is_top = 0;
789  } else {
790  *rec_tsum = tsumseq;
791  }
792 
793  CTF_int::cdealloc(idx_arr);
794  CTF_int::cdealloc(blk_len_A);
795  CTF_int::cdealloc(blk_len_B);
796  return htsum;
797 
798  }
799 
800 
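  /* construct_sum() records, for every physical processor grid dimension,
   * whether A and B each have a mode mapped onto it, then dispatches to the
   * sparse or dense constructor above. */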
801  tsum * summation::construct_sum(int inner_stride){
802  int i;
803  int nphys_dim;
804  int * phys_mapped;
805  tsum * htsum;
806  mapping * map;
807 
808  nphys_dim = A->topo->order;
809 
810  CTF_int::alloc_ptr(sizeof(int)*nphys_dim*2, (void**)&phys_mapped);
811  memset(phys_mapped, 0, sizeof(int)*nphys_dim*2);
812 
813 
814 
815  for (i=0; i<A->order; i++){
816  map = &A->edge_map[i];
817  if (map->type == PHYSICAL_MAP){
818  phys_mapped[2*map->cdt+0] = 1;
819  }
820  while (map->has_child) {
821  map = map->child;
822  if (map->type == PHYSICAL_MAP){
823  phys_mapped[2*map->cdt+0] = 1;
824  }
825  }
826  }
827  for (i=0; i<B->order; i++){
828  map = &B->edge_map[i];
829  if (map->type == PHYSICAL_MAP){
830  phys_mapped[2*map->cdt+1] = 1;
831  }
832  while (map->has_child) {
833  map = map->child;
834  if (map->type == PHYSICAL_MAP){
835  phys_mapped[2*map->cdt+1] = 1;
836  }
837  }
838  }
839  if (A->is_sparse || B->is_sparse){
840  htsum = construct_sparse_sum(phys_mapped);
841  } else {
842  htsum = construct_dense_sum(inner_stride, phys_mapped);
843  }
844 
845  CTF_int::cdealloc(phys_mapped);
846 
847  return htsum;
848  }
849 
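  /* home_sum_tsr() protects the "home" (original) buffers of the operands:
   * tensors that currently live in their home layout are wrapped in temporary
   * non-home shells that alias the same data, the sum is performed on the
   * shells (which may redistribute them), and B is migrated back to its home
   * distribution afterwards. */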
850  int summation::home_sum_tsr(bool run_diag){
851  int ret, was_home_A, was_home_B;
852  tensor * tnsr_A, * tnsr_B;
853  // the code below turns summations into scalings, but it never seems to be invoked in AQ or test_suite, so it is commented out for now
854 /* if (A==B && !is_custom){
855  bool is_scal = true;
856  for (int i=0; i<A->order; i++){
857  if (idx_A[i] != idx_B[i]) is_scal = false;
858  }
859  if (is_scal){
860  if (alpha == NULL && beta == NULL){
861  scaling scl = scaling(A, idx_A, NULL);
862  scl.execute();
863  } else {
864  char nalpha[A->sr->el_size];
865  if (alpha == NULL) A->sr->copy(nalpha, A->sr->mulid());
866  else A->sr->copy(nalpha, alpha);
867 
868  if (beta == NULL) A->sr->add(nalpha, A->sr->mulid(), nalpha);
869  else A->sr->add(nalpha, beta, nalpha);
870 
871  scaling scl = scaling(A, idx_A, nalpha);
872  scl.execute();
873  }
874  return SUCCESS;
875  }
876  }*/
877 
878  summation osum = summation(*this);
879 
881 
882  A->unfold();
883  B->unfold();
884  // FIXME: if a custom function is used, we currently don't know whether it's odd, even, or neither, so unpack everything
885  /*if (is_custom){
886  bool is_nonsym=true;
887  for (int i=0; i<A->order; i++){
888  if (A->sym[i] != NS){
889  is_nonsym = false;
890  }
891  }
892  if (!is_nonsym){
893  int sym_A[A->order];
894  std::fill(sym_A, sym_A+A->order, NS);
895  int idx_A[A->order];
896  for (int i=0; i<A->order; i++){
897  idx_A[i] = i;
898  }
899  tensor tA(A->sr, A->order, A->lens, sym_A, A->wrld, 1);
900  tA.is_home = 0;
901  tA.has_home = 0;
902  summation st(A, idx_A, A->sr->mulid(), &tA, idx_A, A->sr->mulid());
903  st.execute();
904  summation stme(*this);
905  stme.A = &tA;
906  stme.execute();
907  return SUCCESS;
908  }
909  }
910  if (is_custom){
911  bool is_nonsym=true;
912  for (int i=0; i<B->order; i++){
913  if (B->sym[i] != NS){
914  is_nonsym = false;
915  }
916  }
917  if (!is_nonsym){
918  int sym_B[B->order];
919  std::fill(sym_B, sym_B+B->order, NS);
920  int idx_B[B->order];
921  for (int i=0; i<B->order; i++){
922  idx_B[i] = i;
923  }
924  tensor tB(B->sr, B->order, B->lens, sym_B, B->wrld, 1);
925  tB.is_home = 0;
926  tB.has_home = 0;
927  if (!B->sr->isequal(B->sr->addid(), beta)){
928  summation st(B, idx_B, B->sr->mulid(), &tB, idx_B, B->sr->mulid());
929  st.execute();
930  }
931  summation stme(*this);
932  stme.B = &tB;
933  stme.execute();
934  summation stme2(&tB, idx_B, B->sr->mulid(), B, idx_B, B->sr->addid());
935  stme2.execute();
936  return SUCCESS;
937  }
938  }*/
939 
940  #ifndef HOME_CONTRACT
941  #ifdef USE_SYM_SUM
942  ret = sym_sum_tsr(run_diag);
943  return ret;
944  #else
945  ret = sum_tensors(run_diag);
946  return ret;
947  #endif
948  #else
949  if (A->has_zero_edge_len ||
950  B->has_zero_edge_len){
951  if (!B->sr->isequal(beta,B->sr->mulid()) && !B->has_zero_edge_len){
952  int sub_idx_map_B[B->order];
953  int sm_idx=0;
954  for (int i=0; i<B->order; i++){
955  sub_idx_map_B[i]=sm_idx;
956  sm_idx++;
957  for (int j=0; j<i; j++){
958  if (idx_B[i]==idx_B[j]){
959  sub_idx_map_B[i]=sub_idx_map_B[j];
960  sm_idx--;
961  break;
962  }
963  }
964  }
965  scaling scl = scaling(B, sub_idx_map_B, beta);
966  scl.execute();
967  }
968  return SUCCESS;
969  }
970  if (A == B){
971  tensor * cpy_tsr_A = new tensor(A);
972  osum.A = cpy_tsr_A;
973  osum.execute();
974  delete cpy_tsr_A;
975  return SUCCESS;
976  }
977  was_home_A = A->is_home;
978  was_home_B = B->is_home;
979  if (was_home_A){
980  tnsr_A = new tensor(A,0,0);
981  tnsr_A->data = A->data;
982  tnsr_A->home_buffer = A->home_buffer;
983  tnsr_A->is_home = 1;
984  tnsr_A->has_home = 1;
985  tnsr_A->home_size = A->home_size;
986  tnsr_A->is_mapped = 1;
987  tnsr_A->topo = A->topo;
988  copy_mapping(A->order, A->edge_map, tnsr_A->edge_map);
989  tnsr_A->set_padding();
990  if (A->is_sparse){
991  CTF_int::alloc_ptr(tnsr_A->calc_nvirt()*sizeof(int64_t), (void**)&tnsr_A->nnz_blk);
992  tnsr_A->set_new_nnz_glb(A->nnz_blk);
993  }
994  osum.A = tnsr_A;
995  } else tnsr_A = NULL;
996  if (was_home_B){
997  tnsr_B = new tensor(B,0,0);
998  tnsr_B->data = B->data;
999  tnsr_B->home_buffer = B->home_buffer;
1000  tnsr_B->is_home = 1;
1001  tnsr_B->has_home = 1;
1002  tnsr_B->home_size = B->home_size;
1003  tnsr_B->is_mapped = 1;
1004  tnsr_B->topo = B->topo;
1005  copy_mapping(B->order, B->edge_map, tnsr_B->edge_map);
1006  tnsr_B->set_padding();
1007  if (B->is_sparse){
1008  CTF_int::alloc_ptr(tnsr_B->calc_nvirt()*sizeof(int64_t), (void**)&tnsr_B->nnz_blk);
1009  tnsr_B->set_new_nnz_glb(B->nnz_blk);
1010  }
1011  osum.B = tnsr_B;
1012  } else tnsr_B = NULL;
1013  #if DEBUG >= 2
1014  if (A->wrld->cdt.rank == 0)
1015  printf("Start head sum:\n");
1016  #endif
1017 
1018  #ifdef USE_SYM_SUM
1019  ret = osum.sym_sum_tsr(run_diag);
1020  #else
1021  ret = osum.sum_tensors(run_diag);
1022  #endif
1023  #if DEBUG >= 2
1024  if (A->wrld->cdt.rank == 0)
1025  printf("End head sum:\n");
1026  #endif
1027 
1028  if (ret!= SUCCESS) return ret;
1029  if (was_home_A) tnsr_A->unfold();
1030  else A->unfold();
1031  if (was_home_B){
1032  tnsr_B->unfold();
1033  if (B->is_sparse){
1034  cdealloc(B->nnz_blk);
1035  //do below manually rather than calling set_new_nnz_glb since virt factor may be different
1036  CTF_int::alloc_ptr(tnsr_B->calc_nvirt()*sizeof(int64_t), (void**)&B->nnz_blk);
1037  for (int i=0; i<tnsr_B->calc_nvirt(); i++){
1038  B->nnz_blk[i] = tnsr_B->nnz_blk[i];
1039  }
1040  B->nnz_loc = tnsr_B->nnz_loc;
1041  B->nnz_tot = tnsr_B->nnz_tot;
1042  }
1043  B->data = tnsr_B->data;
1044  } else B->unfold();
1045 
1046  if (was_home_B && !tnsr_B->is_home){
1047  if (A->wrld->cdt.rank == 0)
1048  DPRINTF(1,"Migrating tensor %s back to home\n", B->name);
1049  distribution odst(tnsr_B);
1050  B->is_home = 0;
1051  B->has_home = 0;
1052  TAU_FSTART(redistribute_for_sum_home);
1053  B->redistribute(odst);
1054  TAU_FSTOP(redistribute_for_sum_home);
1055  if (!B->is_sparse){
1056  B->sr->copy(B->home_buffer, B->data, B->size);
1057  B->sr->dealloc(B->data);
1058  B->data = B->home_buffer;
1059  } else if (tnsr_B->home_buffer != NULL) {
1060  tnsr_B->sr->pair_dealloc(tnsr_B->home_buffer);
1061  tnsr_B->home_buffer = NULL;
1062  }
1063  tnsr_B->is_data_aliased = 1;
1064  tnsr_B->is_home = 0;
1065  tnsr_B->has_home = 0;
1066  B->is_home = 1;
1067  B->has_home = 1;
1068  delete tnsr_B;
1069  } else if (was_home_B){
1070  if (!B->is_sparse){
1071  if (tnsr_B->data != B->data){
1072  printf("Tensor %s is a copy of %s and did not leave home but buffer is %p was %p\n", tnsr_B->name, B->name, tnsr_B->data, B->data);
1073  ABORT;
1074  }
1075  } else
1076  B->data = tnsr_B->data;
1077  tnsr_B->has_home = 0;
1078  tnsr_B->is_home = 0;
1079  tnsr_B->is_data_aliased = 1;
1080  delete tnsr_B;
1081  }
1082  if (was_home_A && !tnsr_A->is_home){
1083  if (A->is_sparse){
1084  A->data = tnsr_A->home_buffer;
1085  tnsr_A->home_buffer = NULL;
1086  tnsr_A->has_home = 1;
1087  tnsr_A->is_home = 1;
1088  } else {
1089  tnsr_A->has_home = 0;
1090  tnsr_A->is_home = 0;
1091  }
1092  delete tnsr_A;
1093  } else if (was_home_A) {
1094  tnsr_A->has_home = 0;
1095  tnsr_A->is_home = 0;
1096  tnsr_A->is_data_aliased = 1;
1097  if (A->is_sparse)
1098  A->data = tnsr_A->data;
1099  delete tnsr_A;
1100  }
1101  return ret;
1102  #endif
1103  }
1104 
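  /* sym_sum_tsr() handles symmetry: it reduces repeated indices of A when the
   * function allows it, extracts diagonals, aligns symmetric index groups and
   * scales alpha by the overcounting factor and sign, desymmetrizes or
   * symmetrizes an operand when a symmetry is broken, and finally performs the
   * required symmetric permutations via sum_tensors(). */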
1105  int summation::sym_sum_tsr(bool run_diag){
1106  int sidx, i, nst_B, * new_idx_map;
1107  int * map_A, * map_B;
1108  int ** dstack_map_B;
1109  tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
1110  std::vector<summation> perm_types;
1111  std::vector<int> signs;
1112  char const * dbeta;
1113  #if (DEBUG >= 2)
1114  print();
1115  #endif
1116  bool is_cons = check_consistency();
1117  if (!is_cons) return ERROR;
1118 
1119  A->unfold();
1120  B->unfold();
1121  if (A->has_zero_edge_len || B->has_zero_edge_len){
1122  if (!B->sr->isequal(beta, B->sr->mulid()) && !B->has_zero_edge_len){
1123  int sub_idx_map_B[B->order];
1124  int sm_idx=0;
1125  for (int i=0; i<B->order; i++){
1126  sub_idx_map_B[i]=sm_idx;
1127  sm_idx++;
1128  for (int j=0; j<i; j++){
1129  if (idx_B[i]==idx_B[j]){
1130  sub_idx_map_B[i]=sub_idx_map_B[j];
1131  sm_idx--;
1132  break;
1133  }
1134  }
1135  }
1136  scaling scl = scaling(B, sub_idx_map_B, beta);
1137  scl.execute();
1138  }
1139  return SUCCESS;
1140  }
1141 
1142 
1143 
1144  int * new_idx_A, * new_idx_B;
1145  if (!is_custom || func->is_distributive){
1146  tensor * new_tsr_A = A->self_reduce(idx_A, &new_idx_A, B->order, idx_B, &new_idx_B);
1147  if (new_tsr_A != A) {
1148  summation s(new_tsr_A, new_idx_A, alpha, B, new_idx_B, beta, func);
1149  s.execute();
1150  delete new_tsr_A;
1151  cdealloc(new_idx_A);
1152  cdealloc(new_idx_B);
1153  return SUCCESS;
1154  }
1155  }
1156 
1157  // If we have sparsity, use a separate mechanism
1158  /*if (A->is_sparse || B->is_sparse){
1159  sp_sum();
1160  return SUCCESS;
1161  }*/
1162  tnsr_A = A;
1163  tnsr_B = B;
1164  char * new_alpha = (char*)alloc(tnsr_B->sr->el_size);
1165  CTF_int::alloc_ptr(sizeof(int)*tnsr_A->order, (void**)&map_A);
1166  CTF_int::alloc_ptr(sizeof(int)*tnsr_B->order, (void**)&map_B);
1167  CTF_int::alloc_ptr(sizeof(int*)*tnsr_B->order, (void**)&dstack_map_B);
1168  CTF_int::alloc_ptr(sizeof(tensor*)*tnsr_B->order, (void**)&dstack_tsr_B);
1169  memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
1170  memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
1171  while (!run_diag && tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx_map) == SUCCESS){
1172  if (tnsr_A != A) delete tnsr_A;
1173  CTF_int::cdealloc(map_A);
1174  tnsr_A = new_tsr;
1175  map_A = new_idx_map;
1176  }
1177  nst_B = 0;
1178  while (!run_diag && tnsr_B->extract_diag(map_B, 1, new_tsr, &new_idx_map) == SUCCESS){
1179  dstack_map_B[nst_B] = map_B;
1180  dstack_tsr_B[nst_B] = tnsr_B;
1181  nst_B++;
1182  tnsr_B = new_tsr;
1183  map_B = new_idx_map;
1184  }
1185 
1186  summation new_sum = summation(*this);
1187  new_sum.A = tnsr_A;
1188  new_sum.B = tnsr_B;
1189  memcpy(new_sum.idx_A, map_A, sizeof(int)*tnsr_A->order);
1190  memcpy(new_sum.idx_B, map_B, sizeof(int)*tnsr_B->order);
1191  if (tnsr_A == tnsr_B){
1192  tensor nnew_tsr = tensor(tnsr_A);
1193  new_sum.A = &nnew_tsr;
1194  new_sum.B = tnsr_B;
1195  new_sum.sym_sum_tsr(run_diag);
1196 
1197  /*clone_tensor(ntid_A, 1, &new_tid);
1198  new_type = *type;
1199  new_type.tid_A = new_tid;
1200  stat = sym_sum_tsr(alpha_, beta, &new_type, ftsr, felm, run_diag);
1201  del_tsr(new_tid);
1202  return stat;*/
1203  } else {
1204 
1205  /* new_type.tid_A = ntid_A;
1206  new_type.tid_B = ntid_B;
1207  new_type.idx_map_A = map_A;
1208  new_type.idx_map_B = map_B;*/
1209 
1210  //FIXME: make these typefree...
1211  int sign = align_symmetric_indices(tnsr_A->order,
1212  new_sum.idx_A,
1213  tnsr_A->sym,
1214  tnsr_B->order,
1215  new_sum.idx_B,
1216  tnsr_B->sym);
1217  int ocfact = overcounting_factor(tnsr_A->order,
1218  new_sum.idx_A,
1219  tnsr_A->sym,
1220  tnsr_B->order,
1221  new_sum.idx_B,
1222  tnsr_B->sym);
1223 
1224  if (ocfact != 1 || sign != 1){
1225  if (ocfact != 1){
1226  tnsr_B->sr->safecopy(new_alpha, tnsr_B->sr->addid());
1227 
1228  for (int i=0; i<ocfact; i++){
1229  tnsr_B->sr->add(new_alpha, alpha, new_alpha);
1230  }
1231  alpha = new_alpha;
1232  }
1233  if (sign == -1){
1234  tnsr_B->sr->safeaddinv(alpha, new_alpha);
1235  alpha = new_alpha;
1236  }
1237  }
1238 
1239  /*if (A->sym[0] == SY && B->sym[0] == AS){
1240  print();
1241  ASSERT(0);
1242  }*/
1243  if (new_sum.unfold_broken_sym(NULL) != -1){
1244  if (A->wrld->cdt.rank == 0)
1245  DPRINTF(1, "Permutational symmetry is broken\n");
1246 
1247  summation * unfold_sum;
1248  sidx = new_sum.unfold_broken_sym(&unfold_sum);
1249  int sy;
1250  sy = 0;
1251  int sidx2 = unfold_sum->unfold_broken_sym(NULL);
1252  if (sidx%2 == 0 && (A->sym[sidx/2] == SY || unfold_sum->A->sym[sidx/2] == SY)) sy = 1;
1253  if (sidx%2 == 1 && (B->sym[sidx/2] == SY || unfold_sum->B->sym[sidx/2] == SY)) sy = 1;
1254  if (!A->is_sparse && !B->is_sparse && (sidx2 != -1 ||
1255  (sy && (sidx%2 == 0 || !tnsr_B->sr->isequal(new_sum.beta, tnsr_B->sr->addid()))))){
1256  if (sidx%2 == 0){
1257  if (unfold_sum->A->sym[sidx/2] == NS){
1258  if (A->wrld->cdt.rank == 0)
1259  DPRINTF(1,"Performing operand desymmetrization for summation of A idx=%d\n",sidx/2);
1260  desymmetrize(tnsr_A, unfold_sum->A, 0);
1261  } else {
1262  if (A->wrld->cdt.rank == 0)
1263  DPRINTF(1,"Performing operand symmetrization for summation\n");
1264  symmetrize(unfold_sum->A, tnsr_A);
1265  }
1266  //unfold_sum->B = tnsr_B;
1267  unfold_sum->sym_sum_tsr(run_diag);
1268  // sym_sum_tsr(alpha, beta, &unfold_type, ftsr, felm, run_diag);
1269  if (tnsr_A != unfold_sum->A){
1270  unfold_sum->A->unfold();
1271  tnsr_A->pull_alias(unfold_sum->A);
1272  delete unfold_sum->A;
1273  }
1274  } else {
1275  //unfold_sum->A = tnsr_A;
1276  if (A->wrld->cdt.rank == 0)
1277  DPRINTF(1,"Performing product desymmetrization for summation\n");
1278  desymmetrize(tnsr_B, unfold_sum->B, 1);
1279  unfold_sum->sym_sum_tsr(run_diag);
1280  if (A->wrld->cdt.rank == 0)
1281  DPRINTF(1,"Performing product symmetrization for summation\n");
1282  if (tnsr_B->data != unfold_sum->B->data && !tnsr_B->sr->isequal(tnsr_B->sr->mulid(), unfold_sum->beta)){
1283  int sidx_B[tnsr_B->order];
1284  for (int iis=0; iis<tnsr_B->order; iis++){
1285  sidx_B[iis] = iis;
1286  }
1287  scaling sscl = scaling(tnsr_B, sidx_B, unfold_sum->beta);
1288  sscl.execute();
1289  }
1290  symmetrize(tnsr_B, unfold_sum->B);
1291 
1292  // sym_sum_tsr(alpha, beta, &unfold_type, ftsr, felm, run_diag);
1293  if (tnsr_B != unfold_sum->B){
1294  unfold_sum->B->unfold();
1295  tnsr_B->pull_alias(unfold_sum->B);
1296  delete unfold_sum->B;
1297  }
1298  }
1299  } else {
1300  if (sidx != -1 && sidx%2 == 1){
1301  delete unfold_sum->B;
1302  } else if (sidx != -1 && sidx%2 == 0){
1303  delete unfold_sum->A;
1304  }
1305  //get_sym_perms(&new_type, alpha, perm_types, signs);
1306  get_sym_perms(new_sum, perm_types, signs);
1307  if (A->wrld->cdt.rank == 0)
1308  DPRINTF(1,"Performing %d summation permutations\n",
1309  (int)perm_types.size());
1310  dbeta = beta;
1311  char * new_alpha = (char*)alloc(tnsr_B->sr->el_size);
1312 
1313  tensor * inv_tsr_A = NULL;
1314  bool need_inv = false;
1315  // if we have no multiplicative operator, we must invert the sign manually
1316  if (tnsr_B->sr->mulid() == NULL){
1317  for (i=0; i<(int)perm_types.size(); i++){
1318  if (signs[i] == -1)
1319  need_inv = true;
1320  }
1321  if (need_inv){
1322  inv_tsr_A = new tensor(tnsr_A);
1323  inv_tsr_A->addinv();
1324  }
1325  }
1326  for (i=0; i<(int)perm_types.size(); i++){
1327  // if the sign is negative and there is no multiplicative identity (group), use the additively inverted copy of A
1328  if (signs[i] == -1 && need_inv){
1329  perm_types[i].A = inv_tsr_A;
1330  } else {
1331  if (signs[i] == 1)
1332  tnsr_B->sr->safecopy(new_alpha, alpha);
1333  else
1334  tnsr_B->sr->safeaddinv(alpha, new_alpha);
1335  perm_types[i].alpha = new_alpha;
1336  }
1337  perm_types[i].beta = dbeta;
1338  perm_types[i].sum_tensors(run_diag);
1339  dbeta = new_sum.B->sr->mulid();
1340  }
1341  cdealloc(new_alpha);
1342  if (need_inv){
1343  delete inv_tsr_A;
1344  }
1345  /* for (i=0; i<(int)perm_types.size(); i++){
1346  free_type(&perm_types[i]);
1347  }*/
1348  perm_types.clear();
1349  signs.clear();
1350  }
1351  delete unfold_sum;
1352  } else {
1353  new_sum.alpha = alpha;
1354  new_sum.sum_tensors(run_diag);
1355  /* sum_tensors(alpha, beta, new_type.tid_A, new_type.tid_B, new_type.idx_map_A,
1356  new_type.idx_map_B, ftsr, felm, run_diag);*/
1357  }
1358  }
1359  if (tnsr_A != A) delete tnsr_A;
1360  for (i=nst_B-1; i>=0; i--){
1361 // extract_diag(dstack_tid_B[i], dstack_map_B[i], 0, &ntid_B, &new_idx_map);
1362  dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
1363  //del_tsr(ntid_B);
1364  delete tnsr_B;
1365  cdealloc(dstack_map_B[i]);
1366  cdealloc(new_idx_map);
1367  tnsr_B = dstack_tsr_B[i];
1368  }
1369  ASSERT(tnsr_B == B);
1370  CTF_int::cdealloc(new_alpha);
1371  CTF_int::cdealloc(map_A);
1372  CTF_int::cdealloc(map_B);
1373  CTF_int::cdealloc(dstack_map_B);
1374  CTF_int::cdealloc(dstack_tsr_B);
1375 
1376  return SUCCESS;
1377  }
1378 
1379 
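  /* sum_tensors() performs the mapped sum: it verifies (and if necessary
   * repairs) the distribution of A and B on the processor grid, decides
   * whether folding pays off based on estimated transpose cost, constructs the
   * recursive tsum object, runs it, and then undoes any diagonal extraction. */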
1384  int summation::sum_tensors(bool run_diag){
1385  int stat, * new_idx_map;
1386  int * map_A, * map_B;
1387  int nst_B;
1388  int ** dstack_map_B;
1389  tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
1390 // tsum<dtype> * sumf;
1391  tsum * sumf;
1392  //check_sum(tid_A, tid_B, idx_map_A, idx_map_B);
1393  //FIXME: hmm all of the below already takes place in sym_sum
1394  TAU_FSTART(sum_preprocessing);
1395  check_consistency();
1396  A->unfold();
1397  B->unfold();
1398  if (A->has_zero_edge_len || B->has_zero_edge_len){
1399  if (!B->sr->isequal(beta,B->sr->mulid()) && !B->has_zero_edge_len){
1400  /* fseq_scl<dtype> fs;
1401  fs.func_ptr=sym_seq_scl_ref<dtype>;
1402  fseq_elm_scl<dtype> felm;
1403  felm.func_ptr = NULL;*/
1404  int sub_idx_map_B[B->order];
1405  int sm_idx=0;
1406  for (int i=0; i<B->order; i++){
1407  sub_idx_map_B[i]=sm_idx;
1408  sm_idx++;
1409  for (int j=0; j<i; j++){
1410  if (idx_B[i]==idx_B[j]){
1411  sub_idx_map_B[i]=sub_idx_map_B[j];
1412  sm_idx--;
1413  break;
1414  }
1415  }
1416  }
1417  scaling scl = scaling(B, sub_idx_map_B, beta);
1418  scl.execute();
1419  }
1420  TAU_FSTOP(sum_preprocessing);
1421  return SUCCESS;
1422  }
1423 
1424 
1425  //FIXME: remove all of the below, sum_tensors should never be called without sym_sum
1426  CTF_int::alloc_ptr(sizeof(int)*A->order, (void**)&map_A);
1427  CTF_int::alloc_ptr(sizeof(int)*B->order, (void**)&map_B);
1428  CTF_int::alloc_ptr(sizeof(int*)*B->order, (void**)&dstack_map_B);
1429  CTF_int::alloc_ptr(sizeof(tensor*)*B->order, (void**)&dstack_tsr_B);
1430  tnsr_A = A;
1431  tnsr_B = B;
1432  memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
1433  memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
1434  while (!run_diag && tnsr_A->extract_diag(map_A, 1, new_tsr, &new_idx_map) == SUCCESS){
1435  if (tnsr_A != A) delete tnsr_A;
1436  CTF_int::cdealloc(map_A);
1437  tnsr_A = new_tsr;
1438  map_A = new_idx_map;
1439  }
1440  nst_B = 0;
1441  while (!run_diag && tnsr_B->extract_diag(map_B, 1, new_tsr, &new_idx_map) == SUCCESS){
1442  dstack_map_B[nst_B] = map_B;
1443  dstack_tsr_B[nst_B] = tnsr_B;
1444  nst_B++;
1445  tnsr_B = new_tsr;
1446  map_B = new_idx_map;
1447  }
1448 
1449  if (!tnsr_A->is_sparse && tnsr_B->is_sparse){
1450  tensor * stnsr_A = tnsr_A;
1451  tnsr_A = new tensor(stnsr_A);
1452  tnsr_A->sparsify();
1453  if (A != stnsr_A) delete stnsr_A;
1454 
1455  }
1456 
1457  TAU_FSTOP(sum_preprocessing);
1458 
1459  // FIXME: if A has self indices and function is distributive, presum A first, otherwise if it is dense and B is sparse, sparsify A
1460  summation new_sum = summation(*this);
1461  new_sum.A = tnsr_A;
1462  new_sum.B = tnsr_B;
1463  if (tnsr_A == tnsr_B){
1464  tensor * nnew_tsr = new tensor(tnsr_A);
1465  new_sum.A = nnew_tsr;
1466  new_sum.B = tnsr_B;
1467  } else{
1468  //FIXME: remove the below, sum_tensors should never be called without sym_sum
1469  int sign = align_symmetric_indices(tnsr_A->order,
1470  new_sum.idx_A,
1471  tnsr_A->sym,
1472  tnsr_B->order,
1473  new_sum.idx_B,
1474  tnsr_B->sym);
1475 
1476  #if DEBUG >= 1
1477  new_sum.print();
1478  #endif
1479 
1480  ASSERT(sign == 1);
1481 /* if (sign == -1){
1482  char * new_alpha = (char*)malloc(tnsr_B->sr->el_size);
1483  tnsr_B->sr->addinv(alpha, new_alpha);
1484  alpha = new_alpha;
1485  }*/
1486 
1487  #if 0 //VERIFY
1488  int64_t nsA, nsB;
1489  int64_t nA, nB;
1490  dtype * sA, * sB;
1491  dtype * uA, * uB;
1492  int order_A, order_B, i;
1493  int * edge_len_A, * edge_len_B;
1494  int * sym_A, * sym_B;
1495  stat = allread_tsr(ntid_A, &nsA, &sA);
1496  assert(stat == SUCCESS);
1497 
1498  stat = allread_tsr(ntid_B, &nsB, &sB);
1499  assert(stat == SUCCESS);
1500  #endif
1501 
1502  TAU_FSTART(sum_tensors);
1503 
1504  TAU_FSTART(sum_tensors_map);
1505 
1506  /* Check if the current tensor mappings can be summed on */
1507  #if REDIST
1508  if (1) {
1509  #else
1510  if (new_sum.check_mapping() == 0) {
1511  #endif
1512  #if DEBUG == 2
1513  if (A->wrld->cdt.rank == 0){
1514  printf("Remapping tensors for sum:\n");
1515  }
1516  #endif
1517  /* remap if necessary */
1518  stat = new_sum.map();
1519  if (stat == ERROR) {
1520  printf("Failed to map tensors to physical grid\n");
1521  TAU_FSTOP(sum_tensors);
1522  TAU_FSTOP(sum_tensors_map);
1523  return ERROR;
1524  }
1525  } else {
1526  #if DEBUG > 2
1527  if (A->wrld->cdt.rank == 0){
1528  printf("Keeping mappings:\n");
1529  }
1530  tnsr_A->print_map(stdout);
1531  tnsr_B->print_map(stdout);
1532  #endif
1533  }
1534  /* Construct the tensor algorithm we would like to use */
1535  ASSERT(new_sum.check_mapping());
1536  #if FOLD_TSR
1537  if (is_custom == false && new_sum.can_fold()){
1538  //FIXME bit of a guess, no?
1539  double est_time_nofold = 4.*(A->size + B->size)*COST_MEMBW;
1540  if (new_sum.est_time_fold() + (A->size + B->size)*COST_MEMBW < est_time_nofold){
1541  /*if (A->wrld->cdt.rank == 0)
1542  printf("Decided to fold\n");*/
1543  int inner_stride;
1544  TAU_FSTART(map_fold);
1545  inner_stride = new_sum.map_fold();
1546  TAU_FSTOP(map_fold);
1547  sumf = new_sum.construct_sum(inner_stride);
1548  } else {
1549  /*if (A->wrld->cdt.rank == 0)
1550  printf("Decided not to fold\n");*/
1551 
1552  sumf = new_sum.construct_sum();
1553  }
1554  } else{
1555  #if DEBUG >= 1
1556  if (A->wrld->cdt.rank == 0){
1557  printf("Could not fold summation, is_custom = %d, new_sum.can_fold = %d\n", is_custom, new_sum.can_fold());
1558  }
1559  #endif
1560  sumf = new_sum.construct_sum();
1561  }
1562  /*sumf = construct_sum(alpha, beta, ntid_A, map_A, ntid_B, map_B,
1563  ftsr, felm);*/
1564  #else
1565  sumf = new_sum.construct_sum();
1566  /*sumf = construct_sum(alpha, beta, ntid_A, map_A, ntid_B, map_B,
1567  ftsr, felm);*/
1568  #endif
1569  /*TAU_FSTART(zero_sum_padding);
1570  stat = zero_out_padding(ntid_A);
1571  TAU_FSTOP(zero_sum_padding);
1572  TAU_FSTART(zero_sum_padding);
1573  stat = zero_out_padding(ntid_B);
1574  TAU_FSTOP(zero_sum_padding);*/
1575  DEBUG_PRINTF("[%d] performing tensor sum\n", A->wrld->cdt.rank);
1576  #if DEBUG >=3
1577  /*if (A->wrld->cdt.rank == 0){
1578  for (int i=0; i<tensors[ntid_A]->order; i++){
1579  printf("padding[%d] = %d\n",i, tensors[ntid_A]->padding[i]);
1580  }
1581  for (int i=0; i<tensors[ntid_B]->order; i++){
1582  printf("padding[%d] = %d\n",i, tensors[ntid_B]->padding[i]);
1583  }
1584  }*/
1585  #endif
1586 
1587  TAU_FSTOP(sum_tensors_map);
1588  #if DEBUG >= 2
1589  if (tnsr_B->wrld->rank==0)
1590  sumf->print();
1591  tnsr_A->print_map();
1592  tnsr_B->print_map();
1593  #endif
1594  /* Invoke the summation algorithm */
1595 #ifdef PROFILE
1596  TAU_FSTART(pre_sum_func_barrier);
1597  MPI_Barrier(tnsr_B->wrld->comm);
1598  TAU_FSTOP(pre_sum_func_barrier);
1599 #endif
1600  TAU_FSTART(activate_topo);
1601  tnsr_A->topo->activate();
1602  TAU_FSTOP(activate_topo);
1603  TAU_FSTART(sum_func);
1604  sumf->run();
1605  TAU_FSTOP(sum_func);
1606  if (tnsr_B->is_sparse){
1607  tspsum * spsumf = (tspsum*)sumf;
1608  if (tnsr_B->data != spsumf->new_B){
1609  tnsr_B->sr->pair_dealloc(tnsr_B->data);
1610  tnsr_B->data = spsumf->new_B;
1611  //tnsr_B->nnz_loc = spsumf->new_nnz_B;
1612  }
1613  tnsr_B->set_new_nnz_glb(tnsr_B->nnz_blk);
1614  ASSERT(tnsr_B->nnz_loc == spsumf->new_nnz_B);
1615  }
1616  /*tnsr_B->unfold();
1617  tnsr_B->print();
1618  MPI_Barrier(tnsr_B->wrld->comm);
1619  if (tnsr_B->wrld->rank==1){
1620  for (int i=0; i<tnsr_B->size; i++){
1621  printf("[%d] %dth element ",tnsr_B->wrld->rank,i);
1622  tnsr_B->sr->print(tnsr_B->data+i*tnsr_B->sr->el_size);
1623  printf("\n");
1624  }
1625  }*/
1626 #ifdef PROFILE
1627  TAU_FSTART(post_sum_func_barrier);
1628  MPI_Barrier(tnsr_B->wrld->comm);
1629  TAU_FSTOP(post_sum_func_barrier);
1630 #endif
1631  TAU_FSTART(sum_postprocessing);
1632  tnsr_A->topo->deactivate();
1633  tnsr_A->unfold();
1634  tnsr_B->unfold();
1635 #ifndef SEQ
1636  //FIXME: when is this actually needed? can we do it in sym_sum instead?
1637  stat = tnsr_B->zero_out_padding();
1638 #endif
1639 
1640  #if 0 //VERIFY
1641  stat = allread_tsr(ntid_A, &nA, &uA);
1642  assert(stat == SUCCESS);
1643  stat = get_info(ntid_A, &order_A, &edge_len_A, &sym_A);
1644  assert(stat == SUCCESS);
1645 
1646  stat = allread_tsr(ntid_B, &nB, &uB);
1647  assert(stat == SUCCESS);
1648  stat = get_info(ntid_B, &order_B, &edge_len_B, &sym_B);
1649  assert(stat == SUCCESS);
1650 
1651  if (nsA != nA) { printf("nsA = %" PRId64 ", nA = %" PRId64 "\n",nsA,nA); ABORT; }
1652  if (nsB != nB) { printf("nsB = %" PRId64 ", nB = %" PRId64 "\n",nsB,nB); ABORT; }
1653  for (i=0; (int64_t)i<nA; i++){
1654  if (fabs(uA[i] - sA[i]) > 1.E-6){
1655  printf("A[i] = %lf, sA[i] = %lf\n", uA[i], sA[i]);
1656  }
1657  }
1658 
1659  cpy_sym_sum(alpha, uA, order_A, edge_len_A, edge_len_A, sym_A, map_A,
1660  beta, sB, order_B, edge_len_B, edge_len_B, sym_B, map_B);
1661  assert(stat == SUCCESS);
1662 
1663  for (i=0; (int64_t)i<nB; i++){
1664  if (fabs(uB[i] - sB[i]) > 1.E-6){
1665  printf("B[%d] = %lf, sB[%d] = %lf\n", i, uB[i], i, sB[i]);
1666  }
1667  assert(fabs(sB[i] - uB[i]) < 1.E-6);
1668  }
1669  CTF_int::cdealloc(uA);
1670  CTF_int::cdealloc(uB);
1671  CTF_int::cdealloc(sA);
1672  CTF_int::cdealloc(sB);
1673  #endif
1674 
1675  delete sumf;
1676  if (tnsr_A != A){
1677  delete tnsr_A;
1678  }
1679  for (int i=nst_B-1; i>=0; i--){
1680  int ret = dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
1681  ASSERT(ret == SUCCESS);
1682  delete tnsr_B;
1683  tnsr_B = dstack_tsr_B[i];
1684  }
1685  ASSERT(tnsr_B == B);
1686  }
1687  //#ifndef SEQ
1688  //stat = B->zero_out_padding();
1689  //#endif
1690  CTF_int::cdealloc(map_A);
1691  CTF_int::cdealloc(map_B);
1692  CTF_int::cdealloc(dstack_map_B);
1693  CTF_int::cdealloc(dstack_tsr_B);
1694  TAU_FSTOP(sum_postprocessing);
1695 
1696  TAU_FSTOP(sum_tensors);
1697  return SUCCESS;
1698  }
1699 
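  /* unfold_broken_sym() finds the first symmetric index pair whose symmetry is
   * not preserved by this summation; it returns 2*i for a position i in A,
   * 2*i+1 for a position in B, or -1 if nothing is broken, and optionally
   * returns a copy of the summation with that operand's symmetry relaxed. */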
1700  int summation::unfold_broken_sym(summation ** nnew_sum){
1701  int sidx, i, num_tot, iA, iA2, iB;
1702  int * idx_arr;
1703  int bsym = NS;
1704  summation * new_sum;
1705 
1706  if (nnew_sum != NULL){
1707  new_sum = new summation(*this);
1708  *nnew_sum = new_sum;
1709  } else new_sum = NULL;
1710 
1711  inv_idx(A->order, idx_A,
1712  B->order, idx_B,
1713  &num_tot, &idx_arr);
1714 
1715  sidx = -1;
1716  for (i=0; i<A->order; i++){
1717  if (A->sym[i] != NS){
1718  iA = idx_A[i];
1719  if (idx_arr[2*iA+1] != -1){
1720  if (B->sym[idx_arr[2*iA+1]] == NS ||
1721  ((B->sym[idx_arr[2*iA+1]] == AS) ^ (A->sym[i] == AS)) ||
1722  idx_arr[2*idx_A[i+1]+1] == -1 ||
1723  idx_A[i+1] != idx_B[idx_arr[2*iA+1]+1]){
1724  sidx = 2*i;
1725  break;
1726  }
1727  } else if (idx_arr[2*idx_A[i+1]+1] != -1){
1728  sidx = 2*i;
1729  break;
1730  }
1731  }
1732  }
1733  if (sidx == -1){
1734  for (i=0; i<B->order; i++){
1735  if (B->sym[i] != NS){
1736  iB = idx_B[i];
1737  if (idx_arr[2*iB+0] != -1){
1738  if (A->sym[idx_arr[2*iB+0]] == NS ||
1739  ((A->sym[idx_arr[2*iB+0]] == AS) ^ (B->sym[i] == AS)) ||
1740  idx_arr[2*idx_B[i+1]+0] == -1 ||
1741  idx_B[i+1] != idx_A[idx_arr[2*iB+0]+1]){
1742  sidx = 2*i+1;
1743  break;
1744  } else if (A->sym[idx_arr[2*iB+0]] == NS){
1745  sidx = 2*i;
1746  bsym = B->sym[i];
1747  break;
1748  }
1749  } else if (idx_arr[2*idx_B[i+1]+0] != -1){
1750  sidx = 2*i+1;
1751  break;
1752  }
1753  }
1754  }
1755  }
1756  // if we have e.g. b[""] = A["ij"] with SY A, the symmetry is preserved but we need to account for the diagonal, so this just unpacks (FIXME: suboptimal)
1757  if (sidx == -1){
1758  for (i=0; i<A->order; i++){
1759  if (A->sym[i] == SY){
1760  iA = idx_A[i];
1761  iA2 = idx_A[i+1];
1762  if (idx_arr[2*iA+1] == -1 &&
1763  idx_arr[2*iA2+1] == -1){
1764  sidx = 2*i;
1765  break;
1766  }
1767  }
1768  }
1769  }
1770  if (nnew_sum != NULL && sidx != -1){
1771  if(sidx%2 == 0){
1772  new_sum->A = new tensor(A, 0, 0);
1773  int nA_sym[A->order];
1774  memcpy(nA_sym, new_sum->A->sym, sizeof(int)*new_sum->A->order);
1775  nA_sym[sidx/2] = bsym;
1776  new_sum->A->set_sym(nA_sym);
1777  } else {
1778  new_sum->B = new tensor(B, 0, 0);
1779  int nB_sym[B->order];
1780  memcpy(nB_sym, new_sum->B->sym, sizeof(int)*new_sum->B->order);
1781  nB_sym[sidx/2] = NS;
1782  new_sum->B->set_sym(nB_sym);
1783  }
1784  }
1785  CTF_int::cdealloc(idx_arr);
1786  return sidx;
1787  }
1788 
1789  bool summation::check_consistency(){
1790  int i, num_tot, len;
1791  int iA, iB;
1792  int * idx_arr;
1793 
1794  inv_idx(A->order, idx_A,
1795  B->order, idx_B,
1796  &num_tot, &idx_arr);
1797 
1798  for (i=0; i<num_tot; i++){
1799  len = -1;
1800  iA = idx_arr[2*i+0];
1801  iB = idx_arr[2*i+1];
1802  if (iA != -1){
1803  len = A->lens[iA];
1804  }
1805  if (len != -1 && iB != -1 && len != B->lens[iB]){
1806  if (A->wrld->cdt.rank == 0){
1807  printf("i = %d Error in sum call: The %dth edge length (%d) of tensor %s does not ",
1808  i, iA, len, A->name);
1809  printf("match the %dth edge length (%d) of tensor %s.\n",
1810  iB, B->lens[iB], B->name);
1811  }
1812  return false;
1813  }
1814  }
1815  CTF_int::cdealloc(idx_arr);
1816  return true;
1817 
1818  }
1819 
1820 
1821  int summation::is_equal(summation const & os){
1822  int i;
1823 
1824  if (A != os.A) return 0;
1825  if (B != os.B) return 0;
1826 
1827  for (i=0; i<A->order; i++){
1828  if (idx_A[i] != os.idx_A[i]) return 0;
1829  }
1830  for (i=0; i<B->order; i++){
1831  if (idx_B[i] != os.idx_B[i]) return 0;
1832  }
1833  return 1;
1834  }
1835 
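  /* check_mapping() returns 1 if the current distributions of A and B can be
   * summed in place: both operands mapped and unfolded, on the same topology,
   * with valid self-mappings, matching indices mapped identically, and no
   * physical grid dimension shared by two modes that appear in only one of the
   * tensors. */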
1836  int summation::check_mapping(){
1837  int i, pass, order_tot, iA, iB;
1838  int * idx_arr, * phys_map;
1839  //mapping * map;
1840 
1841  TAU_FSTART(check_sum_mapping);
1842  pass = 1;
1843 
1844  ASSERT(A->is_mapped);
1845  ASSERT(B->is_mapped);
1846 
1847  if (A->is_mapped == 0) pass = 0;
1848  if (B->is_mapped == 0) pass = 0;
1849 
1850 
1851  if (A->is_folded == 1) pass = 0;
1852  if (B->is_folded == 1) pass = 0;
1853 
1854  if (A->topo != B->topo) pass = 0;
1855 
1856  if (pass==0){
1857  DPRINTF(4,"failed confirmation here\n");
1858  TAU_FSTOP(check_sum_mapping);
1859  return 0;
1860  }
1861 
1862  CTF_int::alloc_ptr(sizeof(int)*A->topo->order, (void**)&phys_map);
1863  memset(phys_map, 0, sizeof(int)*A->topo->order);
1864 
1865  inv_idx(A->order, idx_A,
1866  B->order, idx_B,
1867  &order_tot, &idx_arr);
1868 
1869  if (!check_self_mapping(A, idx_A))
1870  pass = 0;
1871  if (!check_self_mapping(B, idx_B))
1872  pass = 0;
1873  if (pass == 0)
1874  DPRINTF(4,"failed confirmation here\n");
1875 
1876  for (i=0; i<order_tot; i++){
1877  iA = idx_arr[2*i];
1878  iB = idx_arr[2*i+1];
1879  if (iA != -1 && iB != -1) {
1880  if (!comp_dim_map(&A->edge_map[iA], &B->edge_map[iB])){
1881  pass = 0;
1882  DPRINTF(4,"failed confirmation here i=%d\n",i);
1883  }
1884  }
1885  if (iA != -1 && iB == -1) {
1886  mapping * map = &A->edge_map[iA];
1887  while (map->type == PHYSICAL_MAP){
1888  phys_map[map->cdt]++;
1889  if (map->has_child) map = map->child;
1890  else break;
1891  }
1892  }
1893  if (iB != -1 && iA == -1){
1894  mapping * map = &B->edge_map[iB];
1895  while (map->type == PHYSICAL_MAP){
1896  phys_map[map->cdt]++;
1897  if (map->has_child) map = map->child;
1898  else break;
1899  }
1900  }
1901  }
1902  /* Ensure that a replicated and a reduced mode are not mapped to processor grid dimensions not used by the other tensor */
1903  for (i=0; i<A->topo->order; i++){
1904  if (phys_map[i] > 1) {
1905  pass = 0;
1906  DPRINTF(3,"failed confirmation here i=%d\n",i);
1907  }
1908  }
1909 
1910  CTF_int::cdealloc(phys_map);
1911  CTF_int::cdealloc(idx_arr);
1912 
1913  // if we don't have an additive identity we can't replicate
1914  if (B->sr->addid() == NULL || B->is_sparse){
1915  int ndim_mapped = 0;
1916  for (int j=0; j<B->order; j++){
1917  mapping * map = &B->edge_map[j];
1918  if (map->type == PHYSICAL_MAP) ndim_mapped++;
1919  while (map->has_child) {
1920  map = map->child;
1921  if (map->type == PHYSICAL_MAP)
1922  ndim_mapped++;
1923  }
1924  }
1925  if (ndim_mapped < B->topo->order) pass = 0;
1926  }
1927 
1928  TAU_FSTOP(check_sum_mapping);
1929 
1930  return pass;
1931  }
1932 
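  /* map_sum_indices() maps the indices shared by A and B onto the processor
   * grid by building an auxiliary tensor over those indices (inheriting their
   * lengths and symmetries), running map_tensor() on it, and copying the
   * resulting mapping back onto the corresponding modes of A and B. */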
1933  int summation::map_sum_indices(topology const * topo){
1934  int tsr_order, isum, iA, iB, i, j, jsum, jX, stat;
1935  int * tsr_edge_len, * tsr_sym_table, * restricted;
1936  int * idx_arr, * idx_sum;
1937  int num_sum, num_tot, idx_num;
1938  idx_num = 2;
1939  mapping * sum_map;
1940 
1941  TAU_FSTART(map_sum_indices);
1942 
1943  inv_idx(A->order, idx_A,
1944  B->order, idx_B,
1945  &num_tot, &idx_arr);
1946 
1947  CTF_int::alloc_ptr(sizeof(int)*num_tot, (void**)&idx_sum);
1948 
1949  num_sum = 0;
1950  for (i=0; i<num_tot; i++){
1951  if (idx_arr[2*i] != -1 && idx_arr[2*i+1] != -1){
1952  idx_sum[num_sum] = i;
1953  num_sum++;
1954  }
1955  }
1956 
1957  tsr_order = num_sum;
1958 
1959 
1960  CTF_int::alloc_ptr(tsr_order*sizeof(int), (void**)&restricted);
1961  CTF_int::alloc_ptr(tsr_order*sizeof(int), (void**)&tsr_edge_len);
1962  CTF_int::alloc_ptr(tsr_order*tsr_order*sizeof(int), (void**)&tsr_sym_table);
1963  CTF_int::alloc_ptr(tsr_order*sizeof(mapping), (void**)&sum_map);
1964 
1965  memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
1966  memset(restricted, 0, tsr_order*sizeof(int));
1967 
1968  for (i=0; i<tsr_order; i++){
1969  sum_map[i].type = NOT_MAPPED;
1970  sum_map[i].has_child = 0;
1971  sum_map[i].np = 1;
1972  }
1973  for (i=0; i<num_sum; i++){
1974  isum = idx_sum[i];
1975  iA = idx_arr[isum*2+0];
1976  iB = idx_arr[isum*2+1];
1977 
1978  if (A->edge_map[iA].type != NOT_MAPPED){
1979  ASSERT(B->edge_map[iB].type == NOT_MAPPED);
1980  copy_mapping(1, &A->edge_map[iA], &sum_map[i]);
1981  } else if (B->edge_map[iB].type != NOT_MAPPED){
1982  copy_mapping(1, &B->edge_map[iB], &sum_map[i]);
1983  }
1984  }
1985 
1986  /* Map an auxiliary tensor whose order equals the number of summed indices.
1987  * Set the edge lengths and symmetries according to those of the summed dims of A and B.
1988  * This gives us a mapping for the common mapped dimensions of tensors A and B. */
1989  for (i=0; i<num_sum; i++){
1990  isum = idx_sum[i];
1991  iA = idx_arr[isum*idx_num+0];
1992  iB = idx_arr[isum*idx_num+1];
1993 
1994  tsr_edge_len[i] = A->pad_edge_len[iA];
1995 
1996  /* Check if A has symmetry among the dimensions being contracted over.
1997  * Ignore symmetry with non-contraction dimensions.
1998  * FIXME: this algorithm can be more efficient but should not be a bottleneck */
1999  if (A->sym[iA] != NS){
2000  for (j=0; j<num_sum; j++){
2001  jsum = idx_sum[j];
2002  jX = idx_arr[jsum*idx_num+0];
2003  if (jX == iA+1){
2004  tsr_sym_table[i*tsr_order+j] = 1;
2005  tsr_sym_table[j*tsr_order+i] = 1;
2006  }
2007  }
2008  }
2009  if (B->sym[iB] != NS){
2010  for (j=0; j<num_sum; j++){
2011  jsum = idx_sum[j];
2012  jX = idx_arr[jsum*idx_num+1];
2013  if (jX == iB+1){
2014  tsr_sym_table[i*tsr_order+j] = 1;
2015  tsr_sym_table[j*tsr_order+i] = 1;
2016  }
2017  }
2018  }
2019  }
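        // Note: sym[i] != NS in CTF indicates that dimension i is (anti)symmetric with dimension i+1,
        // hence the jX == iA+1 / jX == iB+1 checks above: when both halves of a symmetric pair are
        // summed dimensions, the pair is recorded in tsr_sym_table so that map_tensor maps them consistently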
2020  /* Run the mapping algorithm on this construct */
2021  stat = map_tensor(topo->order, tsr_order,
2022  tsr_edge_len, tsr_sym_table,
2023  restricted, topo->dim_comm,
2024  NULL, 0,
2025  sum_map);
2026 
2027  if (stat == ERROR){
2028  TAU_FSTOP(map_sum_indices);
2029  return ERROR;
2030  }
2031 
2032  /* define mapping of tensors A and B according to the mapping of sum dims */
2033  if (stat == SUCCESS){
2034  for (i=0; i<num_sum; i++){
2035  isum = idx_sum[i];
2036  iA = idx_arr[isum*idx_num+0];
2037  iB = idx_arr[isum*idx_num+1];
2038 
2039  copy_mapping(1, &sum_map[i], &A->edge_map[iA]);
2040  copy_mapping(1, &sum_map[i], &B->edge_map[iB]);
2041  }
2042  }
2043  CTF_int::cdealloc(restricted);
2044  CTF_int::cdealloc(tsr_edge_len);
2045  CTF_int::cdealloc(tsr_sym_table);
2046  for (i=0; i<num_sum; i++){
2047  sum_map[i].clear();
2048  }
2049  CTF_int::cdealloc(sum_map);
2050  CTF_int::cdealloc(idx_sum);
2051  CTF_int::cdealloc(idx_arr);
2052 
2053  TAU_FSTOP(map_sum_indices);
2054  return stat;
2055 
2056  }
2057 
2058  int summation::map(){
2059  int i, ret, need_remap;
2060  int need_remap_A, need_remap_B;
2061  int d;
2062  topology * old_topo_A, * old_topo_B;
2063  int btopo;
2064  int gtopo;
2065 
2066  ASSERT(A->wrld->cdt.cm == B->wrld->cdt.cm);
2067  World * wrld = A->wrld;
2068 
2069  TAU_FSTART(map_tensor_pair);
2070  #if DEBUG >= 2
2071  if (wrld->rank == 0)
2072  printf("Initial mappings:\n");
2073  A->print_map(stdout);
2074  B->print_map(stdout);
2075  #endif
2076 
2077  //FIXME: try to avoid unfolding immediately, as it's not always necessary
2078  A->unfold();
2079  B->unfold();
2080  A->set_padding();
2081  B->set_padding();
2082 
2083 
2084  distribution dA(A);
2085  distribution dB(B);
2086  old_topo_A = A->topo;
2087  old_topo_B = B->topo;
2088  mapping * old_map_A = new mapping[A->order];
2089  mapping * old_map_B = new mapping[B->order];
2090  copy_mapping(A->order, A->edge_map, old_map_A);
2091  copy_mapping(B->order, B->edge_map, old_map_B);
2092  btopo = -1;
2093  int64_t size;
2094  int64_t min_size = INT64_MAX;
2095  /* Attempt to map to every candidate processor topology; each topology is tried twice (mapping A's indices first or B's first), candidates are cycled over the ranks, and the best is chosen collectively via get_best_topo below */
2096  for (i=A->wrld->cdt.rank; i<2*(int)A->wrld->topovec.size(); i+=A->wrld->cdt.np){
2097  // for (i=global_comm.rank*topovec.size(); i<2*(int)topovec.size(); i++){
2098  A->clear_mapping();
2099  B->clear_mapping();
2100  A->set_padding();
2101  B->set_padding();
2102 
2103  A->topo = wrld->topovec[i/2];
2104  B->topo = wrld->topovec[i/2];
2105  A->is_mapped = 1;
2106  B->is_mapped = 1;
2107 
2108  if (i%2 == 0){
2109  ret = map_self_indices(A, idx_A);
2110  if (ret == NEGATIVE) continue;
2111  else if (ret != SUCCESS) return ret;
2112  } else {
2113  ret = map_self_indices(B, idx_B);
2114  if (ret == NEGATIVE) continue;
2115  else if (ret != SUCCESS) return ret;
2116  }
2117  ret = map_sum_indices(A->topo);
2118  if (ret == NEGATIVE) continue;
2119  else if (ret != SUCCESS){
2120  return ret;
2121  }
2122  if (i%2 == 0){
2123  ret = map_self_indices(A, idx_A);
2124  if (ret == NEGATIVE) continue;
2125  else if (ret != SUCCESS) return ret;
2126  } else {
2127  ret = map_self_indices(B, idx_B);
2128  if (ret == NEGATIVE) continue;
2129  else if (ret != SUCCESS) return ret;
2130  }
2131 
2132  if (i%2 == 0){
2133  ret = map_self_indices(A, idx_A);
2134  if (ret == NEGATIVE) continue;
2135  else if (ret != SUCCESS) return ret;
2136  ret = A->map_tensor_rem(A->topo->order,
2137  A->topo->dim_comm);
2138  if (ret == NEGATIVE) continue;
2139  else if (ret != SUCCESS) return ret;
2140  copy_mapping(A->order, B->order,
2141  idx_A, A->edge_map,
2142  idx_B, B->edge_map,0);
2143  ret = B->map_tensor_rem(B->topo->order,
2144  B->topo->dim_comm);
2145  if (ret == NEGATIVE) continue;
2146  else if (ret != SUCCESS) return ret;
2147  } else {
2148  ret = map_self_indices(B, idx_B);
2149  if (ret == NEGATIVE) continue;
2150  else if (ret != SUCCESS) return ret;
2151  ret = B->map_tensor_rem(B->topo->order,
2152  B->topo->dim_comm);
2153  if (ret == NEGATIVE) continue;
2154  else if (ret != SUCCESS) return ret;
2155  copy_mapping(B->order, A->order,
2156  idx_B, B->edge_map,
2157  idx_A, A->edge_map,0);
2158  ret = A->map_tensor_rem(A->topo->order,
2159  A->topo->dim_comm);
2160  if (ret == NEGATIVE) continue;
2161  else if (ret != SUCCESS) return ret;
2162  }
2163  if (i%2 == 0){
2164  ret = map_self_indices(B, idx_B);
2165  if (ret == NEGATIVE) continue;
2166  else if (ret != SUCCESS) return ret;
2167  } else {
2168  ret = map_self_indices(A, idx_A);
2169  if (ret == NEGATIVE) continue;
2170  else if (ret != SUCCESS) return ret;
2171  }
2172 
2173  /* ret = map_symtsr(A->order, A->sym_table, A->edge_map);
2174  ret = map_symtsr(B->order, B->sym_table, B->edge_map);
2175  if (ret!=SUCCESS) return ret;
2176  return SUCCESS;*/
2177 
2178  #if DEBUG >= 4
2179  A->print_map(stdout,0);
2180  B->print_map(stdout,0);
2181  #endif
2182  if (!check_mapping()) continue;
2183  A->set_padding();
2184  B->set_padding();
2185  size = A->size + B->size;
2186 
2187  need_remap_A = 0;
2188  need_remap_B = 0;
2189 
2190  if (A->topo == old_topo_A){
2191  for (d=0; d<A->order; d++){
2192  if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
2193  need_remap_A = 1;
2194  }
2195  } else
2196  need_remap_A = 1;
2197  if (need_remap_A){
2198  if (!A->is_sparse && can_block_reshuffle(A->order, dA.phase, A->edge_map)){
2199  size += A->size*std::max(1.0,log2(wrld->cdt.np));
2200  } else {
2201  if (A->is_sparse){
2202  double nnz_frac_A = std::min(2,(int)A->calc_npe())*((double)A->nnz_tot)/(A->size*A->calc_npe());
2203  size += 25.*nnz_frac_A*A->size*std::max(1.0,log2(wrld->cdt.np));
2204  } else
2205  size += 5.*A->size*std::max(1.0,log2(wrld->cdt.np));
2206  }
2207  }
2208  if (B->topo == old_topo_B){
2209  for (d=0; d<B->order; d++){
2210  if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
2211  need_remap_B = 1;
2212  }
2213  } else
2214  need_remap_B = 1;
2215  if (need_remap_B){
2216  if (!B->is_sparse && can_block_reshuffle(B->order, dB.phase, B->edge_map)){
2217  size += B->size*std::max(1.0,log2(wrld->cdt.np));
2218  } else {
2219  double pref = 1.0;
2220  if (B->is_home)
2221  pref = 2.0;
2222  if (B->is_sparse){
2223  double nnz_frac_A = std::min(2,(int)A->calc_npe())*((double)A->nnz_tot)/(A->size*A->calc_npe());
2224  double nnz_frac_B = std::min(2,(int)B->calc_npe())*((double)B->nnz_tot)/(B->size*B->calc_npe());
2225  nnz_frac_B = std::max(nnz_frac_B, nnz_frac_A);
2226  size += 25.*pref*nnz_frac_B*B->size*std::max(1.0,log2(wrld->cdt.np));
2227  } else
2228  size += 5.*pref*B->size*std::max(1.0,log2(wrld->cdt.np));
2229  }
2230  }
2231 
2232  /*nvirt = (int64_t)calc_nvirt(A);
2233  tnvirt = nvirt*(int64_t)calc_nvirt(B);
2234  if (tnvirt < nvirt) nvirt = UINT64_MAX;
2235  else nvirt = tnvirt;
2236  if (btopo == -1 || nvirt < bnvirt ) {
2237  bnvirt = nvirt;
2238  btopo = i;
2239  }*/
2240  if (btopo == -1 || size < min_size){
2241  min_size = size;
2242  btopo = i;
2243  }
2244  }
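        // For each candidate, 'size' approximates the data volume moved: the padded sizes of A and B
        // plus an estimated redistribution cost (a block reshuffle costs ~size*log2(p), a general dense
        // redistribution ~5x that, and a sparse one ~25x scaled by the nonzero fraction); the candidate
        // minimizing this estimate is recorded in btopo/min_size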
2245  if (btopo == -1)
2246  min_size = INT64_MAX;
2247  /* pick lower dimensional mappings, if equivalent */
2248  gtopo = get_best_topo(min_size, btopo, wrld->cdt);
2249  TAU_FSTOP(map_tensor_pair);
2250  if (gtopo == -1){
2251  printf("CTF ERROR: Failed to map pair!\n");
2252  //ABORT;
2253  return ERROR;
2254  }
2255 
2256  A->clear_mapping();
2257  B->clear_mapping();
2258  A->set_padding();
2259  B->set_padding();
2260 
2261  A->topo = wrld->topovec[gtopo/2];
2262  B->topo = wrld->topovec[gtopo/2];
2263 
2264  A->is_mapped = 1;
2265  B->is_mapped = 1;
2266 
2267  if (gtopo%2 == 0){
2268  ret = map_self_indices(A, idx_A);
2269  ASSERT(ret == SUCCESS);
2270  } else {
2271  ret = map_self_indices(B, idx_B);
2272  ASSERT(ret == SUCCESS);
2273  }
2274  ret = map_sum_indices(A->topo);
2275  ASSERT(ret == SUCCESS);
2276  if (gtopo%2 == 0){
2277  ret = map_self_indices(A, idx_A);
2278  ASSERT(ret == SUCCESS);
2279  } else {
2280  ret = map_self_indices(B, idx_B);
2281  ASSERT(ret == SUCCESS);
2282  }
2283 
2284  if (gtopo%2 == 0){
2285  ret = map_self_indices(A, idx_A);
2286  ASSERT(ret == SUCCESS);
2287  ret = A->map_tensor_rem(A->topo->order,
2288  A->topo->dim_comm);
2289  ASSERT(ret == SUCCESS);
2290  copy_mapping(A->order, B->order,
2291  idx_A, A->edge_map,
2292  idx_B, B->edge_map,0);
2293  ret = B->map_tensor_rem(B->topo->order,
2294  B->topo->dim_comm);
2295  ASSERT(ret == SUCCESS);
2296  } else {
2297  ret = map_self_indices(B, idx_B);
2298  ASSERT(ret == SUCCESS);
2299  ret = B->map_tensor_rem(B->topo->order,
2300  B->topo->dim_comm);
2301  ASSERT(ret == SUCCESS);
2302  copy_mapping(B->order, A->order,
2303  idx_B, B->edge_map,
2304  idx_A, A->edge_map,0);
2305  ret = A->map_tensor_rem(A->topo->order,
2306  A->topo->dim_comm);
2307  ASSERT(ret == SUCCESS);
2308  }
2309  if (gtopo%2 == 0){
2310  ret = map_self_indices(B, idx_B);
2311  ASSERT(ret == SUCCESS);
2312  } else {
2313  ret = map_self_indices(A, idx_A);
2314  ASSERT(ret == SUCCESS);
2315  }
2316 
2317  ASSERT(check_mapping());
2318 
2319  A->set_padding();
2320  B->set_padding();
2321  #if DEBUG > 2
2322  if (wrld->cdt.rank == 0)
2323  printf("New mappings:\n");
2324  A->print_map(stdout);
2325  B->print_map(stdout);
2326  #endif
2327 
2328  TAU_FSTART(redistribute_for_sum);
2329 
2330  A->is_cyclic = 1;
2331  B->is_cyclic = 1;
2332  need_remap = 0;
2333  if (A->topo == old_topo_A){
2334  for (d=0; d<A->order; d++){
2335  if (!comp_dim_map(&A->edge_map[d],&old_map_A[d]))
2336  need_remap = 1;
2337  }
2338  } else
2339  need_remap = 1;
2340  if (need_remap){
2341  A->redistribute(dA);
2342  }
2343  need_remap = 0;
2344  if (B->topo == old_topo_B){
2345  for (d=0; d<B->order; d++){
2346  if (!comp_dim_map(&B->edge_map[d],&old_map_B[d]))
2347  need_remap = 1;
2348  }
2349  } else
2350  need_remap = 1;
2351  if (need_remap){
2352  B->redistribute(dB);
2353  }
2354 
2355  TAU_FSTOP(redistribute_for_sum);
2356  delete [] old_map_A;
2357  delete [] old_map_B;
2358 
2359  return SUCCESS;
2360  }
2361 
2362  void summation::print(){
2363  int i;
2364  //max = A->order+B->order;
2365 
2366  CommData global_comm = A->wrld->cdt;
2367  MPI_Barrier(global_comm.cm);
2368  if (global_comm.rank == 0){
2369  char sname[200];
2370  sname[0] = '\0';
2371  sprintf(sname, "%s", B->name);
2372  sprintf(sname+strlen(sname),"[");
2373  for (i=0; i<B->order; i++){
2374  if (i>0)
2375  sprintf(sname+strlen(sname)," %d",idx_B[i]);
2376  else
2377  sprintf(sname+strlen(sname),"%d",idx_B[i]);
2378  }
2379  sprintf(sname+strlen(sname),"] <- ");
2380  sprintf(sname+strlen(sname), "%s", A->name);
2381  sprintf(sname+strlen(sname),"[");
2382  for (i=0; i<A->order; i++){
2383  if (i>0)
2384  sprintf(sname+strlen(sname)," %d",idx_A[i]);
2385  else
2386  sprintf(sname+strlen(sname),"%d",idx_A[i]);
2387  }
2388  sprintf(sname+strlen(sname),"]");
2389  printf("CTF: Summation %s\n",sname);
2390 
2391 
2392 /* printf("Summing Tensor %s into %s\n", A->name, B->name);
2393  printf("alpha is ");
2394  if (alpha != NULL) A->sr->print(alpha);
2395  else printf("NULL");
2396  printf("\nbeta is ");
2397  if (beta != NULL) B->sr->print(beta);
2398  else printf("NULL");
2399  printf("\n");
2400  printf("Summation index table:\n");
2401  printf(" A B\n");
2402  for (i=0; i<max; i++){
2403  ex_A=0;
2404  ex_B=0;
2405  printf("%d: ",i);
2406  for (j=0; j<A->order; j++){
2407  if (idx_A[j] == i){
2408  ex_A++;
2409  if (A->sym[j] == SY)
2410  printf("%dY ",j);
2411  else if (A->sym[j] == SH)
2412  printf("%dH ",j);
2413  else if (A->sym[j] == AS)
2414  printf("%dS ",j);
2415  else
2416  printf("%d ",j);
2417  }
2418  }
2419  if (ex_A == 0)
2420  printf(" ");
2421  if (ex_A == 1)
2422  printf(" ");
2423  for (j=0; j<B->order; j++){
2424  if (idx_B[j] == i){
2425  ex_B=1;
2426  if (B->sym[j] == SY)
2427  printf("%dY ",j);
2428  else if (B->sym[j] == SH)
2429  printf("%dH ",j);
2430  else if (B->sym[j] == AS)
2431  printf("%dS ",j);
2432  else
2433  printf("%d ",j);
2434  }
2435  }
2436  printf("\n");
2437  if (ex_A + ex_B== 0) break;
2438  }*/
2439  }
2440  }
2441 
2442 
2443  void summation::sp_sum(){
2444  int64_t num_pair;
2445  char * mapped_data;
2446 
2447  bool is_idx_matched = true;
2448  if (A->order != B->order)
2449  is_idx_matched = false;
2450  else {
2451  for (int o=0; o<A->order; o++){
2452  if (idx_A[o] != idx_B[o]){
2453  is_idx_matched = false;
2454  }
2455  }
2456  }
2457 
2458 
2459  //read data from A
2460  A->read_local_nnz(&num_pair, &mapped_data);
2461 
2462  if (!is_idx_matched){
2463  int64_t lda_A[A->order];
2464  int64_t lda_B[B->order];
2465  lda_A[0] = 1;
2466  for (int o=1; o<A->order; o++){
2467  lda_A[o] = lda_A[o-1]*A->lens[o-1]; // stride of mode o is the product of the lengths of modes 0..o-1
2468  }
2469  lda_B[0] = 1;
2470  for (int o=1; o<B->order; o++){
2471  lda_B[o] = lda_B[o-1]*B->lens[o-1];
2472  }
2473  PairIterator pi(A->sr, mapped_data);
2474 #ifdef USE_OMP
2475  #pragma omp parallel for
2476 #endif
2477  for (int64_t i=0; i<num_pair; i++){ // num_pair is 64-bit, so use a 64-bit loop counter
2478  int64_t k = pi[i].k();
2479  int64_t k_new = 0;
2480  for (int o=0; o<A->order; o++){
2481  int64_t kpart = (k/lda_A[o])%A->lens[o];
2482  //FIXME: slow, but handles diagonal indexing, probably worth having separate versions
2483  for (int q=0; q<B->order; q++){
2484  if (idx_A[o] == idx_B[q]){
2485  k_new += kpart*lda_B[q];
2486  }
2487  }
2488  }
2489  ((int64_t*)(pi[i].ptr))[0] = k_new;
2490  }
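        // e.g. writing A["ij"] with lens {2,3} into B["ji"] with lens {3,2}: an A-pair with key
        // k = i + 2*j is re-encoded as k_new = j + 3*i, i.e. each coordinate kpart is extracted
        // with A's strides and re-packed with the stride of the matching mode of B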
2491 
2492  // when idx_A has indices that idx_B does not, we need to reduce over them; this can be done partially (locally) here once the pairs of A have been sorted by their new keys
2493  bool is_reduce = false;
2494  for (int oA=0; oA<A->order; oA++){
2495  bool inB = false;
2496  for (int oB=0; oB<B->order; oB++){
2497  if (idx_A[oA] == idx_B[oB]){
2498  inB = true;
2499  }
2500  }
2501  if (!inB) is_reduce = true;
2502  }
2503 
2504  if (is_reduce && num_pair > 0){
2505  pi.sort(num_pair);
2506  int64_t nuniq=1;
2507  for (int64_t i=1; i<num_pair; i++){
2508  if (pi[i].k() != pi[i-1].k()) nuniq++;
2509  }
2510  if (nuniq != num_pair){
2511  char * swap_data = mapped_data;
2512  alloc_ptr(A->sr->pair_size()*nuniq, (void**)&mapped_data);
2513  PairIterator pi_new(A->sr, mapped_data);
2514  int64_t cp_st = 0;
2515  int64_t acc_st = -1;
2516  int64_t pfx = 0;
2517  for (int64_t i=1; i<num_pair; i++){
2518  if (pi[i].k() == pi[i-1].k()){
2519  if (cp_st < i){
2520  memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(i-cp_st));
2521  pfx += i-cp_st;
2522  }
2523  cp_st = i+1;
2524 
2525  if (acc_st == -1) acc_st = i;
2526  } else {
2527  if (acc_st != -1){
2528  for (int64_t j=acc_st; j<i; j++){
2529  A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
2530  }
2531  }
2532  acc_st = -1;
2533  }
2534  }
2535  if (cp_st < num_pair)
2536  memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(num_pair-cp_st));
2537  if (acc_st != -1){
2538  for (int64_t j=acc_st; j<num_pair; j++){
2539  A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
2540  }
2541  }
2542  cdealloc(swap_data);
2543  num_pair = nuniq;
2544  }
2545  }
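        // e.g. for sorted keys {3,3,5}: the first pair with key 3 is copied into the compacted buffer,
        // the second is accumulated into it with sr->add, the pair with key 5 is copied next, and
        // num_pair drops from 3 to 2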
2546 
2547  // if applying a custom (non-accumulator) function, apply it immediately to the reduced data
2548  if (is_custom && !func->is_accumulator()){
2549  char * swap_data = mapped_data;
2550  alloc_ptr(B->sr->pair_size()*num_pair, (void**)&mapped_data);
2551  PairIterator pi_new(B->sr, mapped_data);
2552 #ifdef USE_OMP
2553  #pragma omp parallel for
2554 #endif
2555  for (int64_t i=0; i<num_pair; i++){
2556  if (alpha == NULL)
2557  func->apply_f(pi[i].d(), pi_new[i].d());
2558  else {
2559  char tmp_A[A->sr->el_size];
2560  A->sr->mul(pi[i].d(), alpha, tmp_A);
2561  func->apply_f(tmp_A, pi_new[i].d());
2562  }
2563  }
2564  cdealloc(swap_data);
2565  alpha = NULL;
2566  }
2567 
2568  // when idx_B has indices that idx_A does not, we need to broadcast, which we do by replicating the key-value pairs read from A along the unmatched modes of B
2569  // FIXME: this is probably not the most efficient approach, but not unreasonable, as the set of replicated pairs is not expected to be bigger than B
2570  int nmap_idx = 0;
2571  int64_t map_idx_len[B->order];
2572  int64_t map_idx_lda[B->order];
2573  int map_idx_rev[B->order];
2574  for (int oB=0; oB<B->order; oB++){
2575  bool inA = false;
2576  for (int oA=0; oA<A->order; oA++){
2577  if (idx_A[oA] == idx_B[oB]){
2578  inA = true;
2579  }
2580  }
2581  if (!inA){
2582  bool is_rep=false;
2583  for (int ooB=0; ooB<oB; ooB++){
2584  if (idx_B[ooB] == idx_B[oB]){
2585  is_rep = true;
2586  map_idx_lda[map_idx_rev[ooB]] += lda_B[oB];
2587  break;
2588  }
2589  }
2590  if (!is_rep){
2591  map_idx_len[nmap_idx] = B->lens[oB];
2592  map_idx_lda[nmap_idx] = lda_B[oB];
2593  map_idx_rev[nmap_idx] = oB;
2594  nmap_idx++;
2595  }
2596  }
2597  }
2598  if (nmap_idx > 0){
2599  int64_t tot_rep=1;
2600  for (int midx=0; midx<nmap_idx; midx++){
2601  tot_rep *= map_idx_len[midx];
2602  }
2603  char * swap_data = mapped_data;
2604  alloc_ptr(A->sr->pair_size()*num_pair*tot_rep, (void**)&mapped_data);
2605  PairIterator pi_new(A->sr, mapped_data);
2606 #ifdef USE_OMP
2607  #pragma omp parallel for
2608 #endif
2609  for (int64_t i=0; i<num_pair; i++){
2610  for (int64_t r=0; r<tot_rep; r++){
2611  memcpy(pi_new[i*tot_rep+r].ptr, pi[i].ptr, A->sr->pair_size());
2612  }
2613  }
2614 #ifdef USE_OMP
2615  #pragma omp parallel for
2616 #endif
2617  for (int64_t i=0; i<num_pair; i++){
2618  int64_t phase=1;
2619  for (int midx=0; midx<nmap_idx; midx++){
2620  int64_t stride=phase;
2621  phase *= map_idx_len[midx];
2622  for (int64_t r=0; r<tot_rep/phase; r++){
2623  for (int64_t m=1; m<map_idx_len[midx]; m++){
2624  for (int64_t s=0; s<stride; s++){
2625  ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
2626  }
2627  }
2628  }
2629  }
2630  }
2631  cdealloc(swap_data);
2632  num_pair *= tot_rep;
2633  }
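        // e.g. for B["ij"] += A["i"]: mode 'j' of B is unmatched, so each pair of A with key i is
        // replicated lens_j times and the r-th replica gets key i + r*lda_B[1], enumerating the
        // j coordinate of B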
2634  }
2635 
2636  B->write(num_pair, alpha, beta, mapped_data, 'w');
2637  cdealloc(mapped_data);
2638 
2639  }
2640 
2641 }
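For context, the internal routines in this file are normally reached through CTF's public interface rather than by constructing CTF_int::summation objects directly: indexed assignment on CTF::Tensor objects builds a summation of the form idx_B(B) = beta*idx_B(B) + alpha*idx_A(A) and eventually invokes summation::execute(). The following is a minimal user-level sketch, not part of summation.cxx itself; it assumes an MPI build of CTF with the public ctf.hpp header, and the tensor sizes are arbitrary illustrative values.

#include <mpi.h>
#include <ctf.hpp>

int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  {
    CTF::World dw(MPI_COMM_WORLD);            // processor world on which the tensors live
    int lens[2] = {8, 8};
    int syms[2] = {NS, NS};                   // nonsymmetric modes
    CTF::Tensor<double> A(2, lens, syms, dw); // distributed 8x8 tensors, zero-initialized
    CTF::Tensor<double> B(2, lens, syms, dw);

    // transpose-accumulate: a summation with idx_A = "ji", idx_B = "ij", alpha = 0.5, beta = 1
    B["ij"] += 0.5 * A["ji"];

    // reduction over the second mode of A: idx_A = "ij", idx_B = "i"
    CTF::Vector<double> v(8, dw);
    v["i"] += A["ij"];
  }
  MPI_Finalize();
  return 0;
}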
Definition: common.cxx:50