#include "../redistribution/nosym_transp.h"
#include "../scaling/strp_tsr.h"
#include "../mapping/mapping.h"
#include "../mapping/distribution.h"
#include "../tensor/untyped_tensor.h"
#include "../shared/util.h"
#include "../shared/memcontrol.h"
// ...
#include "../symmetry/sym_indices.h"
#include "../symmetry/symmetrization.h"
#include "../redistribution/nosym_transp.h"
#include "../redistribution/redist.h"
#include "../sparse_formats/coo.h"
#include "../sparse_formats/csr.h"

// ...
  memcpy(idx_A, other.idx_A, sizeof(int)*other.A->order);
  // ...
  memcpy(idx_B, other.idx_B, sizeof(int)*other.B->order);
  // ...
  memcpy(idx_C, other.idx_C, sizeof(int)*other.C->order);
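// The contraction constructors record the operands and index maps of
// C[idx_C] = beta*C[idx_C] + alpha*A[idx_A]*B[idx_B]: each allocates
// idx_A/idx_B/idx_C buffers of length equal to the corresponding tensor's
// order and copies the caller-provided indices, given either as int arrays
// or (via conv_idx) as character labels.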
  if (func_ == NULL) is_custom = 0;
  // ...
  idx_A = (int*)alloc(sizeof(int)*A->order);
  idx_B = (int*)alloc(sizeof(int)*B->order);
  idx_C = (int*)alloc(sizeof(int)*C->order);
  memcpy(idx_A, idx_A_, sizeof(int)*A->order);
  memcpy(idx_B, idx_B_, sizeof(int)*B->order);
  memcpy(idx_C, idx_C_, sizeof(int)*C->order);
  // ...

  if (func_ == NULL) is_custom = 0;
  // ...
  conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B, C->order, cidx_C, &idx_C);
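// execute() is the top-level entry point: it prints a header when DEBUG or
// VERBOSE is enabled and then delegates to home_contract(), reporting an
// error if the contraction could not be performed. A minimal usage sketch
// (assuming A, B, C are tensor pointers and alpha/beta are scalar buffers
// for the tensors' algebraic structure):
//
//   contraction ctr(A, idx_A, B, idx_B, alpha, C, idx_C, beta);
//   ctr.execute();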
void contraction::execute(){
#if (DEBUG >= 2 || VERBOSE >= 1)
  // ...
  if (A->wrld->cdt.rank == 0) printf("Contraction::execute (head):\n");
  // ...
#endif
  // ...
  int stat = home_contract();
  if (stat != SUCCESS)
    printf("CTF ERROR: Failed to perform contraction\n");
}
template<typename ptype>
void get_perm(int perm_order, ptype A, ptype B, ptype C, ptype & tA, ptype & tB, ptype & tC){
  // ...
}

// ... (equality test of this contraction against another contraction, os)
  if (this->A != os.A) return 0;
  if (this->B != os.B) return 0;
  if (this->C != os.C) return 0;
  // ...
  for (int i=0; i<A->order; i++){
    if (idx_A[i] != os.idx_A[i]) return 0;
  }
  for (int i=0; i<B->order; i++){
    if (idx_B[i] != os.idx_B[i]) return 0;
  }
  for (int i=0; i<C->order; i++){
    if (idx_C[i] != os.idx_C[i]) return 0;
  }
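// calc_fold_lnmk (body below; the head is reconstructed from its call site
// in select_ctr_perm): given the folded operands and the DGEMM index
// orderings, it counts how many indices are weigh (present in A, B, and C),
// contracted (A and B only), or uncontracted on each side, and from these
// counts presumably derives the l/n/m/k parameters of the equivalent GEMM
// stored in the iparam output.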
void calc_fold_lnmk(tensor const * A,
                    tensor const * B,
                    tensor const * C,
                    int const *    idx_A,
                    int const *    idx_B,
                    int const *    idx_C,
                    int const *    ordering_A,
                    int const *    ordering_B,
                    iparam *       inner_prm){
  int i, num_tot, num_ctr, num_no_ctr_A, num_no_ctr_B, num_weigh;
  // ...
  num_ctr = 0, num_no_ctr_A = 0, num_no_ctr_B = 0, num_weigh = 0;
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
      num_weigh++;
    } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
      num_ctr++;
    } else if (idx_arr[3*i] != -1){
      num_no_ctr_A++;
    } else if (idx_arr[3*i+1] != -1){
      num_no_ctr_B++;
    }
  }
  // ...
  for (i=0; i<A->order; i++){
    if (i >= num_ctr+num_no_ctr_A){
      // ...
    } else if (i >= num_ctr)
      // ...
  }
  // ...
  for (i=0; i<B->order; i++){
    if (i >= num_ctr && i < num_ctr + num_no_ctr_B)
      // ...
  }
  // ...
}
bool contraction::is_sparse(){
  return A->is_sparse || B->is_sparse || C->is_sparse;
}
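// get_fold_indices computes the set of indices that can be folded
// (linearized) into a single dimension for the inner matrix-multiply call.
// An index is rejected ("broken") if, in any tensor that carries it, the
// symmetry flags (sym) or the relative positions of adjacent indices differ
// between operands: a foldable run must be contiguous, order-preserving,
// and carry matching symmetry in A, B, and C.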
void contraction::get_fold_indices(int *  num_fold,
                                   // ...
  int i, in, num_tot, nfold, broken;
  int iA, iB, iC, inA, inB, inC, iiA, iiB, iiC;
  int * idx_arr, * idx;
  // ...
  for (i=0; i<num_tot; i++){
    // ...
  }
  for (iA=0; iA<A->order; iA++){
    // ...
    do {
      // ...
      inB = idx_arr[3*in+1];
      inC = idx_arr[3*in+2];
      if ((iA>=0) + (iB>=0) + (iC>=0) == 2){
        if (((inA>=0) + (inB>=0) + (inC>=0) != 2) ||
            ((inB == -1) ^ (iB == -1)) ||
            ((inC == -1) ^ (iC == -1)) ||
            (iB != -1 && inB - iB != in-i) ||
            (iC != -1 && inC - iC != in-i) ||
            (iB != -1 && A->sym[inA] != B->sym[inB]) ||
            (iC != -1 && A->sym[inA] != C->sym[inC])){
          // ...
        }
      } else if ((iA>=0) + (iB>=0) + (iC>=0) != 3){
        if ((inA>=0) + (inB>=0) + (inC>=0) != 3 ||
            A->sym[inA] != B->sym[inB] ||
            A->sym[inA] != C->sym[inC]){
          // ...
        }
      } else {
        if (((inA>=0) + (inB>=0) + (inC>=0) != 3) ||
            ((inB == -1) ^ (iB == -1)) ||
            ((inC == -1) ^ (iC == -1)) ||
            ((inA == -1) ^ (iA == -1)) ||
            (inB - iB != in-i) ||
            (inC - iC != in-i) ||
            (inA - iA != in-i) ||
            (A->sym[inA] != B->sym[inB]) ||
            (B->sym[inB] != C->sym[inC]) ||
            (A->sym[inA] != C->sym[inC])){
          // ...
        }
      }
      // ...
    } while (A->sym[inA-1] != NS);
    for (iiA=iA; iiA<inA; iiA++){
      // ...
    }
  }
  // ...
  for (iC=0; iC<C->order; iC++){
    // ...
    do {
      // ...
      inA = idx_arr[3*in+0];
      inB = idx_arr[3*in+1];
      if (((inC>=0) + (inA>=0) + (inB>=0) == 1) ||
          ((inA == -1) ^ (iA == -1)) ||
          ((inB == -1) ^ (iB == -1)) ||
          (iA != -1 && inA - iA != in-i) ||
          (iB != -1 && inB - iB != in-i) ||
          (iA != -1 && C->sym[inC] != A->sym[inA]) ||
          (iB != -1 && C->sym[inC] != B->sym[inB])){
        // ...
      }
      // ...
    } while (C->sym[inC-1] != NS);
    for (iiC=iC; iiC<inC; iiC++){
      // ...
    }
  }
  // ...
  for (iB=0; iB<B->order; iB++){
    // ...
    do {
      // ...
      inC = idx_arr[3*in+2];
      inA = idx_arr[3*in+0];
      if (((inB>=0) + (inC>=0) + (inA>=0) == 1) ||
          ((inC == -1) ^ (iC == -1)) ||
          ((inA == -1) ^ (iA == -1)) ||
          (iC != -1 && inC - iC != in-i) ||
          (iA != -1 && inA - iA != in-i) ||
          (iC != -1 && B->sym[inB] != C->sym[inC]) ||
          (iA != -1 && B->sym[inB] != A->sym[inA])){
        // ...
      }
      // ...
    } while (B->sym[inB-1] != NS);
    for (iiB=iB; iiB<inB; iiB++){
      // ...
    }
  }
  // ...
  for (i=0; i<num_tot; i++){
    // ...
  }
  // ...
}
int contraction::can_fold(){
  int nfold, * fold_idx, i, j;
  if (!is_sparse() && is_custom) return 0;
  for (i=0; i<A->order; i++){
    for (j=i+1; j<A->order; j++){
      if (idx_A[i] == idx_A[j]) return 0;
    }
  }
  for (i=0; i<B->order; i++){
    for (j=i+1; j<B->order; j++){
      if (idx_B[i] == idx_B[j]) return 0;
    }
  }
  for (i=0; i<C->order; i++){
    for (j=i+1; j<C->order; j++){
      if (idx_C[i] == idx_C[j]) return 0;
    }
  }
  get_fold_indices(&nfold, &fold_idx);
  // ...
  if ((A->order+B->order+C->order)%2 == 1 ||
      (A->order+B->order+C->order)/2 < nfold){
    // ...
  }
  // ...
}
// ...
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
      // ...
    }
  }
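// get_len_ordering: find an ordering of the indices of the tensors that
// reduces the contraction to DGEMM (A, B, and C may be permuted). Indices
// are grouped per tensor in the order: contracted, uncontracted, weigh
// (present in all three operands), self (present in only one operand).
// For example, in C["ij"] = A["ik"]*B["kj"], "k" is contracted and "i"/"j"
// are uncontracted; an index appearing in all three operands (as "j" does
// in C["ij"] = A["ij"]*B["j"]) is a weigh (Hadamard) index.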
void get_len_ordering(tensor const * A,
                      tensor const * B,
                      tensor const * C,
                      int const *    idx_A,
                      int const *    idx_B,
                      int const *    idx_C,
                      int **         new_ordering_A,
                      int **         new_ordering_B,
                      int **         new_ordering_C){
  int i, num_tot, num_ctr, num_no_ctr_A, num_no_ctr_B, num_weigh;
  int idx_no_ctr_A, idx_no_ctr_B, idx_ctr, idx_weigh;
  int idx_self_C, idx_self_A, idx_self_B;
  int num_self_C, num_self_A, num_self_B;
  int * ordering_A, * ordering_B, * ordering_C, * idx_arr;
  // ...
  num_no_ctr_B = 0, num_ctr = 0, num_no_ctr_A = 0, num_weigh = 0;
  num_self_A = 0, num_self_B = 0, num_self_C = 0;
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
      num_weigh++;
    } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
      num_ctr++;
    } else if (idx_arr[3*i] != -1 && idx_arr[3*i+2] != -1){
      num_no_ctr_A++;
    } else if (idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
      num_no_ctr_B++;
    } else if (idx_arr[3*i] != -1){
      num_self_A++;
    } else if (idx_arr[3*i+1] != -1){
      num_self_B++;
    } else if (idx_arr[3*i+2] != -1){
      num_self_C++;
    }
  }
  // ...
  idx_self_A = 0, idx_self_B = 0, idx_self_C = 0;
  // ...
  idx_ctr = 0, idx_no_ctr_A = 0, idx_no_ctr_B = 0, idx_weigh = 0;
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
      ordering_A[idx_weigh+num_no_ctr_A+num_ctr] = idx_arr[3*i];
      ordering_B[idx_weigh+num_no_ctr_B+num_ctr] = idx_arr[3*i+1];
      ordering_C[idx_weigh+num_no_ctr_A+num_no_ctr_B] = idx_arr[3*i+2];
      idx_weigh++;
    } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
      ordering_A[idx_ctr] = idx_arr[3*i];
      ordering_B[idx_ctr] = idx_arr[3*i+1];
      idx_ctr++;
    } else {
      if (idx_arr[3*i] != -1 && idx_arr[3*i+2] != -1){
        ordering_A[num_ctr+idx_no_ctr_A] = idx_arr[3*i];
        ordering_C[idx_no_ctr_A] = idx_arr[3*i+2];
        idx_no_ctr_A++;
      } else if (idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
        ordering_B[num_ctr+idx_no_ctr_B] = idx_arr[3*i+1];
        ordering_C[num_no_ctr_A+idx_no_ctr_B] = idx_arr[3*i+2];
        idx_no_ctr_B++;
      } else if (idx_arr[3*i] != -1){
        // ...
        ordering_A[num_ctr+num_no_ctr_A+num_weigh+idx_self_A] = idx_arr[3*i];
      } else if (idx_arr[3*i+1] != -1){
        // ...
        ordering_B[num_ctr+num_no_ctr_B+num_weigh+idx_self_B] = idx_arr[3*i+1];
      } else if (idx_arr[3*i+2] != -1){
        // ...
        ordering_C[num_no_ctr_A+num_no_ctr_B+num_weigh+idx_self_C] = idx_arr[3*i+2];
      }
    }
  }
  // ...
  *new_ordering_A = ordering_A;
  *new_ordering_B = ordering_B;
  *new_ordering_C = ordering_C;
  // ...
}
void contraction::get_fold_ctr(contraction *& fold_ctr,
                               // ...
  int * fold_idx, * fidx_A, * fidx_B, * fidx_C;
  // ...
  get_fold_indices(&nfold, &fold_idx);
  // ...
  A->fold(nfold, fold_idx, idx_A,
          &all_fdim_A, &all_flen_A);
  B->fold(nfold, fold_idx, idx_B,
          &all_fdim_B, &all_flen_B);
  C->fold(nfold, fold_idx, idx_C,
          &all_fdim_C, &all_flen_C);
  // ...
  for (i=0; i<A->order; i++){
    for (j=0; j<nfold; j++){
      if (A->sym[i] == NS && idx_A[i] == fold_idx[j]){
        // ...
      }
    }
  }
  for (i=0; i<B->order; i++){
    for (j=0; j<nfold; j++){
      if (B->sym[i] == NS && idx_B[i] == fold_idx[j]){
        // ...
      }
    }
  }
  for (i=0; i<C->order; i++){
    for (j=0; j<nfold; j++){
      if (C->sym[i] == NS && idx_C[i] == fold_idx[j]){
        // ...
      }
    }
  }
  // ...
  int * sidx_A, * sidx_B, * sidx_C;
  conv_idx(fA->order, fidx_A, &sidx_A,
           fB->order, fidx_B, &sidx_B,
           fC->order, fidx_C, &sidx_C);
  // ...
  fold_ctr = new contraction(fA, sidx_A, fB, sidx_B, alpha, fC, sidx_C, beta, func);
  // ...
}
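// select_ctr_perm evaluates candidate permutations of (A, B, C) for the
// folded contraction: for each permutation it recomputes the DGEMM index
// ordering, then estimates the transpose time needed to bring each
// operand's folded dimensions into that ordering (scaled by the nonzero
// fraction for sparse operands) and records the cheapest permutation in
// bperm_order/btime. Only 3 orderings are considered when a custom,
// noncommutative elementwise function fixes the order of A and B.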
void contraction::select_ctr_perm(contraction const * fold_ctr,
                                  // ...
                                  int const *         all_flen_A,
                                  int const *         all_flen_B,
                                  int const *         all_flen_C,
                                  // ...
  int tall_fdim_A, tall_fdim_B, tall_fdim_C;
  int const * tall_flen_A, * tall_flen_B, * tall_flen_C;
  // ...
  tensor * tfA, * tfB, * tfC;
  int * tidx_A, * tidx_B, * tidx_C;
  int * tfnew_ord_A, * tfnew_ord_B, * tfnew_ord_C;
  int * tAiord, * tBiord, * tCiord;
  // ...
  if (is_custom && !func->commutative) max_ord = 3;
  // ...
  for (int iord=0; iord<max_ord; iord++){
    get_perm<tensor*>(iord, A, B, C,
                      tA, tB, tC);
    get_perm<tensor*>(iord, fold_ctr->A, fold_ctr->B, fold_ctr->C,
                      tfA, tfB, tfC);
    get_perm<int*>(iord, fold_ctr->idx_A, fold_ctr->idx_B, fold_ctr->idx_C,
                   tidx_A, tidx_B, tidx_C);
    get_perm<int const*>(iord, all_flen_A, all_flen_B, all_flen_C,
                         tall_flen_A, tall_flen_B, tall_flen_C);
    get_perm<int>(iord, all_fdim_A, all_fdim_B, all_fdim_C,
                  tall_fdim_A, tall_fdim_B, tall_fdim_C);
    get_len_ordering(tfA, tfB, tfC, tidx_A, tidx_B, tidx_C,
                     &tfnew_ord_A, &tfnew_ord_B, &tfnew_ord_C);
    // ...
    calc_fold_lnmk(tfA, tfB, tfC, tidx_A, tidx_B, tidx_C, tfnew_ord_A, tfnew_ord_B, &iprm);
    // ...
    memcpy(tAiord, tA->inner_ordering, tall_fdim_A*sizeof(int));
    memcpy(tBiord, tB->inner_ordering, tall_fdim_B*sizeof(int));
    // ...
    double time_est = 0.0;
    if (tA->is_sparse)
      time_est += tA->nnz_tot/(tA->size*tA->calc_npe())*tA->calc_nvirt()*est_time_transp(tall_fdim_A, tAiord, tall_flen_A, 1, tA->sr);
    else
      time_est += tA->calc_nvirt()*est_time_transp(tall_fdim_A, tAiord, tall_flen_A, 1, tA->sr);
    if (tB->is_sparse)
      time_est += tB->nnz_tot/(tB->size*tB->calc_npe())*tB->calc_nvirt()*est_time_transp(tall_fdim_B, tBiord, tall_flen_B, 1, tB->sr);
    else
      time_est += tB->calc_nvirt()*est_time_transp(tall_fdim_B, tBiord, tall_flen_B, 1, tB->sr);
    // ...
    if (time_est <= btime){
      // ...
    }
    // ...
    if (time_est <= btime){
      // ...
    }
    // ...
  }
  // ...
  switch (bperm_order){
    // ...
  }
  // ...
}
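// map_fold folds the contraction into matrix-multiply form: it builds the
// folded sub-contraction (get_fold_ctr), picks the cheapest operand
// permutation (select_ctr_perm), and, for sparse operands, flattens each
// tensor into a 2D matrix layout (spmatricize) in either COO or CSR format;
// when do_transp is set the dense operands are presumably also transposed
// into the selected inner ordering. Returns the GEMM parameters (iparam)
// of the folded contraction.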
iparam contraction::map_fold(bool do_transp){
  int all_fdim_A, all_fdim_B, all_fdim_C;
  int * fnew_ord_A, * fnew_ord_B, * fnew_ord_C;
  int * all_flen_A, * all_flen_B, * all_flen_C;
  // ...
  int bperm_order = -1;
  double btime = DBL_MAX;
  int tall_fdim_A, tall_fdim_B, tall_fdim_C;
  // ...
  tensor * tfA, * tfB, * tfC;
  int * tidx_A, * tidx_B, * tidx_C;
  // ...
  get_fold_ctr(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
               all_flen_A, all_flen_B, all_flen_C);
  // ...
  CommData global_comm = A->wrld->cdt;
  if (global_comm.rank == 0){
    printf("Folded contraction type:\n");
    // ...
  }
  // ...
  select_ctr_perm(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
                  all_flen_A, all_flen_B, all_flen_C,
                  bperm_order, btime, iprm);
  // ...
  get_perm<tensor*>(bperm_order, A, B, C,
                    tA, tB, tC);
  get_perm<tensor*>(bperm_order, fold_ctr->A, fold_ctr->B, fold_ctr->C,
                    tfA, tfB, tfC);
  get_perm<int*>(bperm_order, fold_ctr->idx_A, fold_ctr->idx_B, fold_ctr->idx_C,
                 tidx_A, tidx_B, tidx_C);
  get_perm<int>(bperm_order, all_fdim_A, all_fdim_B, all_fdim_C,
                tall_fdim_A, tall_fdim_B, tall_fdim_C);
  // ...
  get_len_ordering(tfA, tfB, tfC, tidx_A, tidx_B, tidx_C,
                   &fnew_ord_A, &fnew_ord_B, &fnew_ord_C);
  // ...
  bool csr_or_coo = B->is_sparse || C->is_sparse || is_custom || !A->sr->has_coo_ker;
  // ...
  int nrow_idx = 0;
  for (int i=0; i<A->order; i++){
    for (int j=0; j<C->order; j++){
      if (idx_A[i] == idx_C[j]) nrow_idx++;
    }
  }
  // ...
  A->spmatricize(iprm.m, iprm.k, nrow_idx, csr_or_coo);
  // ...
  nrow_idx = 0;
  for (int i=0; i<B->order; i++){
    for (int j=0; j<A->order; j++){
      if (idx_B[i] == idx_A[j]) nrow_idx++;
    }
  }
  // ...
  B->spmatricize(iprm.k, iprm.n, nrow_idx, csr_or_coo);
  // ...
  nrow_idx = 0;
  for (int i=0; i<C->order; i++){
    for (int j=0; j<A->order; j++){
      if (idx_C[i] == idx_A[j]) nrow_idx++;
    }
  }
  // ...
  C->spmatricize(iprm.m, iprm.n, nrow_idx, csr_or_coo);
  C->sr->dealloc(C->data);
  // ...
}
double contraction::est_time_fold(){
  int all_fdim_A, all_fdim_B, all_fdim_C;
  int * all_flen_A, * all_flen_B, * all_flen_C;
  // ...
  int bperm_order = -1;
  double btime = DBL_MAX;
  // ...
  get_fold_ctr(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
               all_flen_A, all_flen_B, all_flen_C);
  // ...
  select_ctr_perm(fold_ctr, all_fdim_A, all_fdim_B, all_fdim_C,
                  all_flen_A, all_flen_B, all_flen_C,
                  bperm_order, btime, iprm);
  // ...
}
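// unfold_broken_sym detects symmetries of A, B, or C that this index map
// does not preserve (e.g. a symmetric index pair of which only one index is
// contracted, or whose partner maps to indices with a different symmetry
// relation in the other operands). When new_contraction is non-NULL it also
// constructs a copy of the contraction on tensors with the broken
// symmetries unpacked; it returns -1 when no symmetry is broken.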
int contraction::unfold_broken_sym(contraction ** new_contraction){
  int i, num_tot, iA, iB, iC;
  // ...
  if (new_contraction != NULL){
    // ...
    nctr = new contraction(nA, idx_A, nB, idx_B, alpha, nC, idx_C, beta, func);
    *new_contraction = nctr;
    // ...
  }
  // ...
  int nA_sym[A->order];
  if (new_contraction != NULL)
    memcpy(nA_sym, nA->sym, sizeof(int)*nA->order);
  for (i=0; i<A->order; i++){
    if (A->sym[i] != NS){
      iA = idx_A[i];
      if (idx_arr[3*iA+1] != -1){
        if (B->sym[idx_arr[3*iA+1]] != A->sym[i] ||
            idx_A[i+1] != idx_B[idx_arr[3*iA+1]+1]){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*idx_A[i+1]+1] != -1){
        if (new_contraction != NULL){
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*iA+2] != -1){
        if (C->sym[idx_arr[3*iA+2]] != A->sym[i] ||
            idx_A[i+1] != idx_C[idx_arr[3*iA+2]+1]){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*idx_A[i+1]+2] != -1){
        if (new_contraction != NULL){
          // ...
        }
        // ...
      }
      // ...
    }
  }
  // ...
  int nB_sym[B->order];
  if (new_contraction != NULL)
    memcpy(nB_sym, nB->sym, sizeof(int)*nB->order);
  for (i=0; i<B->order; i++){
    if (B->sym[i] != NS){
      iB = idx_B[i];
      if (idx_arr[3*iB+0] != -1){
        if (A->sym[idx_arr[3*iB+0]] != B->sym[i] ||
            idx_B[i+1] != idx_A[idx_arr[3*iB+0]+1]){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*idx_B[i+1]+0] != -1){
        if (new_contraction != NULL){
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*iB+2] != -1){
        if (C->sym[idx_arr[3*iB+2]] != B->sym[i] ||
            idx_B[i+1] != idx_C[idx_arr[3*iB+2]+1]){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*idx_B[i+1]+2] != -1){
        if (new_contraction != NULL){
          // ...
        }
        // ...
      }
      // ...
    }
  }
  // ...
  bool is_preserv = true;
  if (A != B) is_preserv = false;
  // ...
  for (int j=0; j<A->order; j++){
    if (idx_A[j] != idx_B[j]){
      // ...
      if (idx_arr[3*iA+2] == -1 || idx_arr[3*iB+2] == -1) is_preserv = false;
      // ...
      for (int k=MIN(idx_arr[3*iA+2],idx_arr[3*iB+2]);
           k<MAX(idx_arr[3*iA+2],idx_arr[3*iB+2]);
           k++){
        if (C->sym[k] != SY) is_preserv = false;
      }
      // ...
    }
  }
  // ...
  int nC_sym[C->order];
  if (new_contraction != NULL)
    memcpy(nC_sym, nC->sym, sizeof(int)*nC->order);
  for (i=0; i<C->order; i++){
    if (C->sym[i] != NS){
      iC = idx_C[i];
      if (idx_arr[3*iC+1] != -1){
        if (B->sym[idx_arr[3*iC+1]] != C->sym[i] ||
            idx_C[i+1] != idx_B[idx_arr[3*iC+1]+1]){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      } else if (idx_arr[3*idx_C[i+1]+1] != -1){
        if (new_contraction != NULL){
          // ...
        }
        // ...
      }
      // ...
      if (idx_arr[3*iC+0] != -1){
        if (A->sym[idx_arr[3*iC+0]] != C->sym[i] ||
            idx_C[i+1] != idx_A[idx_arr[3*iC+0]+1]){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      } else if (idx_arr[3*iC+0] == -1){
        if (idx_arr[3*idx_C[i+1]] != -1){
          if (new_contraction != NULL){
            // ...
          }
          // ...
        }
        // ...
      }
      // ...
    }
  }
  // ...
}
bool contraction::check_consistency(){
  int i, num_tot, len;
  // ...
       &num_tot, &idx_arr);
  // ...
  for (i=0; i<num_tot; i++){
    // ...
    iA = idx_arr[3*i+0];
    iB = idx_arr[3*i+1];
    iC = idx_arr[3*i+2];
    // ...
    if (len != -1 && iB != -1 && len != B->lens[iB]){
      if (A->wrld->cdt.rank == 0){
        printf("Error in contraction call: The %dth edge length of tensor %s does not",
               /* ... */);
        printf("match the %dth edge length of tensor %s.\n",
               /* ... */);
      }
      // ...
    }
    // ...
    if (len != -1 && iC != -1 && len != C->lens[iC]){
      if (A->wrld->cdt.rank == 0){
        printf("Error in contraction call: The %dth edge length of tensor %s (%d) does not",
               /* ... */);
        printf("match the %dth edge length of tensor %s (%d).\n",
               iC, C->name, C->lens[iC]);
      }
      // ...
    }
    // ...
    if (len != -1 && iC != -1 && len != C->lens[iC]){
      if (A->wrld->cdt.rank == 0){
        printf("Error in contraction call: The %dth edge length of tensor %s does not",
               /* ... */);
        printf("match the %dth edge length of tensor %s.\n",
               /* ... */);
      }
      // ...
    }
  }
  // ...
}
int contraction::check_mapping(){
  int num_tot, i, ph_A, ph_B, iA, iB, iC, pass, order, topo_order;
  // ...
  int * phys_mismatched, * phys_mapped;
  // ...
  if (A->is_mapped == 0) pass = 0;
  if (B->is_mapped == 0) pass = 0;
  if (C->is_mapped == 0) pass = 0;
  // ...
  if (A->is_folded == 1) pass = 0;
  if (B->is_folded == 1) pass = 0;
  if (C->is_folded == 1) pass = 0;
  // ...
  DPRINTF(3, "failed confirmation here\n");
  // ...
  if (A->topo != B->topo) pass = 0;
  if (A->topo != C->topo) pass = 0;
  // ...
  DPRINTF(3, "failed confirmation here\n");
  // ...
  topo_order = A->topo->order;
  // ...
  memset(phys_mismatched, 0, sizeof(int)*topo_order);
  memset(phys_mapped, 0, sizeof(int)*topo_order);
  // ...
       &num_tot, &idx_arr);
  // ...
  DPRINTF(3, "failed confirmation here\n");
  // ...
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i+0] != -1 &&
        idx_arr[3*i+1] != -1 &&
        idx_arr[3*i+2] != -1){
      iA = idx_arr[3*i+0];
      iB = idx_arr[3*i+1];
      iC = idx_arr[3*i+2];
      // ...
      if (0 == comp_dim_map(&B->edge_map[iB], &A->edge_map[iA]) ||
          0 == comp_dim_map(&B->edge_map[iB], &C->edge_map[iC])){
        DPRINTF(3, "failed confirmation here %d %d %d\n", iA, iB, iC);
        // ...
      }
      // ...
      map = &A->edge_map[iA];
      // ...
      if (phys_mapped[map->cdt] == 1){
        DPRINTF(3, "failed confirmation here %d\n", iA);
        // ...
      }
      // ...
      phys_mapped[map->cdt] = 1;
      phys_mismatched[map->cdt] = 1;
      // ...
    }
  }
  // ...
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i+0] == -1 ||
        idx_arr[3*i+1] == -1 ||
        idx_arr[3*i+2] == -1){
      for (order=0; order<3; order++){
        // ...
        iA = idx_arr[3*i+0];
        iB = idx_arr[3*i+1];
        iC = idx_arr[3*i+2];
        // ...
        iA = idx_arr[3*i+0];
        iB = idx_arr[3*i+2];
        iC = idx_arr[3*i+1];
        // ...
        iA = idx_arr[3*i+2];
        iB = idx_arr[3*i+1];
        iC = idx_arr[3*i+0];
        // ...
        if (phys_mapped[map->cdt] == 1){
          DPRINTF(3, "failed confirmation here %d\n", iA);
          // ...
        }
        // ...
        phys_mapped[map->cdt] = 1;
        // ...
      } else if (iA == -1){
        // ...
        if (phys_mapped[map->cdt] == 1){
          DPRINTF(3, "failed confirmation here %d\n", iA);
          // ...
        }
        // ...
        phys_mapped[map->cdt] = 1;
        // ...
      }
      // ...
      DPRINTF(3, "failed confirmation here iA=%d iB=%d\n", iA, iB);
      // ...
      if (phys_mapped[map->cdt] == 1){
        DPRINTF(3, "failed confirmation here %d\n", iB);
        // ...
      }
      // ...
      phys_mapped[map->cdt] = 1;
      // ...
      DPRINTF(3, "failed confirmation here %d, matched and folded physical mapping not allowed\n", iB);
      // ...
      if (phys_mismatched[map->cdt] == 1){
        DPRINTF(3, "failed confirmation here i=%d iA=%d iB=%d\n", i, iA, iB);
        // ...
      }
      // ...
      phys_mismatched[map->cdt] = 1;
      // ...
      if (phys_mismatched[map->cdt] == 1){
        DPRINTF(3, "failed confirmation here i=%d iA=%d iB=%d\n", i, iA, iB);
        // ...
      }
      // ...
      phys_mismatched[map->cdt] = 1;
      // ...
    }
  }
  // ...
  for (i=0; i<topo_order; i++){
    if (phys_mismatched[i] == 1 && phys_mapped[i] == 0){
      DPRINTF(3, "failed confirmation here i=%d\n", i);
      // ...
    }
  }
  // ...
}
static int map_weigh_indices(int const * idx_arr,
                             int const * idx_weigh,
                             // ...
  int tsr_order, iweigh, iA, iB, iC, i, j, k, jX, stat, num_sub_phys_dims;
  int * tsr_edge_len, * tsr_sym_table, * restricted, * comm_idx;
  // ...
  tsr_order = num_weigh;
  // ...
  for (i=0; i<num_weigh; i++){
    iweigh = idx_weigh[i];
    iA = idx_arr[iweigh*3+0];
    iB = idx_arr[iweigh*3+1];
    iC = idx_arr[iweigh*3+2];
    // ...
  }
  // ...
  memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
  memset(restricted, 0, tsr_order*sizeof(int));
  // ...
       num_sub_phys_dims, &sub_phys_comm, &comm_idx);
  // ...
  for (i=0; i<tsr_order; i++){
    // ...
    weigh_map[i].np = 1;
    // ...
  }
  for (i=0; i<num_weigh; i++){
    iweigh = idx_weigh[i];
    iA = idx_arr[iweigh*3+0];
    iB = idx_arr[iweigh*3+1];
    iC = idx_arr[iweigh*3+2];
    // ...
    for (j=i+1; j<num_weigh; j++){
      jX = idx_arr[idx_weigh[j]*3+0];
      // ...
      for (k=MIN(iA,jX); k<MAX(iA,jX); k++){
        if (A->sym[k] == NS)
          // ...
      }
      // ...
      tsr_sym_table[i*tsr_order+j] = 1;
      tsr_sym_table[j*tsr_order+i] = 1;
      // ...
      jX = idx_arr[idx_weigh[j]*3+1];
      // ...
      for (k=MIN(iB,jX); k<MAX(iB,jX); k++){
        if (B->sym[k] == NS)
          // ...
      }
      // ...
      tsr_sym_table[i*tsr_order+j] = 1;
      tsr_sym_table[j*tsr_order+i] = 1;
      // ...
      jX = idx_arr[idx_weigh[j]*3+2];
      // ...
      for (k=MIN(iC,jX); k<MAX(iC,jX); k++){
        if (C->sym[k] == NS)
          // ...
      }
      // ...
      tsr_sym_table[i*tsr_order+j] = 1;
      tsr_sym_table[j*tsr_order+i] = 1;
      // ...
    }
  }
  // ...
  stat = map_tensor(num_sub_phys_dims, tsr_order,
                    tsr_edge_len, tsr_sym_table,
                    restricted, sub_phys_comm,
                    // ...
  // ...
  for (i=0; i<num_weigh; i++){
    iweigh = idx_weigh[i];
    iA = idx_arr[iweigh*3+0];
    iB = idx_arr[iweigh*3+1];
    iC = idx_arr[iweigh*3+2];
    // ...
  }
  // ...
  for (i=0; i<num_weigh; i++){
    weigh_map[i].clear();
  }
  // ...
}
static int map_ctr_indices(int const * idx_arr,
                           int const * idx_ctr,
                           // ...
  int tsr_order, ictr, iA, iB, i, j, jctr, jX, stat, num_sub_phys_dims;
  int * tsr_edge_len, * tsr_sym_table, * restricted, * comm_idx;
  // ...
  tsr_order = num_ctr*2;
  // ...
  memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
  memset(restricted, 0, tsr_order*sizeof(int));
  // ...
  for (i=0; i<tsr_order; i++){
    // ...
  }
  // ...
  for (i=0; i<num_ctr; i++){
    ictr = idx_ctr[i];
    iA = idx_arr[ictr*3+0];
    iB = idx_arr[ictr*3+1];
    // ...
  }
  // ...
       num_sub_phys_dims, &sub_phys_comm, &comm_idx);
  // ...
  for (i=0; i<num_ctr; i++){
    ictr = idx_ctr[i];
    iA = idx_arr[ictr*3+0];
    iB = idx_arr[ictr*3+1];
    // ...
    tsr_sym_table[2*i*tsr_order+2*i+1] = 1;
    tsr_sym_table[(2*i+1)*tsr_order+2*i] = 1;
    // ...
    if (A->sym[iA] != NS){
      for (j=0; j<num_ctr; j++){
        jctr = idx_ctr[j];
        jX = idx_arr[jctr*3+0];
        // ...
        tsr_sym_table[2*i*tsr_order+2*j] = 1;
        tsr_sym_table[2*i*tsr_order+2*j+1] = 1;
        tsr_sym_table[2*j*tsr_order+2*i] = 1;
        tsr_sym_table[2*j*tsr_order+2*i+1] = 1;
        tsr_sym_table[(2*i+1)*tsr_order+2*j] = 1;
        tsr_sym_table[(2*i+1)*tsr_order+2*j+1] = 1;
        tsr_sym_table[(2*j+1)*tsr_order+2*i] = 1;
        tsr_sym_table[(2*j+1)*tsr_order+2*i+1] = 1;
        // ...
      }
    }
    // ...
    if (B->sym[iB] != NS){
      for (j=0; j<num_ctr; j++){
        jctr = idx_ctr[j];
        jX = idx_arr[jctr*3+1];
        // ...
        tsr_sym_table[2*i*tsr_order+2*j] = 1;
        tsr_sym_table[2*i*tsr_order+2*j+1] = 1;
        tsr_sym_table[2*j*tsr_order+2*i] = 1;
        tsr_sym_table[2*j*tsr_order+2*i+1] = 1;
        tsr_sym_table[(2*i+1)*tsr_order+2*j] = 1;
        tsr_sym_table[(2*i+1)*tsr_order+2*j+1] = 1;
        tsr_sym_table[(2*j+1)*tsr_order+2*i] = 1;
        tsr_sym_table[(2*j+1)*tsr_order+2*i+1] = 1;
        // ...
      }
    }
    // ...
  }
  // ...
  stat = map_tensor(num_sub_phys_dims, tsr_order,
                    tsr_edge_len, tsr_sym_table,
                    restricted, sub_phys_comm,
                    // ...
  // ...
  for (i=0; i<num_ctr; i++){
    ictr = idx_ctr[i];
    iA = idx_arr[ictr*3+0];
    iB = idx_arr[ictr*3+1];
    // ...
  }
  // ...
  for (i=0; i<2*num_ctr; i++){
    // ...
  }
  // ...
}
static int map_no_ctr_indices(int const * idx_arr,
                              int const * idx_no_ctr,
                              // ...
  int stat, i, inoctr, iA, iB, iC;
  // ...
  for (i=0; i<num_no_ctr; i++){
    inoctr = idx_no_ctr[i];
    iA = idx_arr[3*inoctr+0];
    iB = idx_arr[3*inoctr+1];
    iC = idx_arr[3*inoctr+2];
    // ...
    if (iA != -1 && iC != -1){
      // ...
    }
    if (iB != -1 && iC != -1){
      // ...
    }
  }
  // ...
  for (i=0; i<num_no_ctr; i++){
    inoctr = idx_no_ctr[i];
    iA = idx_arr[3*inoctr+0];
    iB = idx_arr[3*inoctr+1];
    iC = idx_arr[3*inoctr+2];
    // ...
    if (iA != -1 && iC != -1){
      // ...
    }
    if (iB != -1 && iC != -1){
      // ...
    }
  }
  // ...
}
static int map_extra_indices(int const * idx_arr,
                             int const * idx_extra,
                             // ...
  int i, iA, iB, iC, iextra;
  // ...
  for (i=0; i<num_extra; i++){
    iextra = idx_extra[i];
    iA = idx_arr[3*iextra+0];
    iB = idx_arr[3*iextra+1];
    iC = idx_arr[3*iextra+2];
    // ...
  }
  // ...
}
int contraction::get_num_map_variants(topology const * topo){
  int nAB, nAC, nBC, nmax_ctr_2d;
  return get_num_map_variants(topo, nmax_ctr_2d, nAB, nAC, nBC);
}

int contraction::get_num_map_variants(topology const * topo,
                                      // ...
       &num_tot, &idx_arr);
  // ...
  for (int i=0; i<num_tot; i++){
    if (idx_arr[3*i+0] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] == -1)
      nAB++;
    if (idx_arr[3*i+0] != -1 && idx_arr[3*i+1] == -1 && idx_arr[3*i+2] != -1)
      nAC++;
    if (idx_arr[3*i+0] == -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1)
      nBC++;
  }
  nmax_ctr_2d = std::min(topo->order/2, std::min(nAC, std::min(nAB, nBC)));
  // ...
  for (int nctr_2d=0; nctr_2d<=nmax_ctr_2d; nctr_2d++){
    if (topo->order-2*nctr_2d <= num_tot){
      // ...
    }
  }
  // ...
}
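// switch_topo_perm attempts to reorder the physical dimensions of the
// current topology so the mappings of A, B, and C become contiguous: it
// assigns each mapped physical dimension a new position (requiring that a
// child mapping land directly after its parent), fills unused dimensions
// into the remaining slots, looks up an existing topology with the permuted
// edge lengths, and rewrites every edge map's cdt index accordingly.
// Returns false if no consistent permutation exists.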
bool contraction::switch_topo_perm(){
  // ...
  int new_order[topo->order];
  std::fill(new_order, new_order+topo->order, -1);
  // ...
  for (int i=0; i<A->order; i++){
    // ...
    new_order[map->cdt] = il;
    // ...
  }
  // ...
  for (int i=0; i<B->order; i++){
    // ...
    if (new_order[map->cdt] != -1 || new_order[map->child->cdt] != -1){
      if (new_order[map->child->cdt] != new_order[map->cdt]+1)
        // ...
    }
    // ...
    new_order[map->cdt] = il;
    // ...
  }
  // ...
  for (int i=0; i<C->order; i++){
    // ...
    if (new_order[map->cdt] != -1 || new_order[map->child->cdt] != -1){
      if (new_order[map->child->cdt] != new_order[map->cdt]+1)
        // ...
    }
    // ...
    new_order[map->cdt] = il;
    // ...
  }
  // ...
  for (int i=0; i<topo->order; i++){
    if (new_order[i] == -1){
      // ...
    }
  }
  // ...
  int new_lens[topo->order];
  for (int i=0; i<topo->order; i++){
    new_lens[new_order[i]] = topo->lens[i];
  }
  // ...
  for (int i=0; i<(int)A->wrld->topovec.size(); i++){
    // ...
    bool has_same_len = true;
    for (int j=0; j<topo->order; j++){
      if (A->wrld->topovec[i]->lens[j] != new_lens[j]) has_same_len = false;
    }
    // ...
  }
  // ...
  ASSERT(new_topo != NULL);
  // ...
  for (int i=0; i<A->order; i++){
    // ...
    map->cdt = new_order[map->cdt];
    // ...
  }
  // ...
  for (int i=0; i<B->order; i++){
    // ...
    map->cdt = new_order[map->cdt];
    // ...
  }
  // ...
  for (int i=0; i<C->order; i++){
    // ...
    map->cdt = new_order[map->cdt];
    // ...
  }
  // ...
}
bool contraction::exh_map_to_topo(topology const * topo,
                                  // ...
       &num_tot, &idx_arr);
  // ...
  int nAB, nAC, nBC, nmax_ctr_2d;
  int nvar_tot = get_num_map_variants(topo, nmax_ctr_2d, nAB, nAC, nBC);
  ASSERT(variant < nvar_tot);
  // ...
  for (int nctr_2d=0; nctr_2d<=nmax_ctr_2d; nctr_2d++){
    // ...
    if (topo->order-2*nctr_2d <= num_tot){
      // ...
      int v = variant - nv0;
      int rep_choices = choose(num_tot, topo->order-2*nctr_2d);
      int rep_ch = v%rep_choices;
      int rep_inds[topo->order-2*nctr_2d];
      // ...
      for (int i=2*nctr_2d; i<topo->order; i++){
        int r = rep_inds[i-2*nctr_2d];
        if (idx_arr[3*r+0] != -1)
          // ...
        if (idx_arr[3*r+1] != -1)
          // ...
        if (idx_arr[3*r+2] != -1)
          // ...
      }
      // ...
      for (int i=0; i<nctr_2d; i++){
        // ...
      }
      // ...
      v = v/choose(nAB, nctr_2d);
      // ...
      v = v/choose(nAC, nctr_2d);
      // ...
      v = v/choose(nBC, nctr_2d);
      // ...
      for (int i=0; i<nctr_2d; i++){
        // ...
        for (int j=0; j<num_tot; j++){
          if (idx_arr[3*j+0] != -1 && idx_arr[3*j+1] != -1 && idx_arr[3*j+2] == -1){
            if (iAB[i] == iiAB){
              // ...
            }
            // ...
          }
          // ...
          if (idx_arr[3*j+0] != -1 && idx_arr[3*j+1] == -1 && idx_arr[3*j+2] != -1){
            if (iAC[i] == iiAC){
              // ...
            }
            // ...
          }
          // ...
          if (idx_arr[3*j+0] == -1 && idx_arr[3*j+1] != -1 && idx_arr[3*j+2] != -1){
            if (iBC[i] == iiBC){
              // ...
            }
            // ...
          }
        }
      }
      // ...
    }
  }
  // ...
  for (int i=0; i<num_tot; i++){
    int iA = idx_arr[3*i+0];
    int iB = idx_arr[3*i+1];
    int iC = idx_arr[3*i+2];
    // ...
  }
  // ...
}
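// map_to_topology attempts to map the contraction onto a given processor
// topology under one of the operand orderings selected by get_perm. It
// classifies every index as weigh / contracted / uncontracted / extra, then
// maps each class in turn (map_weigh_indices, map_ctr_indices,
// map_no_ctr_indices, map_extra_indices), re-running map_symtsr and the
// ctr/no-ctr passes so that symmetry-table constraints introduced by one
// pass are satisfied by the others.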
int contraction::map_to_topology(topology * topo,
                                 int        order){
  int num_tot, num_ctr, num_no_ctr, num_weigh, num_extra, i, ret;
  int const * tidx_A, * tidx_B, * tidx_C;
  int * idx_arr, * idx_extra, * idx_no_ctr, * idx_weigh, * idx_ctr;
  // ...
  get_perm<tensor*>(order, A, B, C, tA, tB, tC);
  get_perm<const int*>(order, idx_A, idx_B, idx_C, tidx_A, tidx_B, tidx_C);
  // ...
       &num_tot, &idx_arr);
  // ...
  num_ctr = 0, num_no_ctr = 0, num_extra = 0, num_weigh = 0;
  for (i=0; i<num_tot; i++){
    if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1 && idx_arr[3*i+2] != -1){
      idx_weigh[num_weigh] = i;
      num_weigh++;
    } else if (idx_arr[3*i] != -1 && idx_arr[3*i+1] != -1){
      idx_ctr[num_ctr] = i;
      num_ctr++;
    } else if (idx_arr[3*i+2] != -1 &&
              ((idx_arr[3*i+0] != -1) || (idx_arr[3*i+1] != -1))){
      idx_no_ctr[num_no_ctr] = i;
      num_no_ctr++;
    } else {
      idx_extra[num_extra] = i;
      num_extra++;
    }
  }
  // ...
  ret = map_weigh_indices(idx_arr, idx_weigh, num_tot, num_weigh, topo, tA, tB, tC);
  // ...
  ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr, topo, tA, tB);
  // ...
  ret = map_extra_indices(idx_arr, idx_extra, num_extra, tA, tB, tC);
  // ...
  ret = map_no_ctr_indices(idx_arr, idx_no_ctr, num_tot, num_no_ctr, topo, tA, tB, tC);
  // ...
  ret = map_symtsr(tA->order, tA->sym_table, tA->edge_map);
  // ...
  ret = map_symtsr(tB->order, tB->sym_table, tB->edge_map);
  // ...
  ret = map_ctr_indices(idx_arr, idx_ctr, num_tot, num_ctr, topo, tA, tB);
  // ...
  ret = map_no_ctr_indices(idx_arr, idx_no_ctr, num_tot, num_no_ctr, topo, tA, tB, tC);
  // ...
  ret = map_symtsr(tA->order, tA->sym_table, tA->edge_map);
  // ...
  ret = map_symtsr(tB->order, tB->sym_table, tB->edge_map);
  // ...
}
int contraction::try_topo_morph(){
  // ...
  tensor * tsr_keep, * tsr_change_A, * tsr_change_B;
  // ...
  if (tA == tB && tB == tC){
    // ...
  }
  // ...
  tA = tsr_change_A->topo;
  tB = tsr_change_B->topo;
  tC = tsr_keep->topo;
  // ...
  tsr_change_A->topo = tC;
  // ...
  tsr_change_B->topo = tC;
  // ...
}
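// get_best_sel_map searches for the best mapping of the contraction: it
// loops over six operand orderings (j) and over candidate topologies (the
// operands' current topologies plus every topology in wrld->topovec, with
// the search striped across MPI ranks). Each candidate is mapped, validated
// with check_mapping, priced by the estimated execution and redistribution
// time of the constructed ctr object, and discarded if its memory footprint
// exceeds max_memuse. The globally fastest valid choice is reduced with
// MPI_Allreduce and returned through idx/time.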
void contraction::get_best_sel_map(distribution const * dA,
                                   distribution const * dB,
                                   distribution const * dC,
                                   topology *           old_topo_A,
                                   topology *           old_topo_B,
                                   topology *           old_topo_C,
                                   mapping const *      old_map_A,
                                   mapping const *      old_map_B,
                                   mapping const *      old_map_C,
                                   int &                idx,
                                   double &             time){
  int need_remap_A, need_remap_B, need_remap_C;
  // ...
  double est_time, best_time;
  // ...
  bool is_ctr_sparse = is_sparse();
  // ...
  best_time = DBL_MAX;
  // ...
       &num_tot, &idx_arr);
  // ...
  for (j=0; j<6; j++){
    // (two variants of the topology loop, presumably selected by build flags)
    for (int t=1; t<(int)wrld->topovec.size()+8; t++){
    // ...
    for (int t=global_comm.rank+1; t<(int)wrld->topovec.size()+8; t+=global_comm.np){
      // ...
      if (old_topo_A == NULL) continue;
      // ...
      topo_i = old_topo_A;
      // ...
      if (old_topo_B == NULL || (topo_i != NULL && topo_i != old_topo_B)) continue;
      // ...
      topo_i = old_topo_B;
      // ...
      if (old_topo_C == NULL || (topo_i != NULL && topo_i != old_topo_C)) continue;
      // ...
      topo_i = old_topo_C;
      // ...
      } else topo_i = wrld->topovec[t-8];
      // ...
      ret = map_to_topology(topo_i, j);
      // ...
      if (check_mapping() == 0){
        // ...
      }
      // ...
      if (global_comm.rank == 0){
        printf("\nTest mappings:\n");
        // ...
      }
      // ...
      double nnz_frac_A = 1.0;
      double nnz_frac_B = 1.0;
      double nnz_frac_C = 1.0;
      // ...
      int64_t len_ctr = 1;
      for (int i=0; i<num_tot; i++){
        if (idx_arr[3*i+2]==-1){
          int edge_len = idx_arr[3*i+0] != -1 ? A->lens[idx_arr[3*i+0]]
                                              : B->lens[idx_arr[3*i+1]];
          len_ctr *= edge_len;
        }
      }
      nnz_frac_C = std::min(1., std::max(nnz_frac_C, nnz_frac_A*nnz_frac_B*len_ctr));
      // ...
      if (size_memuse >= (double)max_memuse){
        if (global_comm.rank == 0)
          DPRINTF(1, "Not enough memory available for topo %d with order %d to store tensors %ld/%ld\n", t, j, (int64_t)size_memuse, max_memuse);
        // ...
      }
      // ...
      est_time = est_time_fold();
      iparam prm = map_fold(false);
      sctr = construct_ctr(1, &prm);
      // ...
      est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
      // ...
      sctr = construct_ctr();
      // ...
      est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
      // ...
      if (global_comm.rank == 0){
        printf("mapping passed contr est_time = %E sec\n", est_time);
        // ...
      }
      // ...
      if (topo_i == old_topo_A){
        for (d=0; d<A->order; d++){
          // ...
        }
      }
      // ...
      if (topo_i == old_topo_B){
        for (d=0; d<B->order; d++){
          // ...
        }
      }
      // ...
      if (topo_i == old_topo_C){
        for (d=0; d<C->order; d++){
          // ...
        }
      }
      // ...
      memuse = std::max(1.0*memuse, 2.*C->get_redist_mem(*dC, nnz_frac_C));
      // ...
      memuse = MAX(((spctr*)sctr)->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C), memuse);
      // ...
      memuse = MAX((int64_t)sctr->mem_rec(), memuse);
      // ...
      if (global_comm.rank == 0){
        printf("total (with redistribution and transp) est_time = %E\n", est_time);
        // ...
      }
      // ...
      if ((int64_t)memuse >= max_memuse){
        if (global_comm.rank == 0)
          DPRINTF(1, "Not enough memory available for topo %d with order %d memory %ld/%ld\n", t, j, memuse, max_memuse);
        // ...
      }
      // ...
      DPRINTF(1, "MPI does not handle enough bits for topo %d with order\n", j);
      // ...
      if (est_time < best_time) {
        best_time = est_time;
        // ...
      }
      // ...
    }
  }
  // ...
  MPI_Allreduce(&best_time, &gbest_time, 1, MPI_DOUBLE, MPI_MIN, global_comm.cm);
  if (best_time != gbest_time){
    // ...
  }
  // ...
  MPI_Allreduce(&btopo, &ttopo, 1, MPI_INT, MPI_MIN, global_comm.cm);
  // ...
}
void contraction::get_best_exh_map(distribution const * dA,
                                   distribution const * dB,
                                   distribution const * dC,
                                   topology *           old_topo_A,
                                   topology *           old_topo_B,
                                   topology *           old_topo_C,
                                   mapping const *      old_map_A,
                                   mapping const *      old_map_B,
                                   mapping const *      old_map_C,
                                   int &                idx,
                                   double &             time,
                                   double               init_best_time=DBL_MAX){
  int need_remap_A, need_remap_B, need_remap_C;
  // ...
  double est_time, best_time;
  // ...
  bool is_ctr_sparse = is_sparse();
  // ...
  best_time = init_best_time;
  // ...
       &num_tot, &idx_arr);
  int64_t tot_num_choices = 0;
  for (int i=0; i<(int)wrld->topovec.size(); i++){
    // ...
    tot_num_choices += get_num_map_variants(wrld->topovec[i]);
  }
  int64_t valid_mappings = 0;
  int64_t choice_offset = 0;
  // ...
  for (int i=0; i<(int)wrld->topovec.size(); i++){
    // ...
    int tnum_choices = get_num_map_variants(wrld->topovec[i]);
    // ...
    int64_t old_off = choice_offset;
    choice_offset += tnum_choices;
    for (int j=0; j<tnum_choices; j++){
      if ((old_off + j)%global_comm.np != global_comm.rank)
        continue;
      // ...
      bool br = exh_map_to_topo(topo_i, j);
      // ...
      if (!br) DPRINTF(3, "exh_map_to_topo returned false\n");
      // ...
      br = switch_topo_perm();
      // ...
      if (!br){ DPRINTF(3, "switch topo perm returned false\n"); }
      // ...
      if (check_mapping() == 0){
        // ...
      }
      // ...
      printf("\nTest mappings:\n");
      // ...
      double nnz_frac_A = 1.0;
      double nnz_frac_B = 1.0;
      double nnz_frac_C = 1.0;
      // ...
      nnz_frac_C = std::max(nnz_frac_C, nnz_frac_A);
      nnz_frac_C = std::max(nnz_frac_C, nnz_frac_B);
      int64_t len_ctr = 1;
      for (int i=0; i<num_tot; i++){
        if (idx_arr[3*i+2]==-1){
          int edge_len = idx_arr[3*i+0] != -1 ? A->lens[idx_arr[3*i+0]]
                                              : B->lens[idx_arr[3*i+1]];
          len_ctr *= edge_len;
        }
      }
      nnz_frac_C = std::min(1., std::max(nnz_frac_C, nnz_frac_A*nnz_frac_B*len_ctr));
      // ...
      est_time = est_time_fold();
      iparam prm = map_fold(false);
      sctr = construct_ctr(1, &prm);
      // ...
      est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
      // ...
      sctr = construct_ctr();
      // ...
      est_time = ((spctr*)sctr)->est_time_rec(sctr->num_lyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
      // ...
      printf("mapping passed contr est_time = %E sec\n", est_time);
      // ...
      if (topo_i == old_topo_A){
        for (d=0; d<A->order; d++){
          // ...
        }
      }
      // ...
      if (topo_i == old_topo_B){
        for (d=0; d<B->order; d++){
          // ...
        }
      }
      // ...
      if (topo_i == old_topo_C){
        for (d=0; d<C->order; d++){
          // ...
        }
      }
      // ...
      memuse = std::max(1.*memuse, 2.*C->get_redist_mem(*dC, nnz_frac_C));
      // ...
      if (est_time >= best_time) continue;
      // ...
      memuse = MAX(((spctr*)sctr)->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C), memuse);
      // ...
      memuse = MAX((int64_t)sctr->mem_rec(), memuse);
      // ...
      printf("total (with redistribution and transp) est_time = %E\n", est_time);
      // ...
      if ((int64_t)memuse >= max_memuse){
        DPRINTF(1, "[EXH] Not enough memory available for topo %d with order %d memory %ld/%ld\n", i, j, memuse, max_memuse);
        // ...
      }
      // ...
      DPRINTF(1, "MPI does not handle enough bits for topo %d with order %d \n", i, j);
      // ...
      if (est_time < best_time) {
        best_time = est_time;
        // ...
      }
      // ...
    }
  }
  // ...
  int64_t tot_valid_mappings;
  MPI_Allreduce(&valid_mappings, &tot_valid_mappings, 1, MPI_INT64_T, MPI_SUM, global_comm.cm);
  if (A->wrld->rank == 0) DPRINTF(2, "number valid mappings was %ld/%ld\n", tot_valid_mappings, tot_num_choices);
  // ...
  MPI_Allreduce(&best_time, &gbest_time, 1, MPI_DOUBLE, MPI_MIN, global_comm.cm);
  if (best_time != gbest_time){
    // ...
  }
  // ...
  MPI_Allreduce(&btopo, &ttopo, 1, MPI_INT, MPI_MIN, global_comm.cm);
  // ...
}
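// map is the top-level mapping driver: it saves the operands' current
// topologies and distributions, runs the selective search
// (get_best_sel_map) and, unless the selective result is already cheap
// (estimated under 1 sec), the exhaustive search (get_best_exh_map), then
// decodes the winning (topology, variant) pair, re-applies that mapping,
// constructs the ctr object (folding into GEMM form when possible), and
// redistributes any tensor whose mapping changed.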
int contraction::map(ctr ** ctrf, bool do_remap){
  int ret, j, need_remap, d;
  int * old_phase_A, * old_phase_B, * old_phase_C;
  topology * old_topo_A, * old_topo_B, * old_topo_C;
  // ...
  old_topo_A = A->topo;
  old_topo_B = B->topo;
  old_topo_C = C->topo;
  // ...
  if (global_comm.rank == 0)
    printf("Initial mappings:\n");
  // ...
  for (j=0; j<A->order; j++){
    // ...
  }
  for (j=0; j<B->order; j++){
    // ...
  }
  for (j=0; j<C->order; j++){
    // ...
  }
  // ...
  int ttopo, ttopo_sel, ttopo_exh;
  double gbest_time_sel, gbest_time_exh;
  // ...
  get_best_sel_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, ttopo_sel, gbest_time_sel);
  // ...
  if (gbest_time_sel < 1.){
    gbest_time_exh = gbest_time_sel+1.;
    ttopo_exh = ttopo_sel;
  } else {
    get_best_exh_map(dA, dB, dC, old_topo_A, old_topo_B, old_topo_C, old_map_A, old_map_B, old_map_C, ttopo_exh, gbest_time_exh, gbest_time_sel);
  }
  // ...
  if (gbest_time_sel <= gbest_time_exh){
    // ...
  }
  // ...
  if (!do_remap || ttopo == INT_MAX || ttopo == -1){
    // ...
    delete [] old_map_A;
    delete [] old_map_B;
    delete [] old_map_C;
    // ...
    if (ttopo == INT_MAX || ttopo == -1){
      printf("ERROR: Failed to map contraction!\n");
      // ...
    }
    // ...
  }
  // ...
  if (gbest_time_sel <= gbest_time_exh){
    // ...
    if (((ttopo/6) & 1) > 0){
      topo_g = old_topo_A;
      // ...
    }
    if (((ttopo/6) & 2) > 0){
      topo_g = old_topo_B;
      // ...
    }
    if (((ttopo/6) & 4) > 0){
      topo_g = old_topo_C;
      // ...
    }
    // ...
    assert(topo_g != NULL);
    // ...
  } else topo_g = wrld->topovec[(ttopo-48)/6];
  // ...
  int64_t choice_offset = 0;
  // ...
  int64_t old_off = 0;
  for (i=0; i<(int)wrld->topovec.size(); i++){
    // ...
    int tnum_choices = get_num_map_variants(wrld->topovec[i]);
    old_off = choice_offset;
    choice_offset += tnum_choices;
    if (choice_offset > ttopo) break;
  }
  // ...
  j_g = ttopo-old_off;
  // ...
  if (gbest_time_sel <= gbest_time_exh){
    ret = map_to_topology(topo_g, j_g);
    // ...
    printf("ERROR ON FINAL MAP ATTEMPT, THIS SHOULD NOT HAPPEN\n");
    // ...
  } else {
    exh_map_to_topo(topo_g, j_g);
    // ...
  }
  // ...
  if (!check_mapping())
    printf("ERROR ON FINAL MAP ATTEMPT, THIS SHOULD NOT HAPPEN\n");
  // ...
  iparam prm = map_fold(false);
  *ctrf = construct_ctr(1, &prm);
  // ...
  *ctrf = construct_ctr();
  // ...
  if (global_comm.rank == 0)
    printf("New mappings:\n");
  // ...
  MPI_Barrier(global_comm.cm);
  // ...
  double nnz_frac_A = 1.0;
  double nnz_frac_B = 1.0;
  double nnz_frac_C = 1.0;
  // ...
       &num_tot, &idx_arr);
  // ...
  int64_t len_ctr = 1;
  for (int i=0; i<num_tot; i++){
    if (idx_arr[3*i+2]==-1){
      int edge_len = idx_arr[3*i+0] != -1 ? A->lens[idx_arr[3*i+0]]
                                          : B->lens[idx_arr[3*i+1]];
      len_ctr *= edge_len;
    }
  }
  nnz_frac_C = std::min(1., std::max(nnz_frac_C, nnz_frac_A*nnz_frac_B*len_ctr));
  // ...
  memuse = MAX(((spctr*)*ctrf)->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C), memuse);
  // ...
  memuse = MAX((int64_t)(*ctrf)->mem_rec(), memuse);
  // ...
  if (global_comm.rank == 0){
    VPRINTF(1, "Contraction will use %E bytes per processor out of %E available memory and take an estimated %E sec\n",
            /* ... */);
  }
  // ...
  if (A->topo == old_topo_A){
    for (d=0; d<A->order; d++){
      // ...
    }
  }
  // ...
  if (B->topo == old_topo_B){
    for (d=0; d<B->order; d++){
      // ...
    }
  }
  // ...
  if (C->topo == old_topo_C){
    for (d=0; d<C->order; d++){
      // ...
    }
  }
  // ...
  TAU_FSTOP(redistribute_for_contraction);
  // ...
  delete [] old_map_A;
  delete [] old_map_B;
  delete [] old_map_C;
  // ...
}
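// construct_dense_ctr builds the recursive ctr-object tree for a dense
// contraction: it computes block and virtual-block sizes per operand,
// replicates tensors along physical dimensions not used by all operands,
// generates a communication node (ctr_gen) for every index that is missing
// from exactly one operand, optionally wraps the tree in a ctr_offload node
// for accelerator offload, adds a ctr_virt node to iterate over virtual
// blocks, and terminates in a seq_tsr_ctr leaf that runs the local kernel
// (with the folded GEMM parameters when is_inner is set).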
ctr * contraction::construct_dense_ctr(int            is_inner,
                                       iparam const * inner_params,
                                       // ...
                                       int const *    phys_mapped){
  int num_tot, i, i_A, i_B, i_C, is_top, nphys_dim;
  // ...
  int64_t blk_sz_A, blk_sz_B, blk_sz_C;
  int64_t vrt_sz_A, vrt_sz_B, vrt_sz_C;
  // ...
  int * blk_len_A, * virt_blk_len_A, * blk_len_B;
  int * virt_blk_len_B, * blk_len_C, * virt_blk_len_C;
  int * idx_arr, * virt_dim;
  // ...
  ctr ** rec_ctr = NULL;
  // ...
       &num_tot, &idx_arr);
  // ...
  calc_dim(A->order, /* ... */
           &vrt_sz_A, virt_blk_len_A, blk_len_A);
  calc_dim(B->order, /* ... */
           &vrt_sz_B, virt_blk_len_B, blk_len_B);
  calc_dim(C->order, /* ... */
           &vrt_sz_C, virt_blk_len_C, blk_len_C);
  // ...
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[3*i+0] == 0 ||
        phys_mapped[3*i+1] == 0 ||
        phys_mapped[3*i+2] == 0){
      // ...
      if (global_comm.rank == 0)
        DPRINTF(2, "Replicating tensor\n");
      // ...
    }
  }
  // ...
  int upload_phase_A = 1;
  int upload_phase_B = 1;
  int download_phase_C = 1;
  // ...
  for (i=0; i<num_tot; i++){
    // ...
    i_A = idx_arr[3*i+0];
    i_B = idx_arr[3*i+1];
    i_C = idx_arr[3*i+2];
    // ...
    if ((i_A != -1 && i_B != -1 && i_C == -1) ||
        (i_A != -1 && i_B == -1 && i_C != -1) ||
        (i_A == -1 && i_B != -1 && i_C != -1)) {
      // ...
      ctr_gen->alloc_host_buf = false;
      // ...
      if (bottom_ctr_gen == NULL)
        bottom_ctr_gen = ctr_gen;
      // ...
    }
    // ...
      virt_dim[i] = map->np;
    } else if (i_B != -1){
      // ...
      virt_dim[i] = map->np;
    } else if (i_C != -1){
      // ...
      virt_dim[i] = map->np;
    }
    // ...
    nvirt = nvirt * virt_dim[i];
  }
  // ...
  if (nvirt_all != NULL)
    *nvirt_all = nvirt;
  // ...
  bool do_offload = false;
  // ...
  if ((!is_custom || func->has_off_gemm) && is_inner > 0 && (is_custom || C->sr->is_offloadable())){
    // ...
    if (bottom_ctr_gen != NULL)
      bottom_ctr_gen->alloc_host_buf = true;
    ctr_offload * ctroff = new ctr_offload(this, blk_sz_A, blk_sz_B, blk_sz_C, total_iter, upload_phase_A, upload_phase_B, download_phase_C);
    // ...
    rec_ctr = &ctroff->rec_ctr;
    // ...
  }
  // ...
  if (blk_sz_A < vrt_sz_A){
    printf("blk_sz_A = %ld, vrt_sz_A = %ld\n", blk_sz_A, vrt_sz_A);
    printf("sizes are %ld %ld %ld\n", A->size, B->size, C->size);
    // ...
  }
  if (blk_sz_B < vrt_sz_B){
    printf("blk_sz_B = %ld, vrt_sz_B = %ld\n", blk_sz_B, vrt_sz_B);
    printf("sizes are %ld %ld %ld\n", A->size, B->size, C->size);
    // ...
  }
  if (blk_sz_C < vrt_sz_C){
    printf("blk_sz_C = %ld, vrt_sz_C = %ld\n", blk_sz_C, vrt_sz_C);
    printf("sizes are %ld %ld %ld\n", A->size, B->size, C->size);
    // ...
  }
  // ...
  ASSERT(blk_sz_A >= vrt_sz_A);
  ASSERT(blk_sz_B >= vrt_sz_B);
  ASSERT(blk_sz_C >= vrt_sz_C);
  // ...
  ctr_virt * ctrv = new ctr_virt(this, num_tot, virt_dim, vrt_sz_A, vrt_sz_B, vrt_sz_C);
  // ...
  if (inner_params != NULL)
    inp_cpy = *inner_params;
  // ...
  seq_tsr_ctr * ctrseq = new seq_tsr_ctr(this, is_inner, &inp_cpy, virt_blk_len_A, virt_blk_len_B, virt_blk_len_C, vrt_sz_C);
  // ...
}
ctr * contraction::construct_sparse_ctr(int            is_inner,
                                        iparam const * inner_params,
                                        // ...
                                        int const *    phys_mapped){
  int num_tot, i, i_A, i_B, i_C, nphys_dim, is_top, nvirt;
  int * idx_arr, * virt_dim;
  int64_t blk_sz_A, blk_sz_B, blk_sz_C;
  int64_t vrt_sz_A, vrt_sz_B, vrt_sz_C;
  int * blk_len_A, * virt_blk_len_A, * blk_len_B;
  int * virt_blk_len_B, * blk_len_C, * virt_blk_len_C;
  // ...
  spctr * hctr = NULL;
  spctr ** rec_ctr = NULL;
  // ...
       &num_tot, &idx_arr);
  // ...
  calc_dim(A->order, /* ... */
           &vrt_sz_A, virt_blk_len_A, blk_len_A);
  calc_dim(B->order, /* ... */
           &vrt_sz_B, virt_blk_len_B, blk_len_B);
  calc_dim(C->order, /* ... */
           &vrt_sz_C, virt_blk_len_C, blk_len_C);
  // ...
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[3*i+0] == 0 ||
        phys_mapped[3*i+1] == 0 ||
        phys_mapped[3*i+2] == 0){
      // ...
      if (global_comm.rank == 0)
        DPRINTF(2, "Replicating tensor\n");
      // ...
    }
  }
  // ...
  int upload_phase_A = 1;
  int upload_phase_B = 1;
  int download_phase_C = 1;
  // ...
  int spvirt_blk_len_A[A->order];
  // ...
  std::fill(spvirt_blk_len_A, spvirt_blk_len_A+A->order, 1);
  // ...
  memcpy(spvirt_blk_len_A, virt_blk_len_A, sizeof(int)*A->order);
  int spvirt_blk_len_B[B->order];
  // ...
  std::fill(spvirt_blk_len_B, spvirt_blk_len_B+B->order, 1);
  // ...
  memcpy(spvirt_blk_len_B, virt_blk_len_B, sizeof(int)*B->order);
  int spvirt_blk_len_C[C->order];
  // ...
  std::fill(spvirt_blk_len_C, spvirt_blk_len_C+C->order, 1);
  // ...
  memcpy(spvirt_blk_len_C, virt_blk_len_C, sizeof(int)*C->order);
  // ...
  for (i=0; i<num_tot; i++){
    // ...
    i_A = idx_arr[3*i+0];
    i_B = idx_arr[3*i+1];
    i_C = idx_arr[3*i+2];
    // ...
    if ((i_A != -1 && i_B != -1 && i_C == -1) ||
        (i_A != -1 && i_B == -1 && i_C != -1) ||
        (i_A == -1 && i_B != -1 && i_C != -1)) {
      // ...
      ctr_gen->alloc_host_buf = false;
      // ...
      if (bottom_ctr_gen == NULL)
        bottom_ctr_gen = ctr_gen;
      // ...
    }
    // ...
      virt_dim[i] = map->np;
    } else if (i_B != -1){
      // ...
      virt_dim[i] = map->np;
    } else if (i_C != -1){
      // ...
      virt_dim[i] = map->np;
    }
    // ...
    nvirt = nvirt * virt_dim[i];
  }
  // ...
  if (nvirt_all != NULL)
    *nvirt_all = nvirt;
  // ...
  bool do_offload = false;
  // ...
  if (is_custom && func->has_off_gemm){
    // ...
    if (bottom_ctr_gen != NULL)
      bottom_ctr_gen->alloc_host_buf = true;
    spctr_offload * ctroff = new spctr_offload(this, blk_sz_A, blk_sz_B, blk_sz_C, total_iter, upload_phase_A, upload_phase_B, download_phase_C);
    // ...
    rec_ctr = &ctroff->rec_ctr;
    // ...
  }
  // ...
  if (inner_params != NULL)
    inp_cpy = *inner_params;
  // ...
  ctrseq = new seq_tsr_spctr(this, krnl_type, &inp_cpy, virt_blk_len_A, virt_blk_len_B, virt_blk_len_C, 0);
  // ...
  ctrseq = new seq_tsr_spctr(this, krnl_type, &inp_cpy, virt_blk_len_A, virt_blk_len_B, virt_blk_len_C, vrt_sz_C);
  // ...
}
ctr * contraction::construct_ctr(int            is_inner,
                                 iparam const * inner_params,
                                 // ...
  memset(phys_mapped, 0, sizeof(int)*nphys_dim*3);
  // ...
  for (i=0; i<A->order; i++){
    // ...
    phys_mapped[3*map->cdt+0] = 1;
    // ...
    phys_mapped[3*map->cdt+0] = 1;
    // ...
  }
  for (i=0; i<B->order; i++){
    // ...
    phys_mapped[3*map->cdt+1] = 1;
    // ...
    phys_mapped[3*map->cdt+1] = 1;
    // ...
  }
  for (i=0; i<C->order; i++){
    // ...
    phys_mapped[3*map->cdt+2] = 1;
    // ...
    phys_mapped[3*map->cdt+2] = 1;
    // ...
  }
  // ...
  if (is_sparse())
    hctr = construct_sparse_ctr(is_inner, inner_params, nvirt_all, is_used, phys_mapped);
  else
    hctr = construct_dense_ctr(is_inner, inner_params, nvirt_all, is_used, phys_mapped);
  // ...
}
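// contract() performs the mapped contraction: repeated indices on C are
// first extracted onto a diagonal (new_idx_C), aliased operands (A == B or
// A == C) are copied into fresh tensors so the kernel sees distinct
// buffers, and a commuted form C = B*A (the CBA contraction below) is
// presumably substituted when that ordering is preferable. Under DEBUG
// builds the result is cross-checked against a sequentially computed
// reference (the legacy allread_tsr block that follows).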
int contraction::contract(){
  // ...
  for (int i=0; i<C->order; i++){
    new_idx_C[i] = i-num_diag;
    for (int j=0; j<i; j++){
      if (idx_C[i] == idx_C[j]){
        new_idx_C[i] = j-num_diag;
        // ...
      }
    }
  }
  // ...
  if (A == B || A == C){
    // ...
    new_ctr.A = new_tsr;
    stat = new_ctr.contract();
    // ...
  }
  // ...
    new_ctr.B = new_tsr;
    stat = new_ctr.contract();
  // ...
  contraction CBA(B, idx_B, A, idx_A, alpha, C, idx_C, beta, func);
  // ...
#if DEBUG >= 1 //|| VERBOSE >= 1)
  // ...
#endif
  prescale_operands();
  // ...
  int64_t nA, nB, nC, up_nC;
  dtype * sA, * sB, * ans_C;
  dtype * uA, * uB, * uC;
  dtype * up_C, * up_ans_C, * pup_C;
  int order_A, order_B, order_C, i, pass;
  int * edge_len_A, * edge_len_B, * edge_len_C;
  int * sym_A, * sym_B, * sym_C;
  // ...
  stat = allread_tsr(type->tid_A, &nsA, &sA);
  // ...
  stat = allread_tsr(type->tid_B, &nsB, &sB);
  // ...
  stat = allread_tsr(type->tid_C, &nC, &ans_C);
  // ...
  MPI_Barrier(global_comm.cm);
  // ...
  if (stat == ERROR) {
    printf("Failed to map tensors to physical grid\n");
    // ...
  }
  // ...
  if (check_mapping() != 0) {
    // ...
    if (global_comm.rank == 0)
      printf("Keeping mappings:\n");
    // ...
    if (global_comm.rank == 0){
      printf("Initial mappings are unsuitable mappings:\n");
      // ...
    }
  }
  // ...
  if (stat == ERROR) {
    printf("Failed to map tensors to physical grid\n");
    // ...
  }
#if (VERBOSE >= 1 || DEBUG >= 1)
  if (global_comm.rank == 0){
    // ...
  }
#endif
  // ...
  MPI_Barrier(global_comm.cm);
  // ...
  bool is_inner = false;
  // ...
  is_inner = can_fold();
  // ...
  ctrf = construct_ctr(1, &prm);
  // ...
  if (global_comm.rank == 0){
    // ...
  }
  double dtt = MPI_Wtime();
  // ...
  MPI_Barrier(global_comm.cm);
  // ...
  int64_t * size_blk_A = NULL;
  int64_t * size_blk_B = NULL;
  int64_t * size_blk_C = NULL;
  // ...
  else data_A = A->data;
  // ...
  else data_B = B->data;
  // ...
  else data_C = C->data;
  // ...
  if (data_C != C->data) {
    // ...
  }
  // ...
  if (size_blk_A != NULL) cdealloc(size_blk_A);
  if (size_blk_B != NULL) cdealloc(size_blk_B);
  if (size_blk_C != NULL) cdealloc(size_blk_C);
  // ...
  MPI_Barrier(global_comm.cm);
  // ...
  VPRINTF(2, "Contraction permutation completed in %lf sec.\n", MPI_Wtime()-dtt);
  // ...
  stat = allread_tsr(type->tid_A, &nA, &uA);
  // ...
  stat = get_tsr_info(type->tid_A, &order_A, &edge_len_A, &sym_A);
  // ...
  stat = allread_tsr(type->tid_B, &nB, &uB);
  // ...
  stat = get_tsr_info(type->tid_B, &order_B, &edge_len_B, &sym_B);
  // ...
  if (nsA != nA) { printf("nsA = %" PRId64 ", nA = %" PRId64 "\n", nsA, nA); ABORT; }
  if (nsB != nB) { printf("nsB = %" PRId64 ", nB = %" PRId64 "\n", nsB, nB); ABORT; }
  for (i=0; (int64_t)i<nA; i++){
    if (fabs(uA[i] - sA[i]) > 1.E-6){
      printf("A[i] = %lf, sA[i] = %lf\n", uA[i], sA[i]);
      // ...
    }
  }
  for (i=0; (int64_t)i<nB; i++){
    if (fabs(uB[i] - sB[i]) > 1.E-6){
      printf("B[%d] = %lf, sB[%d] = %lf\n", i, uB[i], i, sB[i]);
      // ...
    }
  }
  stat = allread_tsr(type->tid_C, &nC, &uC);
  // ...
  stat = get_tsr_info(type->tid_C, &order_C, &edge_len_C, &sym_C);
  // ...
  DEBUG_PRINTF("packed size of C is %" PRId64 " (should be %" PRId64 ")\n", nC,
               /* ... */);
  // ... (reference sequential contraction on the unpacked data)
       uA, order_A, edge_len_A, edge_len_A, sym_A, idx_A,
       uB, order_B, edge_len_B, edge_len_B, sym_B, idx_B,
       // ...
       ans_C, order_C, edge_len_C, edge_len_C, sym_C, idx_C);
  // ...
  for (i=0; i<nC; i++){
    // ...
    printf("PACKED: C[%d] = %lf, ans_C[%d] = %lf\n",
           i, C[i], i, ans_C[i]);
    // ...
  }
  // ...
  punpack_tsr(uC, order_C, edge_len_C,
              sym_C, 1, &sym_tmp, &up_C);
  punpack_tsr(ans_C, order_C, edge_len_C,
              sym_C, 1, &sym_tmp, &up_ans_C);
  punpack_tsr(up_ans_C, order_C, edge_len_C,
              sym_C, 0, &sym_tmp, &pup_C);
  for (i=0; (int64_t)i<nC; i++){
    assert(fabs(pup_C[i] - ans_C[i]) < 1.E-6);
  }
  // ...
  for (i=0; i<order_C; i++){ up_nC *= edge_len_C[i]; };
  // ...
  for (i=0; i<(int)up_nC; i++){
    if (fabs((up_C[i]-up_ans_C[i])/up_ans_C[i]) > 1.E-6 &&
        fabs((up_C[i]-up_ans_C[i])) > 1.E-6){
      printf("C[%d] = %lf, ans_C[%d] = %lf\n",
             i, up_C[i], i, up_ans_C[i]);
      // ...
    }
  }
  // ...
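// sym_contract handles symmetry before the raw contraction: it checks
// operand consistency, extracts diagonals named by repeated indices,
// desymmetrizes tensors whose symmetries the index map breaks
// (unfold_broken_sym) when a mapping for the unfolded form exists, scales
// alpha by the symmetry order factor (ocfact), and otherwise expands the
// contraction into a list of signed permutations (perm_types/signs),
// contracting each term and finally popping any stacked diagonal
// extractions back onto C.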
int contraction::sym_contract(){
  // ...
  int * map_A, * map_B, * map_C;
  // ...
  int ** dstack_map_C;
  // ...
  std::vector<contraction> perm_types;
  std::vector<int> signs;
  // ...
  tensor * tnsr_A, * tnsr_B, * tnsr_C;
  // ...
  bool is_cons = this->check_consistency();
  if (!is_cons) return ERROR;
  // ...
  for (int i=0; i<C->order; i++){
    new_idx_C[i] = i-num_diag;
    for (int j=0; j<i; j++){
      if (idx_C[i] == idx_C[j]){
        new_idx_C[i] = j-num_diag;
        // ...
      }
    }
  }
  // ...
  int * new_idx_A, * new_idx_B, * new_idx_C;
  if (!is_custom || func->left_distributive){
    // ...
    if (new_tsr_A != A) {
      contraction ctr(new_tsr_A, new_idx_A, B, new_idx_B, alpha, C, new_idx_C, beta, func);
      // ...
    }
  }
  if (!is_custom || func->right_distributive){
    // ...
    if (new_tsr_B != B) {
      contraction ctr(A, new_idx_A, new_tsr_B, new_idx_B, alpha, C, new_idx_C, beta, func);
      // ...
    }
  }
  // ...
  memcpy(map_A, idx_A, A->order*sizeof(int));
  memcpy(map_B, idx_B, B->order*sizeof(int));
  memcpy(map_C, idx_C, C->order*sizeof(int));
  // ...
  if (tnsr_A != A) delete tnsr_A;
  // ...
  if (tnsr_B != B) delete tnsr_B;
  // ...
  dstack_map_C[nst_C] = map_C;
  dstack_tsr_C[nst_C] = tnsr_C;
  // ...
  if (is_custom) fptr = func;
  // ...
  if (tnsr_A == tnsr_C){
    // ...
    nnew_ctr.A = nnew_tsr;
    stat = nnew_ctr.sym_contract();
    // ...
  } else if (tnsr_B == tnsr_C){
    // ...
    nnew_ctr.B = nnew_tsr;
    stat = nnew_ctr.sym_contract();
    // ...
  }
  // ...
  char const * align_alpha = alpha;
  // ...
  tnsr_C->sr->addinv(alpha, u_align_alpha);
  // ...
  align_alpha = u_align_alpha;
  // ...
  tnsr_C->sr->safecopy(oc_align_alpha, align_alpha);
  // ...
  for (int i=0; i<ocfact; i++){
    tnsr_C->sr->add(oc_align_alpha, align_alpha, oc_align_alpha);
  }
  // ...
  if (new_ctr.unfold_broken_sym(NULL) != -1){
    if (global_comm.rank == 0)
      DPRINTF(2, "Contraction index is broken\n");
    // ...
    new_ctr.unfold_broken_sym(&unfold_ctr);
    if (unfold_ctr->map(&ctrf, 0) == SUCCESS){
      // ...
      if (tnsr_A == tnsr_B){
        tnsr_A = new tensor(tnsr_B);
      }
      // ...
      if (global_comm.rank == 0)
        DPRINTF(1, "%d Performing index desymmetrization\n", tnsr_A->wrld->rank);
      unfold_ctr->alpha = align_alpha;
      stat = unfold_ctr->sym_contract();
      // ...
      int sidx_C[tnsr_C->order];
      for (int iis=0; iis<tnsr_C->order; iis++){
        // ...
      }
      // ...
      if (tnsr_A != unfold_ctr->A){
        // ...
        delete unfold_ctr->A;
        // ...
      }
      if (tnsr_B != unfold_ctr->B){
        // ...
        delete unfold_ctr->B;
        // ...
      }
      if (tnsr_C != unfold_ctr->C){
        // ...
        delete unfold_ctr->C;
        // ...
      }
      // ...
      DPRINTF(1, "%d Not Performing index desymmetrization\n", tnsr_A->wrld->rank);
      // ...
      for (i=0; i<(int)perm_types.size(); i++){
        // ...
        C->sr->copy(new_alpha, oc_align_alpha);
        // ...
        tnsr_C->sr->addinv(oc_align_alpha, new_alpha);
        // ...
        perm_types[i].alpha = new_alpha;
        perm_types[i].beta = dbeta;
        stat = perm_types[i].contract();
        // ...
      }
      // ...
    }
  }
  // ...
  new_ctr.alpha = oc_align_alpha;
  stat = new_ctr.contract();
  // ...
  if (tnsr_A != A) delete tnsr_A;
  if (tnsr_B != B) delete tnsr_B;
  for (int i=nst_C-1; i>=0; i--){
    dstack_tsr_C[i]->extract_diag(dstack_map_C[i], 0, tnsr_C, &new_idx);
    // ...
    tnsr_C = dstack_tsr_C[i];
  }
  // ...
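// home_contract is the outermost stage (compiled under HOME_CONTRACT,
// otherwise it falls through to sym_contract): it extracts repeated C
// indices, converts an operand to sparse format (sparsify) when that
// appears advantageous, rewrites a contraction containing a "weigh" index
// over sparse operands by appending an auxiliary dimension to one operand
// (the X2/nc rewrite below), swaps each home tensor for a working copy when
// the mapping requires it, runs sym_contract, and finally migrates any
// non-home data back into the home buffers of A, B, and C.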
int contraction::home_contract(){
#ifndef HOME_CONTRACT
  return sym_contract();
#else
  // ...
  int was_home_A, was_home_B, was_home_C;
  // ...
  int64_t * nnz_blk_C = (int64_t*)alloc(sizeof(int64_t)*C->calc_nvirt());
  // ...
  int64_t * nnz_blk_zero = (int64_t*)alloc(sizeof(int64_t)*C->calc_nvirt());
  std::fill(nnz_blk_zero, nnz_blk_zero+C->calc_nvirt(), 0);
  // ...
  for (int i=0; i<C->order; i++){ idx[i] = 'a'+i; }
  // ...
  for (int i=0; i<C->order; i++){
    new_idx_C[i] = i-num_diag;
    for (int j=0; j<i; j++){
      if (idx_C[i] == idx_C[j]){
        new_idx_C[i] = new_idx_C[j];
        // ...
      }
    }
  }
  // ...
  pre_new_ctr.A = new tensor(A, 1, 1);
  pre_new_ctr.A->sparsify([](char const *){ return true; });
  // ...
  delete pre_new_ctr.A;
  // ...
  pre_new_ctr.B = new tensor(B, 1, 1);
  pre_new_ctr.B->sparsify([](char const *){ return true; });
  // ...
  delete pre_new_ctr.B;
  // ...
  if (this->is_sparse()){
    int num_tot, * idx_arr;
    // ...
         &num_tot, &idx_arr);
    int iA = -1, iB = -1;
    int has_weigh = false;
    // ...
    int64_t szA1=1, szA2=1, szB1=1, szB2=1;
    for (int i=0; i<num_tot; i++){
      if (idx_arr[3*i+0] != -1 &&
          idx_arr[3*i+1] != -1 &&
          idx_arr[3*i+2] != -1){
        has_weigh = true;
        iA = idx_arr[3*i+0];
        iB = idx_arr[3*i+1];
        szA1 *= A->lens[iA];
        szA2 *= A->lens[iA];
        szB1 *= B->lens[iB];
        szB2 *= B->lens[iB];
      } else if (idx_arr[3*i+0] != -1 &&
                 idx_arr[3*i+1] != -1 &&
                 idx_arr[3*i+2] == -1){
        szA1 *= A->lens[idx_arr[3*i+0]];
        szB1 *= B->lens[idx_arr[3*i+1]];
      } else if (idx_arr[3*i+0] != -1 &&
                 idx_arr[3*i+1] == -1 &&
                 idx_arr[3*i+2] != -1){
        szA1 *= A->lens[idx_arr[3*i+0]];
      } else if (idx_arr[3*i+0] == -1 &&
                 idx_arr[3*i+1] != -1 &&
                 idx_arr[3*i+2] != -1){
        szB1 *= B->lens[idx_arr[3*i+1]];
      }
    }
    // ...
    A_sz = std::max(A_sz, std::min(szA1, szA2));
    // ...
    B_sz = std::max(B_sz, std::min(szB1, szB2));
    // ...
    int * lensX = (int*)alloc(sizeof(int)*(X->order+1));
    int * symX = (int*)alloc(sizeof(int)*(X->order+1));
    int * nidxX = (int*)alloc(sizeof(int)*(X->order));
    int * sidxX = (int*)alloc(sizeof(int)*(X->order+1));
    int * cidxX = (int*)alloc(sizeof(int)*(X->order+1));
    for (int i=0; i<X->order; i++){
      // ...
      lensX[i] = X->lens[i];
      symX[i] = X->sym[i];
      // ...
      cidxX[i] = idx_X[i];
      // ...
      lensX[i+1] = X->lens[i];
      symX[i+1] = X->sym[i];
      // ...
      cidxX[i+1] = idx_X[i];
      // ...
    }
    // ...
    lensX[iX] = lensX[iX+1];
    // ...
    sidxX[iX] = sidxX[iX+1];
    // ...
    cidxX[iX] = num_tot;
    char * nname = (char*)alloc(strlen(X->name) + 2);
    // ...
    strcpy(nname, X->name);
    // ...
    nc = new contraction(X2, cidxX, B, idx_B, alpha, C, idx_C, beta, func);
    nc->idx_B[iB] = num_tot;
    // ...
    nc = new contraction(A, idx_A, X2, cidxX, alpha, C, idx_C, beta, func);
    nc->idx_A[iA] = num_tot;
    // ...
  }
  // ...
  new_ctr.A = new tensor(A, 0, 0);
  // ...
  new_ctr.B = new_ctr.A;
  // ...
  new_ctr.B = new tensor(B, 0, 0);
  // ...
  new_ctr.C = new_ctr.A;
  // ...
  new_ctr.C = new_ctr.B;
  // ...
  new_ctr.C = new tensor(C, 0, 0);
  // ...
  ret = new_ctr.sym_contract();
  if (ret != SUCCESS) return ret;
  if (was_home_A) new_ctr.A->unfold();
  if (was_home_B && A != B) new_ctr.B->unfold();
  if (was_home_C) new_ctr.C->unfold();
  // ...
  if (was_home_C && !new_ctr.C->is_home){
    // ...
    DPRINTF(2, "Migrating tensor %s back to home\n", C->name);
    // ...
  } else if (was_home_C) {
    // ...
  }
  if (new_ctr.A != new_ctr.C){
    if (was_home_A && !new_ctr.A->is_home){
      // ...
    } else if (was_home_A) {
      // ...
    }
  }
  if (new_ctr.B != new_ctr.A && new_ctr.B != new_ctr.C){
    if (was_home_B && A != B && !new_ctr.B->is_home){
      // ...
    } else if (was_home_B && A != B) {
      // ...
    }
  }
  // ...
#endif
}
bool contraction::need_prescale_operands(){
  int num_tot, * idx_arr;
  // ...
       &num_tot, &idx_arr);
  // ...
  int * idx_T, * idx_V;
  // ...
  for (int iT=0; iT<T->order; iT++){
    // ...
    int iV = idx_arr[3*i+fV];
    // ...
    int iiC = idx_arr[3*i+2];
    ASSERT(iT == idx_arr[3*i+fT]);
    // ...
    while (T->sym[iT+npres] == SY && iiC == -1 &&
           ( (iV == -1 && iiV == -1) ||
             (iV != -1 && iiV != -1 && V->sym[iiV] == SY)
           )){
      // ...
      int ii = idx_T[iT+npres];
      iiV = idx_arr[3*ii+fV];
      iiC = idx_arr[3*ii+2];
    }
    if (T->sym[iT+npres] == NS){
      // ...
          ( (iV == -1 && iiV == -1) ||
            (iV != -1 && iiV != -1 && V->sym[iiV] == NS)
          )
      // ...
    }
  }
  // ...
  for (int iV=0; iV<V->order; iV++){
    // ...
    int iiT = idx_arr[3*i+fT];
    int iiC = idx_arr[3*i+2];
    ASSERT(iV == idx_arr[3*i+fV]);
    // ...
    while (V->sym[iV+npres] == SY && iiC == -1 && iiT == -1){
      // ...
      int ii = idx_V[iV+npres];
      iiT = idx_arr[3*ii+fT];
      iiC = idx_arr[3*i+2];
    }
    if (V->sym[iV+npres] == NS && iiC == -1 && iiT == -1){
      // ...
    }
  }
  // ...
}
void contraction::prescale_operands(){
  int num_tot, * idx_arr;
  // ...
       &num_tot, &idx_arr);
  // ...
  int * idx_T, * idx_V;
  // ...
  for (int iT=0; iT<T->order; iT++){
    // ...
    int iV = idx_arr[3*i+fV];
    // ...
    int iiC = idx_arr[3*i+2];
    ASSERT(iT == idx_arr[3*i+fT]);
    // ...
    while (T->sym[iT+npres] == SY && iiC == -1 &&
           ( (iV == -1 && iiV == -1) ||
             (iV != -1 && iiV != -1 && V->sym[iiV] == SY)
           )){
      // ...
      int ii = idx_T[iT+npres];
      iiV = idx_arr[3*ii+fV];
      iiC = idx_arr[3*ii+2];
    }
    if (T->sym[iT+npres] == NS){
      // ...
          ( (iV == -1 && iiV == -1) ||
            (iV != -1 && iiV != -1 && V->sym[iiV] == NS)
          )
      // ...
    }
    // ...
    int sym_mask[T->order];
    std::fill(sym_mask, sym_mask+T->order, 0);
    std::fill(sym_mask+iT, sym_mask+iT+npres, 1);
    // ...
    T->scale_diagonals(sym_mask);
    // ...
    iT += std::max(npres-1, 0);
  }
  // ...
  for (int iV=0; iV<V->order; iV++){
    // ...
    int iiT = idx_arr[3*i+fT];
    int iiC = idx_arr[3*i+2];
    ASSERT(iV == idx_arr[3*i+fV]);
    // ...
    while (V->sym[iV+npres] == SY && iiC == -1 && iiT == -1){
      // ...
      int ii = idx_V[iV+npres];
      iiT = idx_arr[3*ii+fT];
      iiC = idx_arr[3*i+2];
    }
    if (V->sym[iV+npres] == NS && iiC == -1 && iiT == -1){
      // ...
    }
    // ...
    int sym_mask[V->order];
    std::fill(sym_mask, sym_mask+V->order, 0);
    std::fill(sym_mask+iV, sym_mask+iV+npres, 1);
    // ...
    V->scale_diagonals(sym_mask);
    // ...
    iV += std::max(npres-1, 0);
  }
  // ...
}
    void contraction::print(){
      int i;
      // ... (global_comm and the cname buffer are set up here)
      MPI_Barrier(global_comm.cm);
      if (global_comm.rank == 0){
        // ...
        sprintf(cname, "%s", C->name);
        sprintf(cname+strlen(cname), "[");
        for (i=0; i<C->order; i++){
          if (i > 0)
            sprintf(cname+strlen(cname), " %d", idx_C[i]);
          else
            sprintf(cname+strlen(cname), "%d", idx_C[i]);
        }
        sprintf(cname+strlen(cname), "] <- ");
        sprintf(cname+strlen(cname), "%s", A->name);
        sprintf(cname+strlen(cname), "[");
        for (i=0; i<A->order; i++){
          if (i > 0)
            sprintf(cname+strlen(cname), " %d", idx_A[i]);
          else
            sprintf(cname+strlen(cname), "%d", idx_A[i]);
        }
        sprintf(cname+strlen(cname), "]*");
        sprintf(cname+strlen(cname), "%s", B->name);
        sprintf(cname+strlen(cname), "[");
        for (i=0; i<B->order; i++){
          if (i > 0)
            sprintf(cname+strlen(cname), " %d", idx_B[i]);
          else
            sprintf(cname+strlen(cname), "%d", idx_B[i]);
        }
        sprintf(cname+strlen(cname), "]");
        printf("CTF: Contraction %s\n", cname);
        // ...
      }
    }