#include "../scaling/strp_tsr.h"
#include "../mapping/mapping.h"
#include "../mapping/distribution.h"
#include "../tensor/untyped_tensor.h"
#include "../shared/util.h"
#include "../shared/memcontrol.h"
#include "../symmetry/sym_indices.h"
#include "../symmetry/symmetrization.h"
#include "../redistribution/nosym_transp.h"
#include "../redistribution/redist.h"
#include "../scaling/scaling.h"

  // ...
  memcpy(idx_A, other.idx_A, sizeof(int)*other.A->order);
  // ...
  memcpy(idx_B, other.idx_B, sizeof(int)*other.B->order);
  idx_A = (int*)alloc(sizeof(int)*A->order);
  idx_B = (int*)alloc(sizeof(int)*B->order);
  // ...
  memcpy(idx_A, idx_A_, sizeof(int)*A->order);
  memcpy(idx_B, idx_B_, sizeof(int)*B->order);
  // ...
  conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
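// The fragments above are from the summation constructors: the copy constructor and the
// int-index overloads deep-copy idx_A/idx_B, while the char-index overloads convert their
// index strings through conv_idx. The same allocation/copy pattern repeats below for the
// overloads that also accept a univar_function.
//
// Illustrative only (not part of this file): a minimal sketch, with hypothetical tensor
// names and extents, of how a user-level sum reaches this class through the typed CTF API.
//
//   CTF::World dw(MPI_COMM_WORLD);
//   int lens[] = {4, 4};
//   int syms[] = {NS, NS};
//   CTF::Tensor<> A(2, lens, syms, dw), B(2, lens, syms, dw);
//   B["ij"] += A["ij"];   // builds a CTF_int::summation internally and calls execute()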
  idx_A = (int*)alloc(sizeof(int)*A->order);
  idx_B = (int*)alloc(sizeof(int)*B->order);
  // ...
  memcpy(idx_A, idx_A_, sizeof(int)*A->order);
  memcpy(idx_B, idx_B_, sizeof(int)*B->order);
  // ...
  conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
#if (DEBUG >= 2 || VERBOSE >= 1)
  if (A->wrld->cdt.rank == 0) printf("Summation::execute (head):\n");
  // ...
#endif
  int stat = home_sum_tsr(run_diag);
  // ...
    printf("CTF ERROR: Failed to perform summation\n");
void summation::get_fold_indices(int * num_fold, int ** fold_idx){
  int i, in, num_tot, nfold, broken;
  int iA, iB, inA, inB, iiA, iiB;
  int * idx_arr, * idx;
  // ...
  for (i=0; i<num_tot; i++){
    // ...
  for (iA=0; iA<A->order; iA++){
    // ...
        inB = idx_arr[2*in+1];
        if (((inA>=0) + (inB>=0) != 2) ||
            (iB != -1 && inB - iB != in-i)  ||
            (iB != -1 && A->sym[inA] != B->sym[inB])){
          // ...
        }
      } while (A->sym[inA-1] != NS);
      for (iiA=iA; iiA<inA; iiA++){
        // ...
  for (iB=0; iB<B->order; iB++){
    // ...
        inA = idx_arr[2*in+0];
        if (((inB>=0) + (inA>=0) != 2) ||
            (iA != -1 && inA - iA != in-i)  ||
            (iA != -1 && B->sym[inB] != A->sym[inA])){
          // ...
        }
      } while (B->sym[inB-1] != NS);
      for (iiB=iB; iiB<inB; iiB++){
        // ...
  for (i=0; i<num_tot; i++){
    // ...
int summation::can_fold(){
  int i, j, nfold, * fold_idx;
  // ...
  if (A->is_sparse || B->is_sparse) return 0;
  // ...
  for (i=0; i<A->order; i++){
    for (j=i+1; j<A->order; j++){
      if (idx_A[i] == idx_A[j]) return 0;
    }
  }
  for (i=0; i<B->order; i++){
    for (j=i+1; j<B->order; j++){
      if (idx_B[i] == idx_B[j]) return 0;
    }
  }
  get_fold_indices(&nfold, &fold_idx);
  // ...
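// can_fold() gates the folded (inner-blocked) code path: it refuses to fold if either
// operand is sparse or if an index letter is repeated within idx_A or idx_B, and otherwise
// consults get_fold_indices() to count the foldable index groups.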
void summation::get_fold_sum(summation *& fold_sum,
                             /* ... remaining output parameters elided ... */){
  int * fold_idx, * fidx_map_A, * fidx_map_B;
  tensor * ftsr_A, * ftsr_B;
  // ...
  get_fold_indices(&nfold, &fold_idx);
  // ...
  A->fold(nfold, fold_idx, idx_A, &all_fdim_A, &all_flen_A);
  B->fold(nfold, fold_idx, idx_B, &all_fdim_B, &all_flen_B);
  // ...
  for (i=0; i<A->order; i++){
    for (j=0; j<nfold; j++){
      if (A->sym[i] == NS && idx_A[i] == fold_idx[j]){
        // ...
  for (i=0; i<B->order; i++){
    for (j=0; j<nfold; j++){
      if (B->sym[i] == NS && idx_B[i] == fold_idx[j]){
        // ...
  int * sidx_A, * sidx_B;
  // ...
           ftsr_B->order, fidx_map_B, &sidx_B);
  // ...
  fold_sum = new summation(A->rec_tsr, sidx_A, alpha, B->rec_tsr, sidx_B, beta);
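// get_fold_sum() folds both operands (tensor::fold) and then builds a new summation over
// their folded representations A->rec_tsr and B->rec_tsr, reusing alpha/beta but with the
// remapped index arrays sidx_A/sidx_B for the folded ordering.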
int summation::map_fold(){
  int i, all_fdim_A, all_fdim_B;
  int * fnew_ord_A, * fnew_ord_B;
  int * all_flen_A, * all_flen_B;
  // ...
  get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
  // ...
  if (A->wrld->rank == 0){
    printf("Folded summation type:\n");
    // ...
  }
  // ...
  fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B);
  // ...
  for (i=0; i<fold_sum->A->order; i++){
    // ...
double summation::est_time_fold(){
  int all_fdim_A, all_fdim_B;
  int * fnew_ord_A, * fnew_ord_B;
  int * all_flen_A, * all_flen_B;
  int * tAiord, * tBiord;
  // ...
  get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
  fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B);
  // ...
  memcpy(tAiord, A->inner_ordering, all_fdim_A*sizeof(int));
  memcpy(tBiord, B->inner_ordering, all_fdim_B*sizeof(int));
  // ...
  double esttime = 0.0;
  // ...
  esttime += A->calc_nvirt()*est_time_transp(all_fdim_A, tAiord, all_flen_A, 1, A->sr);
  esttime += 2.*B->calc_nvirt()*est_time_transp(all_fdim_B, tBiord, all_flen_B, 1, B->sr);
  // ...
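// est_time_fold() prices the folded path with the transpose performance model: the cost is
// est_time_transp() for reordering each operand's folded dimensions into the new inner
// ordering, weighted by the tensor's virtualization factor (calc_nvirt), with the B term
// doubled.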
void summation::get_len_ordering(int ** new_ordering_A,
                                 int ** new_ordering_B){
  int * ordering_A, * ordering_B, * idx_arr;
  // ...
  ASSERT(num_tot == A->order);
  ASSERT(num_tot == B->order);
  // ...
  for (int i=0; i<num_tot; i++){
    // ...
    for (int j=0; j<num_tot; j++){
      if (idx_A[j] == idx_B[i])
        // ...
  *new_ordering_A = ordering_A;
  *new_ordering_B = ordering_B;
}
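// get_len_ordering() produces matching traversal orders for the folded operands: the two
// ASSERTs require every index to appear in both A and B, and for each position of B the
// inner loop locates the position of A carrying the same index, so both folded tensors can
// be iterated with a common linearization.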
tspsum * summation::construct_sparse_sum(int const * phys_mapped){
  int nvirt, i, iA, iB, order_tot, is_top, need_rep;
  int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
  // ...
  int * virt_blk_len_A, * virt_blk_len_B;
  int * blk_len_A, * blk_len_B;
  // ...
  tspsum * htsum = NULL, ** rec_tsum = NULL;
  // ...
          &order_tot, &idx_arr);
  // ...
  nphys_dim = A->topo->order;
  // ...
  calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
           &vrt_sz_A, virt_blk_len_A, blk_len_A);
  calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
           &vrt_sz_B, virt_blk_len_B, blk_len_B);
  // ...
  for (i=0; i<order_tot; i++){
    // ...
      map = &A->edge_map[iA];
      // ...
        virt_dim[i] = map->np;
      // ...
      else virt_dim[i] = 1;
    // ...
      map = &B->edge_map[iB];
      // ...
        virt_dim[i] = map->np;
      // ...
      else virt_dim[i] = 1;
    // ...
    nvirt *= virt_dim[i];
  }
  // ...
  if (A->wrld->np > 1){
    // ...
  if (B->wrld->np > 1){
    // ...
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[2*i+0] == 0 ||
        phys_mapped[2*i+1] == 0){
      // ...
  int * new_sym_A, * new_sym_B;
  // ...
  memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
  // ...
  memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
  // ...
  tsumseq->sym_A = new_sym_A;
  // ...
  tsumseq->sym_B = new_sym_B;
  // ...
    tsumseq->func = func;
  } else tsumseq->func = NULL;
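// construct_sparse_sum() builds the tspsum recursion for sparse operands. For every index,
// the virtualization factor virt_dim[i] is taken from the physical phase (map->np) of the
// owning edge map, or 1 if the index is not mapped, and nvirt is the product of these
// factors. Physical dimensions on which only one of A or B has an index mapped
// (phys_mapped[2*i+{0,1}] == 0) are detected afterwards; the need_rep flag appears to track
// this so that a replication step can be inserted into the recursion.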
tsum * summation::construct_dense_sum(int inner_stride,
                                      int const * phys_mapped){
  int i, iA, iB, order_tot, is_top, sA, sB, need_rep, i_A, i_B, j, k;
  int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
  int nphys_dim, nvirt;
  int * idx_arr, * virt_dim;
  int * virt_blk_len_A, * virt_blk_len_B;
  int * blk_len_A, * blk_len_B;
  tsum * htsum = NULL, ** rec_tsum = NULL;
  // ...
          &order_tot, &idx_arr);
  // ...
  nphys_dim = A->topo->order;
  // ...
  calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
           &vrt_sz_A, virt_blk_len_A, blk_len_A);
  calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
           &vrt_sz_B, virt_blk_len_B, blk_len_B);
  // ...
  sA = strip_diag(A->order, order_tot, idx_A, vrt_sz_A,
                  A->edge_map, A->topo, A->sr,
                  blk_len_A, &blk_sz_A, &str_A);
  sB = strip_diag(B->order, order_tot, idx_B, vrt_sz_B,
                  B->edge_map, B->topo, B->sr,
                  blk_len_B, &blk_sz_B, &str_B);
  // ...
  if (A->wrld->cdt.rank == 0)
    DPRINTF(1, "Stripping tensor\n");
  // ...
  for (i=0; i<order_tot; i++){
    // ...
      map = &A->edge_map[iA];
      // ...
        virt_dim[i] = map->np;
        if (sA) virt_dim[i] = virt_dim[i]/str_A->strip_dim[iA];
      // ...
      else virt_dim[i] = 1;
    // ...
      map = &B->edge_map[iB];
      // ...
        virt_dim[i] = map->np;
        if (sB) virt_dim[i] = virt_dim[i]/str_B->strip_dim[iA];
      // ...
      else virt_dim[i] = 1;
    // ...
    nvirt *= virt_dim[i];
  }
  // ...
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[2*i+0] == 0 ||
        phys_mapped[2*i+1] == 0){
      // ...
  int * new_sym_A, * new_sym_B;
  // ...
  memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
  // ...
  memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
  // ...
  if (inner_stride == -1){
    // ...
    for (i=0; i<A->order; i++){
      if (A->sym[i] == NS){
        for (j=0; j<itsr->order; j++){
          if (A->inner_ordering[j] == i_A){
            // ...
        } while (j>=0 && A->sym[j] != NS);
        for (k=j+1; k<=i; k++){
          virt_blk_len_A[k] = 1;
          // ...
    for (i=0; i<B->order; i++){
      if (B->sym[i] == NS){
        for (j=0; j<itsr->order; j++){
          if (B->inner_ordering[j] == i_B){
            // ...
        } while (j>=0 && B->sym[j] != NS);
        for (k=j+1; k<=i; k++){
          virt_blk_len_B[k] = 1;
          // ...
  tsumseq->sym_A = new_sym_A;
  // ...
  tsumseq->sym_B = new_sym_B;
  // ...
    tsumseq->func = func;
  } else tsumseq->func = NULL;
tsum * summation::construct_sum(int inner_stride){
  // ...
  nphys_dim = A->topo->order;
  // ...
  memset(phys_mapped, 0, sizeof(int)*nphys_dim*2);
  // ...
  for (i=0; i<A->order; i++){
    map = &A->edge_map[i];
    // ...
      phys_mapped[2*map->cdt+0] = 1;
    // ...
      phys_mapped[2*map->cdt+0] = 1;
  // ...
  for (i=0; i<B->order; i++){
    map = &B->edge_map[i];
    // ...
      phys_mapped[2*map->cdt+1] = 1;
    // ...
      phys_mapped[2*map->cdt+1] = 1;
  // ...
  if (A->is_sparse || B->is_sparse){
    htsum = construct_sparse_sum(phys_mapped);
  } else {
    htsum = construct_dense_sum(inner_stride, phys_mapped);
  }
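// construct_sum() is the dispatcher: it records, per physical topology dimension, which
// operands have an index mapped onto it (phys_mapped[2*cdt+0] for A, 2*cdt+1 for B), then
// hands that table to construct_sparse_sum() when either operand is sparse and to
// construct_dense_sum() otherwise.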
int summation::home_sum_tsr(bool run_diag){
  int ret, was_home_A, was_home_B;
  tensor * tnsr_A, * tnsr_B;
  // ...
#ifndef HOME_CONTRACT
  ret = sym_sum_tsr(run_diag);
  // ...
  ret = sum_tensors(run_diag);
  // ...
  if (A->has_zero_edge_len ||
      B->has_zero_edge_len){
    if (!B->sr->isequal(beta,B->sr->mulid()) && !B->has_zero_edge_len){
      int sub_idx_map_B[B->order];
      // ...
      for (int i=0; i<B->order; i++){
        sub_idx_map_B[i]=sm_idx;
        // ...
        for (int j=0; j<i; j++){
          if (idx_B[i]==idx_B[j]){
            sub_idx_map_B[i]=sub_idx_map_B[j];
            // ...
  was_home_A = A->is_home;
  was_home_B = B->is_home;
  // ...
    tnsr_A = new tensor(A,0,0);
    tnsr_A->data = A->data;
    // ...
    tnsr_A->topo = A->topo;
    // ...
  } else tnsr_A = NULL;
  // ...
    tnsr_B = new tensor(B,0,0);
    tnsr_B->data = B->data;
    // ...
    tnsr_B->topo = B->topo;
    // ...
  } else tnsr_B = NULL;
  // ...
  if (A->wrld->cdt.rank == 0)
    printf("Start head sum:\n");
  // ...
  ret = osum.sym_sum_tsr(run_diag);
  // ...
  if (A->wrld->cdt.rank == 0)
    printf("End head sum:\n");
  // ...
  if (ret != SUCCESS) return ret;
  if (was_home_A) tnsr_A->unfold();
  // ...
    B->nnz_blk[i] = tnsr_B->nnz_blk[i];
  // ...
    B->data = tnsr_B->data;
  // ...
  if (was_home_B && !tnsr_B->is_home){
    if (A->wrld->cdt.rank == 0)
      DPRINTF(1, "Migrating tensor %s back to home\n", B->name);
    // ...
    B->redistribute(odst);
    // ...
    B->sr->copy(B->home_buffer, B->data, B->size);
    B->sr->dealloc(B->data);
    B->data = B->home_buffer;
    // ...
  } else if (was_home_B){
    // ...
    if (tnsr_B->data != B->data){
      printf("Tensor %s is a copy of %s and did not leave home but buffer is %p was %p\n",
             tnsr_B->name, B->name, tnsr_B->data, B->data);
      // ...
    B->data = tnsr_B->data;
    // ...
  if (was_home_A && !tnsr_A->is_home){
    // ...
  } else if (was_home_A) {
    // ...
    A->data = tnsr_A->data;
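// home_sum_tsr() wraps the summation in the "home buffer" protocol: working copies
// tnsr_A/tnsr_B alias the operands' data and topology, the sum is executed on them through
// sym_sum_tsr(), and afterwards B is either redistributed back to its home mapping and
// copied into home_buffer, or simply re-aliased if it never left home; A's copy is unfolded
// and its data pointer restored in the same way.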
int summation::sym_sum_tsr(bool run_diag){
  int sidx, i, nst_B, * new_idx_map;
  int * map_A, * map_B;
  int ** dstack_map_B;
  tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
  std::vector<summation> perm_types;
  std::vector<int> signs;
  // ...
  bool is_cons = check_consistency();
  if (!is_cons) return ERROR;
  // ...
  if (A->has_zero_edge_len || B->has_zero_edge_len){
    if (!B->sr->isequal(beta, B->sr->mulid()) && !B->has_zero_edge_len){
      int sub_idx_map_B[B->order];
      // ...
      for (int i=0; i<B->order; i++){
        sub_idx_map_B[i]=sm_idx;
        // ...
        for (int j=0; j<i; j++){
          if (idx_B[i]==idx_B[j]){
            sub_idx_map_B[i]=sub_idx_map_B[j];
            // ...
  int * new_idx_A, * new_idx_B;
  if (!is_custom || func->is_distributive){
    tensor * new_tsr_A = A->self_reduce(idx_A, &new_idx_A, B->order, idx_B, &new_idx_B);
    if (new_tsr_A != A) {
      summation s(new_tsr_A, new_idx_A, alpha, B, new_idx_B, beta, func);
      // ...
  memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
  memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
  // ...
    if (tnsr_A != A) delete tnsr_A;
    // ...
    map_A = new_idx_map;
    // ...
    dstack_map_B[nst_B] = map_B;
    dstack_tsr_B[nst_B] = tnsr_B;
    // ...
    map_B = new_idx_map;
  // ...
  memcpy(new_sum.idx_A, map_A, sizeof(int)*tnsr_A->order);
  memcpy(new_sum.idx_B, map_B, sizeof(int)*tnsr_B->order);
  if (tnsr_A == tnsr_B){
    // ...
    new_sum.A = &nnew_tsr;
    // ...
    new_sum.sym_sum_tsr(run_diag);
    // ...
  if (ocfact != 1 || sign != 1){
    // ...
    for (int i=0; i<ocfact; i++){
      tnsr_B->sr->add(new_alpha, alpha, new_alpha);
      // ...
  if (new_sum.unfold_broken_sym(NULL) != -1){
    if (A->wrld->cdt.rank == 0)
      DPRINTF(1, "Permutational symmetry is broken\n");
    // ...
    sidx = new_sum.unfold_broken_sym(&unfold_sum);
    // ...
    int sidx2 = unfold_sum->unfold_broken_sym(NULL);
    if (sidx%2 == 0 && (A->sym[sidx/2] == SY || unfold_sum->A->sym[sidx/2] == SY)) sy = 1;
    if (sidx%2 == 1 && (B->sym[sidx/2] == SY || unfold_sum->B->sym[sidx/2] == SY)) sy = 1;
    if (!A->is_sparse && !B->is_sparse && (sidx2 != -1 ||
        // ...
      if (unfold_sum->A->sym[sidx/2] == NS){
        if (A->wrld->cdt.rank == 0)
          DPRINTF(1, "Performing operand desymmetrization for summation of A idx=%d\n", sidx/2);
        // ...
        if (A->wrld->cdt.rank == 0)
          DPRINTF(1, "Performing operand symmetrization for summation\n");
        // ...
        unfold_sum->sym_sum_tsr(run_diag);
        // ...
        if (tnsr_A != unfold_sum->A){
          // ...
          delete unfold_sum->A;
          // ...
        if (A->wrld->cdt.rank == 0)
          DPRINTF(1, "Performing product desymmetrization for summation\n");
        // ...
        unfold_sum->sym_sum_tsr(run_diag);
        if (A->wrld->cdt.rank == 0)
          DPRINTF(1, "Performing product symmetrization for summation\n");
        // ...
        int sidx_B[tnsr_B->order];
        for (int iis=0; iis<tnsr_B->order; iis++){
          // ...
        if (tnsr_B != unfold_sum->B){
          // ...
          delete unfold_sum->B;
          // ...
      if (sidx != -1 && sidx%2 == 1){
        delete unfold_sum->B;
      } else if (sidx != -1 && sidx%2 == 0){
        delete unfold_sum->A;
        // ...
      if (A->wrld->cdt.rank == 0)
        DPRINTF(1, "Performing %d summation permutations\n",
                (int)perm_types.size());
      // ...
      tensor * inv_tsr_A = NULL;
      bool need_inv = false;
      // ...
      if (tnsr_B->sr->mulid() == NULL){
        for (i=0; i<(int)perm_types.size(); i++){
          // ...
          inv_tsr_A = new tensor(tnsr_A);
          // ...
      for (i=0; i<(int)perm_types.size(); i++){
        // ...
        if (signs[i] == -1 && need_inv){
          perm_types[i].A = inv_tsr_A;
          // ...
          perm_types[i].alpha = new_alpha;
          // ...
        perm_types[i].beta = dbeta;
        perm_types[i].sum_tensors(run_diag);
        // ...
    new_sum.alpha = alpha;
    // ...
  if (tnsr_A != A) delete tnsr_A;
  for (i=nst_B-1; i>=0; i--){
    // ...
    dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
    // ...
    tnsr_B = dstack_tsr_B[i];
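// sym_sum_tsr() handles permutational symmetry. Zero-length edges short-circuit to a
// scaling of B; self_reduce() may pre-reduce A over indices absent from B; repeated indices
// are peeled off onto a stack of diagonal-extracted tensors. If unfold_broken_sym() reports
// a broken symmetry, dense operands are desymmetrized or symmetrized and the sum recurses;
// otherwise the signed index permutations collected in perm_types/signs (via get_sym_perms)
// are each executed through sum_tensors(), with alpha adjusted by the overcounting factor
// and the permutation sign.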
int summation::sum_tensors(bool run_diag){
  int stat, * new_idx_map;
  int * map_A, * map_B;
  // ...
  int ** dstack_map_B;
  tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
  // ...
  check_consistency();
  // ...
  if (A->has_zero_edge_len || B->has_zero_edge_len){
    if (!B->sr->isequal(beta,B->sr->mulid()) && !B->has_zero_edge_len){
      // ...
      int sub_idx_map_B[B->order];
      // ...
      for (int i=0; i<B->order; i++){
        sub_idx_map_B[i]=sm_idx;
        // ...
        for (int j=0; j<i; j++){
          if (idx_B[i]==idx_B[j]){
            sub_idx_map_B[i]=sub_idx_map_B[j];
            // ...
  memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
  memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
  // ...
    if (tnsr_A != A) delete tnsr_A;
    // ...
    map_A = new_idx_map;
    // ...
    dstack_map_B[nst_B] = map_B;
    dstack_tsr_B[nst_B] = tnsr_B;
    // ...
    map_B = new_idx_map;
  // ...
  tensor * stnsr_A = tnsr_A;
  tnsr_A = new tensor(stnsr_A);
  // ...
  if (A != stnsr_A) delete stnsr_A;
  // ...
  if (tnsr_A == tnsr_B){
    // ...
    new_sum.A = nnew_tsr;
    // ...
  int order_A, order_B, i;
  int * edge_len_A, * edge_len_B;
  int * sym_A, * sym_B;
  stat = allread_tsr(ntid_A, &nsA, &sA);
  // ...
  stat = allread_tsr(ntid_B, &nsB, &sB);
  // ...
  if (new_sum.check_mapping() == 0) {
    // ...
    if (A->wrld->cdt.rank == 0){
      printf("Remapping tensors for sum:\n");
      // ...
    stat = new_sum.map();
    if (stat == ERROR) {
      printf("Failed to map tensors to physical grid\n");
      // ...
    if (A->wrld->cdt.rank == 0){
      printf("Keeping mappings:\n");
      // ...
  ASSERT(new_sum.check_mapping());
  // ...
  if (is_custom == false && new_sum.can_fold()){
    // ...
    double est_time_nofold = 4.*(A->size + B->size)*COST_MEMBW;
    if (new_sum.est_time_fold() + (A->size + B->size)*COST_MEMBW < est_time_nofold){
      // ...
      inner_stride = new_sum.map_fold();
      // ...
      sumf = new_sum.construct_sum(inner_stride);
      // ...
      sumf = new_sum.construct_sum();
      // ...
    if (A->wrld->cdt.rank == 0){
      printf("Could not fold summation, is_custom = %d, new_sum.can_fold = %d\n",
             is_custom, new_sum.can_fold());
      // ...
    sumf = new_sum.construct_sum();
    // ...
    sumf = new_sum.construct_sum();
  DEBUG_PRINTF("[%d] performing tensor sum\n", A->wrld->cdt.rank);
  // ...
  stat = allread_tsr(ntid_A, &nA, &uA);
  // ...
  stat = get_info(ntid_A, &order_A, &edge_len_A, &sym_A);
  // ...
  stat = allread_tsr(ntid_B, &nB, &uB);
  // ...
  stat = get_info(ntid_B, &order_B, &edge_len_B, &sym_B);
  // ...
  if (nsA != nA) { printf("nsA = %" PRId64 ", nA = %" PRId64 "\n", nsA, nA); ABORT; }
  if (nsB != nB) { printf("nsB = %" PRId64 ", nB = %" PRId64 "\n", nsB, nB); ABORT; }
  for (i=0; (int64_t)i<nA; i++){
    if (fabs(uA[i] - sA[i]) > 1.E-6){
      printf("A[i] = %lf, sA[i] = %lf\n", uA[i], sA[i]);
      // ...
  cpy_sym_sum(alpha, uA, order_A, edge_len_A, edge_len_A, sym_A, map_A,
              beta, sB, order_B, edge_len_B, edge_len_B, sym_B, map_B);
  // ...
  for (i=0; (int64_t)i<nB; i++){
    if (fabs(uB[i] - sB[i]) > 1.E-6){
      printf("B[%d] = %lf, sB[%d] = %lf\n", i, uB[i], i, sB[i]);
      // ...
    assert(fabs(sB[i] - uB[i]) < 1.E-6);
    // ...
  for (int i=nst_B-1; i>=0; i--){
    int ret = dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
    // ...
    tnsr_B = dstack_tsr_B[i];
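// sum_tensors() is the mapped, dense execution path: it verifies or repairs the mapping
// (check_mapping/map), folds the operands when the est_time_fold() model beats the unfolded
// estimate 4*(A->size + B->size)*COST_MEMBW, constructs the tsum object with
// construct_sum(), and runs it. The allread_tsr/cpy_sym_sum block appears to be a
// debug-only check that compares the distributed result element-wise against a serial
// reference sum.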
int summation::unfold_broken_sym(summation ** nnew_sum){
  int sidx, i, num_tot, iA, iA2, iB;
  // ...
  if (nnew_sum != NULL){
    // ...
    *nnew_sum = new_sum;
  } else new_sum = NULL;
  // ...
          &num_tot, &idx_arr);
  // ...
  for (i=0; i<A->order; i++){
    if (A->sym[i] != NS){
      // ...
      if (idx_arr[2*iA+1] != -1){
        if (B->sym[idx_arr[2*iA+1]] == NS ||
            ((B->sym[idx_arr[2*iA+1]] == AS) ^ (A->sym[i] == AS)) ||
            idx_arr[2*idx_A[i+1]+1] == -1 ||
            idx_A[i+1] != idx_B[idx_arr[2*iA+1]+1]){
          // ...
      } else if (idx_arr[2*idx_A[i+1]+1] != -1){
        // ...
  for (i=0; i<B->order; i++){
    if (B->sym[i] != NS){
      // ...
      if (idx_arr[2*iB+0] != -1){
        if (A->sym[idx_arr[2*iB+0]] == NS ||
            ((A->sym[idx_arr[2*iB+0]] == AS) ^ (B->sym[i] == AS)) ||
            idx_arr[2*idx_B[i+1]+0] == -1 ||
            idx_B[i+1] != idx_A[idx_arr[2*iB+0]+1]){
          // ...
      } else if (A->sym[idx_arr[2*iB+0]] == NS){
        // ...
      } else if (idx_arr[2*idx_B[i+1]+0] != -1){
        // ...
  for (i=0; i<A->order; i++){
    if (A->sym[i] == SY){
      // ...
      if (idx_arr[2*iA+1] == -1 &&
          idx_arr[2*iA2+1] == -1){
        // ...
  if (nnew_sum != NULL && sidx != -1){
    // ...
    new_sum->A = new tensor(A, 0, 0);
    int nA_sym[A->order];
    memcpy(nA_sym, new_sum->A->sym, sizeof(int)*new_sum->A->order);
    nA_sym[sidx/2] = bsym;
    // ...
    new_sum->B = new tensor(B, 0, 0);
    int nB_sym[B->order];
    memcpy(nB_sym, new_sum->B->sym, sizeof(int)*new_sum->B->order);
    nB_sym[sidx/2] = NS;
    // ...
bool summation::check_consistency(){
  int i, num_tot, len;
  // ...
          &num_tot, &idx_arr);
  // ...
  for (i=0; i<num_tot; i++){
    // ...
    iA = idx_arr[2*i+0];
    iB = idx_arr[2*i+1];
    // ...
    if (len != -1 && iB != -1 && len != B->lens[iB]){
      if (A->wrld->cdt.rank == 0){
        printf("i = %d Error in sum call: The %dth edge length (%d) of tensor %s does not",
               i, iA, len, A->name);
        printf("match the %dth edge length (%d) of tensor %s.\n",
               iB, B->lens[iB], B->name);
        // ...
int summation::is_equal(summation const & os){
  // ...
  if (A != os.A) return 0;
  if (B != os.B) return 0;
  // ...
  for (i=0; i<A->order; i++){
    if (idx_A[i] != os.idx_A[i]) return 0;
  }
  for (i=0; i<B->order; i++){
    if (idx_B[i] != os.idx_B[i]) return 0;
  }
  // ...
int summation::check_mapping(){
  int i, pass, order_tot, iA, iB;
  int * idx_arr, * phys_map;
  // ...
  if (A->is_mapped == 0) pass = 0;
  if (B->is_mapped == 0) pass = 0;
  // ...
  if (A->is_folded == 1) pass = 0;
  if (B->is_folded == 1) pass = 0;
  // ...
  if (A->topo != B->topo) pass = 0;
  // ...
    DPRINTF(4, "failed confirmation here\n");
  // ...
  memset(phys_map, 0, sizeof(int)*A->topo->order);
  // ...
          &order_tot, &idx_arr);
  // ...
    DPRINTF(4, "failed confirmation here\n");
  // ...
  for (i=0; i<order_tot; i++){
    // ...
    iB = idx_arr[2*i+1];
    if (iA != -1 && iB != -1) {
      if (!comp_dim_map(&A->edge_map[iA], &B->edge_map[iB])){
        // ...
        DPRINTF(4, "failed confirmation here i=%d\n", i);
        // ...
    if (iA != -1 && iB == -1) {
      mapping * map = &A->edge_map[iA];
      // ...
        phys_map[map->cdt]++;
        // ...
    if (iB != -1 && iA == -1){
      mapping * map = &B->edge_map[iB];
      // ...
        phys_map[map->cdt]++;
        // ...
  for (i=0; i<A->topo->order; i++){
    if (phys_map[i] > 1) {
      // ...
      DPRINTF(3, "failed confirmation here i=%d\n", i);
      // ...
  if (B->sr->addid() == NULL || B->is_sparse){
    int ndim_mapped = 0;
    for (int j=0; j<B->order; j++){
      mapping * map = &B->edge_map[j];
      // ...
    if (ndim_mapped < B->topo->order) pass = 0;
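// check_mapping() accepts a mapping only if both tensors are mapped (and not folded) on the
// same topology, every index shared by A and B has compatible edge maps (comp_dim_map), and
// no physical dimension is claimed by more than one index that appears in only one operand
// (the phys_map[i] > 1 test). For sparse B, or an algstrct without an additive identity, it
// additionally requires B to use every dimension of its topology.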
int summation::map_sum_indices(topology const * topo){
  int tsr_order, isum, iA, iB, i, j, jsum, jX, stat;
  int * tsr_edge_len, * tsr_sym_table, * restricted;
  int * idx_arr, * idx_sum;
  int num_sum, num_tot, idx_num;
  // ...
          &num_tot, &idx_arr);
  // ...
  for (i=0; i<num_tot; i++){
    if (idx_arr[2*i] != -1 && idx_arr[2*i+1] != -1){
      idx_sum[num_sum] = i;
      // ...
  tsr_order = num_sum;
  // ...
  memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
  memset(restricted, 0, tsr_order*sizeof(int));
  // ...
  for (i=0; i<tsr_order; i++){
    // ...
  for (i=0; i<num_sum; i++){
    // ...
    iA = idx_arr[isum*2+0];
    iB = idx_arr[isum*2+1];
    // ...
    } else if (B->edge_map[iB].type != NOT_MAPPED){
      // ...
  for (i=0; i<num_sum; i++){
    // ...
    iA = idx_arr[isum*idx_num+0];
    iB = idx_arr[isum*idx_num+1];
    // ...
    tsr_edge_len[i] = A->pad_edge_len[iA];
    // ...
    if (A->sym[iA] != NS){
      for (j=0; j<num_sum; j++){
        // ...
        jX = idx_arr[jsum*idx_num+0];
        // ...
          tsr_sym_table[i*tsr_order+j] = 1;
          tsr_sym_table[j*tsr_order+i] = 1;
          // ...
    if (B->sym[iB] != NS){
      for (j=0; j<num_sum; j++){
        // ...
        jX = idx_arr[jsum*idx_num+1];
        // ...
          tsr_sym_table[i*tsr_order+j] = 1;
          tsr_sym_table[j*tsr_order+i] = 1;
          // ...
                   tsr_edge_len, tsr_sym_table,
                   // ...
  for (i=0; i<num_sum; i++){
    // ...
    iA = idx_arr[isum*idx_num+0];
    iB = idx_arr[isum*idx_num+1];
    // ...
  for (i=0; i<num_sum; i++){
    // ...
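// map_sum_indices() treats the indices shared by A and B as a standalone "index tensor": it
// collects them into idx_sum, gives each the padded edge length from A, builds a pairwise
// symmetry table from the operands' sym arrays, and then maps that tensor onto the topology
// (apparently via map_tensor, whose argument list is partially visible above) so that both
// operands receive consistent mappings for the summed indices.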
int summation::map(){
  int i, ret, need_remap;
  int need_remap_A, need_remap_B;
  // ...
  topology * old_topo_A, * old_topo_B;
  // ...
  ASSERT(A->wrld->cdt.cm == B->wrld->cdt.cm);
  World * wrld = A->wrld;
  // ...
  if (wrld->rank == 0)
    printf("Initial mappings:\n");
  A->print_map(stdout);
  B->print_map(stdout);
  // ...
  old_topo_A = A->topo;
  old_topo_B = B->topo;
  // ...
  int64_t min_size = INT64_MAX;
  // ...
  for (i=A->wrld->cdt.rank; i<2*(int)A->wrld->topovec.size(); i+=A->wrld->cdt.np){
    // ...
    else if (ret != SUCCESS) return ret;
    // ...
    else if (ret != SUCCESS) return ret;
    // ...
    ret = map_sum_indices(A->topo);
    // ...
    else if (ret != SUCCESS) return ret;
    // ...
    else if (ret != SUCCESS) return ret;
    // ...
    else if (ret != SUCCESS) return ret;
    ret = A->map_tensor_rem(A->topo->order,
                            // ...
    else if (ret != SUCCESS) return ret;
    // ...
                 idx_B, B->edge_map, 0);
    ret = B->map_tensor_rem(B->topo->order,
                            // ...
    else if (ret != SUCCESS) return ret;
    // ...
    else if (ret != SUCCESS) return ret;
    ret = B->map_tensor_rem(B->topo->order,
                            // ...
    else if (ret != SUCCESS) return ret;
    // ...
                 idx_A, A->edge_map, 0);
    ret = A->map_tensor_rem(A->topo->order,
                            // ...
    else if (ret != SUCCESS) return ret;
    // ...
    else if (ret != SUCCESS) return ret;
    // ...
    else if (ret != SUCCESS) return ret;
    // ...
    A->print_map(stdout,0);
    B->print_map(stdout,0);
    // ...
    if (!check_mapping()) continue;
    // ...
    size = A->size + B->size;
    // ...
    if (A->topo == old_topo_A){
      for (d=0; d<A->order; d++){
        // ...
      size += A->size*std::max(1.0,log2(wrld->cdt.np));
      // ...
      double nnz_frac_A = std::min(2,(int)A->calc_npe())*((double)A->nnz_tot)/(A->size*A->calc_npe());
      size += 25.*nnz_frac_A*A->size*std::max(1.0,log2(wrld->cdt.np));
      // ...
      size += 5.*A->size*std::max(1.0,log2(wrld->cdt.np));
      // ...
    if (B->topo == old_topo_B){
      for (d=0; d<B->order; d++){
        // ...
      size += B->size*std::max(1.0,log2(wrld->cdt.np));
      // ...
      double nnz_frac_A = std::min(2,(int)A->calc_npe())*((double)A->nnz_tot)/(A->size*A->calc_npe());
      double nnz_frac_B = std::min(2,(int)B->calc_npe())*((double)B->nnz_tot)/(B->size*B->calc_npe());
      nnz_frac_B = std::max(nnz_frac_B, nnz_frac_A);
      size += 25.*pref*nnz_frac_B*B->size*std::max(1.0,log2(wrld->cdt.np));
      // ...
      size += 5.*pref*B->size*std::max(1.0,log2(wrld->cdt.np));
      // ...
    if (btopo == -1 || size < min_size){
      // ...
    min_size = INT64_MAX;
    // ...
    printf("CTF ERROR: Failed to map pair!\n");
  A->topo = wrld->topovec[gtopo/2];
  B->topo = wrld->topovec[gtopo/2];
  // ...
  ret = map_sum_indices(A->topo);
  // ...
  ret = A->map_tensor_rem(A->topo->order,
                          // ...
               idx_B, B->edge_map, 0);
  ret = B->map_tensor_rem(B->topo->order,
                          // ...
  ret = B->map_tensor_rem(B->topo->order,
                          // ...
               idx_A, A->edge_map, 0);
  ret = A->map_tensor_rem(A->topo->order,
                          // ...
    printf("New mappings:\n");
    A->print_map(stdout);
    B->print_map(stdout);
  // ...
  if (A->topo == old_topo_A){
    for (d=0; d<A->order; d++){
      // ...
    A->redistribute(dA);
    // ...
  if (B->topo == old_topo_B){
    for (d=0; d<B->order; d++){
      // ...
    B->redistribute(dB);
    // ...
  delete [] old_map_A;
  delete [] old_map_B;
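// map() searches for the cheapest topology for the operand pair: each rank scans a strided
// subset of wrld->topovec (the loop starting at cdt.rank with stride cdt.np), maps the
// summed and remaining indices on that candidate, and scores it by A->size + B->size plus
// redistribution penalties scaled by log2(np) (and by the nonzero fraction for sparse
// tensors). The globally best candidate (gtopo) is then selected and applied, and A and/or
// B are redistributed if their distributions actually changed.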
void summation::print(){
  // ...
  CommData global_comm = A->wrld->cdt;
  MPI_Barrier(global_comm.cm);
  if (global_comm.rank == 0){
    // ...
    sprintf(sname, "%s", B->name);
    sprintf(sname+strlen(sname), "[");
    for (i=0; i<B->order; i++){
      // ...
        sprintf(sname+strlen(sname), " %d", idx_B[i]);
      // ...
        sprintf(sname+strlen(sname), "%d", idx_B[i]);
    }
    sprintf(sname+strlen(sname), "] <- ");
    sprintf(sname+strlen(sname), "%s", A->name);
    sprintf(sname+strlen(sname), "[");
    for (i=0; i<A->order; i++){
      // ...
        sprintf(sname+strlen(sname), " %d", idx_A[i]);
      // ...
        sprintf(sname+strlen(sname), "%d", idx_A[i]);
    }
    sprintf(sname+strlen(sname), "]");
    printf("CTF: Summation %s\n", sname);
    // ...
void summation::sp_sum(){
  // ...
  bool is_idx_matched = true;
  if (A->order != B->order)
    is_idx_matched = false;
  // ...
  for (int o=0; o<A->order; o++){
    if (idx_A[o] != idx_B[o]){
      is_idx_matched = false;
      // ...
  A->read_local_nnz(&num_pair, &mapped_data);
  // ...
  if (!is_idx_matched){
    int64_t lda_A[A->order];
    int64_t lda_B[B->order];
    // ...
    for (int o=1; o<A->order; o++){
      lda_A[o] = lda_A[o-1]*A->lens[o-1];
    }
    // ...
    for (int o=1; o<B->order; o++){
      lda_B[o] = lda_B[o-1]*B->lens[o-1];
    }
    // ...
    #pragma omp parallel for
    for (int i=0; i<num_pair; i++){
      int64_t k = pi[i].k();
      // ...
      for (int o=0; o<A->order; o++){
        int64_t kpart = (k/lda_A[o])%A->lens[o];
        // ...
        for (int q=0; q<B->order; q++){
          if (idx_A[o] == idx_B[q]){
            k_new += kpart*lda_B[q];
            // ...
      ((int64_t*)(pi[i].ptr))[0] = k_new;
      // ...
    bool is_reduce = false;
    for (int oA=0; oA<A->order; oA++){
      // ...
      for (int oB=0; oB<B->order; oB++){
        if (idx_A[oA] == idx_B[oB]){
          // ...
      if (!inB) is_reduce = true;
    }
    // ...
    if (is_reduce && num_pair > 0){
      // ...
      for (int64_t i=1; i<num_pair; i++){
        if (pi[i].k() != pi[i-1].k()) nuniq++;
      }
      // ...
      if (nuniq != num_pair){
        char * swap_data = mapped_data;
        alloc_ptr(A->sr->pair_size()*nuniq, (void**)&mapped_data);
        // ...
        int64_t acc_st = -1;
        // ...
        for (int64_t i=1; i<num_pair; i++){
          if (pi[i].k() == pi[i-1].k()){
            // ...
            memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(i-cp_st));
            // ...
            if (acc_st == -1) acc_st = i;
            // ...
            for (int64_t j=acc_st; j<i; j++){
              A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
              // ...
        if (cp_st < num_pair)
          memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(num_pair-cp_st));
        // ...
        for (int64_t j=acc_st; j<num_pair; j++){
          A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
          // ...
    if (is_custom && !func->is_accumulator()){
      char * swap_data = mapped_data;
      alloc_ptr(B->sr->pair_size()*num_pair, (void**)&mapped_data);
      // ...
      #pragma omp parallel for
      for (int64_t i=0; i<num_pair; i++){
        // ...
          func->apply_f(pi[i].d(), pi_new[i].d());
        // ...
          char tmp_A[A->sr->el_size];
          A->sr->mul(pi[i].d(), alpha, tmp_A);
          func->apply_f(tmp_A, pi_new[i].d());
          // ...
    int64_t map_idx_len[B->order];
    int64_t map_idx_lda[B->order];
    int map_idx_rev[B->order];
    for (int oB=0; oB<B->order; oB++){
      // ...
      for (int oA=0; oA<A->order; oA++){
        if (idx_A[oA] == idx_B[oB]){
          // ...
      for (int ooB=0; ooB<oB; ooB++){
        if (idx_B[ooB] == idx_B[oB]){
          // ...
          map_idx_lda[map_idx_rev[ooB]] += lda_B[oB];
          // ...
      map_idx_len[nmap_idx] = B->lens[oB];
      map_idx_lda[nmap_idx] = lda_B[oB];
      map_idx_rev[nmap_idx] = oB;
      // ...
    for (int midx=0; midx<nmap_idx; midx++){
      tot_rep *= map_idx_len[midx];
    }
    // ...
    char * swap_data = mapped_data;
    alloc_ptr(A->sr->pair_size()*num_pair*tot_rep, (void**)&mapped_data);
    // ...
    #pragma omp parallel for
    for (int64_t i=0; i<num_pair; i++){
      for (int64_t r=0; r<tot_rep; r++){
        memcpy(pi_new[i*tot_rep+r].ptr, pi[i].ptr, A->sr->pair_size());
        // ...
    #pragma omp parallel for
    for (int64_t i=0; i<num_pair; i++){
      // ...
      for (int midx=0; midx<nmap_idx; midx++){
        int64_t stride = phase;
        phase *= map_idx_len[midx];
        for (int64_t r=0; r<tot_rep/phase; r++){
          for (int64_t m=1; m<map_idx_len[midx]; m++){
            for (int64_t s=0; s<stride; s++){
              ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
              // ...
    num_pair *= tot_rep;
    // ...
  B->write(num_pair, alpha, beta, mapped_data, 'w');
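// sp_sum() performs the summation directly on A's key-value pairs: keys are remapped from
// A's index order to B's, duplicate keys are reduced when B lacks some of A's indices, an
// elementwise function is applied if present, pairs are replicated over B's indices that A
// does not carry, and the result is handed to B->write().
//
// Illustrative only (hypothetical 2-d extents, not taken from this file): with
// A->lens = {3,4}, idx_A = "ij" and B->lens = {4,3}, idx_B = "ji", the strides are
// lda_A = {1,3} and lda_B = {1,4}. A's pair for element (i=2, j=1) has key
// k = 2 + 1*3 = 5; the remap loop extracts kpart_i = 2 and kpart_j = 1 and emits
// k_new = 2*lda_B[1] + 1*lda_B[0] = 9, which is exactly B's key for (j=1, i=2).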