// ...
#include "../scaling/strp_tsr.h"
#include "../mapping/mapping.h"
#include "../mapping/distribution.h"
#include "../tensor/untyped_tensor.h"
#include "../shared/util.h"
#include "../shared/memcontrol.h"
// ...
#include "../symmetry/sym_indices.h"
#include "../symmetry/symmetrization.h"
#include "../redistribution/nosym_transp.h"
#include "../redistribution/redist.h"
#include "../scaling/scaling.h"

// ...
    memcpy(idx_A, other.idx_A, sizeof(int)*other.A->order);
// ...
    memcpy(idx_B, other.idx_B, sizeof(int)*other.B->order);
// ...
    idx_A     = (int*)alloc(sizeof(int)*A->order);
    idx_B     = (int*)alloc(sizeof(int)*B->order);
// ...
    memcpy(idx_A, idx_A_, sizeof(int)*A->order);
    memcpy(idx_B, idx_B_, sizeof(int)*B->order);
// ...
    conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
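  // --- Editor's illustration (not part of the original source) ----------------
  // conv_idx turns the char labels of the (cidx_A, cidx_B) constructor into the
  // integer index maps used internally, e.g. cidx_A = "ij", cidx_B = "ji"
  // becomes idx_A = {0,1}, idx_B = {1,0}. A minimal sketch of that conversion,
  // assuming labels are numbered by first occurrence (the name is hypothetical):
  static void conv_idx_sketch(int order_A, char const * cidx_A, int * idx_A,
                              int order_B, char const * cidx_B, int * idx_B){
    char seen[256];                                  // distinct labels, in order of first use
    int  nseen = 0;
    for (int pass = 0; pass < 2; pass++){
      int          order = (pass == 0) ? order_A : order_B;
      char const * cidx  = (pass == 0) ? cidx_A  : cidx_B;
      int *        idx   = (pass == 0) ? idx_A   : idx_B;
      for (int i = 0; i < order; i++){
        int j = 0;
        while (j < nseen && seen[j] != cidx[i]) j++; // reuse number if label already seen
        if (j == nseen) seen[nseen++] = cidx[i];     // otherwise assign the next integer
        idx[i] = j;
      }
    }
  }
  // -----------------------------------------------------------------------------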
// ...
    idx_A     = (int*)alloc(sizeof(int)*A->order);
    idx_B     = (int*)alloc(sizeof(int)*B->order);
// ...
    memcpy(idx_A, idx_A_, sizeof(int)*A->order);
    memcpy(idx_B, idx_B_, sizeof(int)*B->order);
// ...
    conv_idx(A->order, cidx_A, &idx_A, B->order, cidx_B, &idx_B);
// ...
  void summation::execute(bool run_diag){
#if (DEBUG >= 2 || VERBOSE >= 1)
// ...
    if (A->wrld->cdt.rank == 0) printf("Summation::execute (head):\n");
// ...
#endif
    int stat = home_sum_tsr(run_diag);
// ...
      printf("CTF ERROR: Failed to perform summation\n");
  void summation::get_fold_indices(int *  num_fold,
                                   int ** fold_idx){
    int i, in, num_tot, nfold, broken;
    int iA, iB, inA, inB, iiA, iiB;
    int * idx_arr, * idx;
// ...
    for (i=0; i<num_tot; i++){
// ...
    for (iA=0; iA<A->order; iA++){
// ...
        inB = idx_arr[2*in+1];
        if (((inA>=0) + (inB>=0) != 2) ||
            (iB != -1 && inB - iB != in-i) ||
            (iB != -1 && A->sym[inA] != B->sym[inB])){
// ...
      } while (A->sym[inA-1] != NS);
// ...
        for (iiA=iA; iiA<inA; iiA++){
// ...
    for (iB=0; iB<B->order; iB++){
// ...
        inA = idx_arr[2*in+0];
        if (((inB>=0) + (inA>=0) != 2) ||
            (iA != -1 && inA - iA != in-i) ||
            (iA != -1 && B->sym[inB] != A->sym[inA])){
// ...
      } while (B->sym[inB-1] != NS);
// ...
        for (iiB=iB; iiB<inB; iiB++){
// ...
    for (i=0; i<num_tot; i++){
// ...
  int summation::can_fold(){
    int i, j, nfold, * fold_idx;
// ...
    if (A->is_sparse || B->is_sparse) return 0;
// ...
    for (i=0; i<A->order; i++){
      for (j=i+1; j<A->order; j++){
        if (idx_A[i] == idx_A[j]) return 0;
// ...
    for (i=0; i<B->order; i++){
      for (j=i+1; j<B->order; j++){
        if (idx_B[i] == idx_B[j]) return 0;
// ...
    get_fold_indices(&nfold, &fold_idx);
// ...
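  // --- Editor's illustration (not part of the original source) ----------------
  // can_fold() first rejects any operand whose index map repeats a label, e.g.
  // a trace-like map idx_A = {0,0}, before consulting get_fold_indices. A
  // self-contained form of the pairwise test used above:
  static bool has_repeated_label(int order, int const * idx){
    for (int i = 0; i < order; i++)
      for (int j = i + 1; j < order; j++)
        if (idx[i] == idx[j]) return true; // same label used twice in one operand
    return false;
  }
  // -----------------------------------------------------------------------------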
  void summation::get_fold_sum(summation *& fold_sum,
                               int &        all_fdim_A,
                               int &        all_fdim_B,
                               int *&       all_flen_A,
                               int *&       all_flen_B){
// ...
    int * fold_idx, * fidx_map_A, * fidx_map_B;
    tensor * ftsr_A, * ftsr_B;
// ...
    get_fold_indices(&nfold, &fold_idx);
// ...
    A->fold(nfold, fold_idx, idx_A,
            &all_fdim_A, &all_flen_A);
    B->fold(nfold, fold_idx, idx_B,
            &all_fdim_B, &all_flen_B);
// ...
    for (i=0; i<A->order; i++){
      for (j=0; j<nfold; j++){
        if (A->sym[i] == NS && idx_A[i] == fold_idx[j]){
// ...
    for (i=0; i<B->order; i++){
      for (j=0; j<nfold; j++){
        if (B->sym[i] == NS && idx_B[i] == fold_idx[j]){
// ...
    int * sidx_A, * sidx_B;
    conv_idx<int>(ftsr_A->order, fidx_map_A, &sidx_A,
                  ftsr_B->order, fidx_map_B, &sidx_B);
// ...
    fold_sum = new summation(A->rec_tsr, sidx_A, alpha, B->rec_tsr, sidx_B, beta);
// ...
  int summation::map_fold(){
    int i, all_fdim_A, all_fdim_B;
    int * fnew_ord_A, * fnew_ord_B;
    int * all_flen_A, * all_flen_B;
// ...
    get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
// ...
    if (A->wrld->rank == 0){
      printf("Folded summation type:\n");
// ...
    fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B);
// ...
    for (i=0; i<fold_sum->A->order; i++){
// ...
  double summation::est_time_fold(){
    int all_fdim_A, all_fdim_B;
    int * fnew_ord_A, * fnew_ord_B;
    int * all_flen_A, * all_flen_B;
    int * tAiord, * tBiord;
// ...
    get_fold_sum(fold_sum, all_fdim_A, all_fdim_B, all_flen_A, all_flen_B);
    fold_sum->get_len_ordering(&fnew_ord_A, &fnew_ord_B);
// ...
    memcpy(tAiord, A->inner_ordering, all_fdim_A*sizeof(int));
    memcpy(tBiord, B->inner_ordering, all_fdim_B*sizeof(int));
// ...
    double esttime = 0.0;
// ...
    esttime += A->calc_nvirt()*est_time_transp(all_fdim_A, tAiord, all_flen_A, 1, A->sr);
    esttime += 2.*B->calc_nvirt()*est_time_transp(all_fdim_B, tBiord, all_flen_B, 1, B->sr);
// ...
  void summation::get_len_ordering(int ** new_ordering_A,
                                   int ** new_ordering_B){
    int * ordering_A, * ordering_B, * idx_arr;
// ...
    ASSERT(num_tot == A->order);
    ASSERT(num_tot == B->order);
// ...
    for (int i=0; i<num_tot; i++){
// ...
      for (int j=0; j<num_tot; j++){
        if (idx_A[j] == idx_B[i])
// ...
    *new_ordering_A = ordering_A;
    *new_ordering_B = ordering_B;
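  // --- Editor's illustration (not part of the original source) ----------------
  // The double loop above aligns operand modes: for each position i of B it
  // finds the position j of A carrying the same label (both tensors are fully
  // matched here, per the ASSERTs). For idx_A = {0,1,2} and idx_B = {2,0,1}
  // this yields ordering_A = {2,0,1}. A sketch consistent with the visible loop
  // (the elided assignments may differ in detail):
  static void len_ordering_sketch(int num_tot, int const * idx_A, int const * idx_B,
                                  int * ordering_A, int * ordering_B){
    for (int i = 0; i < num_tot; i++){
      ordering_B[i] = i;                             // B keeps its own mode order
      for (int j = 0; j < num_tot; j++){
        if (idx_A[j] == idx_B[i]) ordering_A[i] = j; // position in A of B's i-th label
      }
    }
  }
  // -----------------------------------------------------------------------------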
  tspsum * summation::construct_sparse_sum(int const * phys_mapped){
    int nvirt, i, iA, iB, order_tot, is_top, need_rep;
    int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
// ...
    int * virt_blk_len_A, * virt_blk_len_B;
    int * blk_len_A, * blk_len_B;
// ...
    tspsum * htsum = NULL, ** rec_tsum = NULL;
// ...
    inv_idx(A->order, idx_A, B->order, idx_B,
            &order_tot, &idx_arr);
// ...
    nphys_dim = A->topo->order;
// ...
    calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
             &vrt_sz_A, virt_blk_len_A, blk_len_A);
    calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
             &vrt_sz_B, virt_blk_len_B, blk_len_B);
// ...
    for (i=0; i<order_tot; i++){
// ...
        map = &A->edge_map[iA];
// ...
          virt_dim[i] = map->np;
// ...
        else virt_dim[i] = 1;
// ...
        map = &B->edge_map[iB];
// ...
          virt_dim[i] = map->np;
// ...
        else virt_dim[i] = 1;
// ...
      nvirt *= virt_dim[i];
// ...
      if (A->wrld->np > 1){
// ...
      if (B->wrld->np > 1){
// ...
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[2*i+0] == 0 ||
          phys_mapped[2*i+1] == 0){
// ...
    int * new_sym_A, * new_sym_B;
// ...
    memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
// ...
    memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
// ...
    tsumseq->sym_A       = new_sym_A;
// ...
    tsumseq->sym_B       = new_sym_B;
// ...
      tsumseq->func      = func;
    } else tsumseq->func = NULL;
// ...
  tsum * summation::construct_dense_sum(int         inner_stride,
                                        int const * phys_mapped){
    int i, iA, iB, order_tot, is_top, sA, sB, need_rep, i_A, i_B, j, k;
    int64_t blk_sz_A, blk_sz_B, vrt_sz_A, vrt_sz_B;
    int nphys_dim, nvirt;
    int * idx_arr, * virt_dim;
    int * virt_blk_len_A, * virt_blk_len_B;
    int * blk_len_A, * blk_len_B;
    tsum * htsum = NULL, ** rec_tsum = NULL;
// ...
    inv_idx(A->order, idx_A, B->order, idx_B,
            &order_tot, &idx_arr);
// ...
    nphys_dim = A->topo->order;
// ...
    calc_dim(A->order, blk_sz_A, A->pad_edge_len, A->edge_map,
             &vrt_sz_A, virt_blk_len_A, blk_len_A);
    calc_dim(B->order, blk_sz_B, B->pad_edge_len, B->edge_map,
             &vrt_sz_B, virt_blk_len_B, blk_len_B);
// ...
    sA = strip_diag(A->order, order_tot, idx_A, vrt_sz_A,
                    A->edge_map, A->topo, A->sr,
                    blk_len_A, &blk_sz_A, &str_A);
    sB = strip_diag(B->order, order_tot, idx_B, vrt_sz_B,
                    B->edge_map, B->topo, B->sr,
                    blk_len_B, &blk_sz_B, &str_B);
// ...
      if (A->wrld->cdt.rank == 0)
        DPRINTF(1,"Stripping tensor\n");
// ...
    for (i=0; i<order_tot; i++){
// ...
        map = &A->edge_map[iA];
// ...
          virt_dim[i] = map->np;
          if (sA) virt_dim[i] = virt_dim[i]/str_A->strip_dim[iA];
// ...
        else virt_dim[i] = 1;
// ...
        map = &B->edge_map[iB];
// ...
          virt_dim[i] = map->np;
          if (sB) virt_dim[i] = virt_dim[i]/str_B->strip_dim[iB];
// ...
        else virt_dim[i] = 1;
// ...
      nvirt *= virt_dim[i];
// ...
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[2*i+0] == 0 ||
          phys_mapped[2*i+1] == 0){
// ...
    int * new_sym_A, * new_sym_B;
// ...
    memcpy(new_sym_A, A->sym, sizeof(int)*A->order);
// ...
    memcpy(new_sym_B, B->sym, sizeof(int)*B->order);
// ...
    if (inner_stride == -1){
// ...
      for (i=0; i<A->order; i++){
        if (A->sym[i] == NS){
          for (j=0; j<itsr->order; j++){
            if (A->inner_ordering[j] == i_A){
// ...
              } while (j>=0 && A->sym[j] != NS);
              for (k=j+1; k<=i; k++){
                virt_blk_len_A[k] = 1;
// ...
      for (i=0; i<B->order; i++){
        if (B->sym[i] == NS){
          for (j=0; j<itsr->order; j++){
            if (B->inner_ordering[j] == i_B){
// ...
              } while (j>=0 && B->sym[j] != NS);
              for (k=j+1; k<=i; k++){
                virt_blk_len_B[k] = 1;
// ...
    tsumseq->sym_A       = new_sym_A;
// ...
    tsumseq->sym_B       = new_sym_B;
// ...
      tsumseq->func      = func;
    } else tsumseq->func = NULL;
// ...
  tsum * summation::construct_sum(int inner_stride){
// ...
    nphys_dim = A->topo->order;
// ...
    memset(phys_mapped, 0, sizeof(int)*nphys_dim*2);
// ...
    for (i=0; i<A->order; i++){
      map = &A->edge_map[i];
      // ...
        phys_mapped[2*map->cdt+0] = 1;
      // ...
          phys_mapped[2*map->cdt+0] = 1;
// ...
    for (i=0; i<B->order; i++){
      map = &B->edge_map[i];
      // ...
        phys_mapped[2*map->cdt+1] = 1;
      // ...
          phys_mapped[2*map->cdt+1] = 1;
// ...
    if (A->is_sparse || B->is_sparse){
      htsum = construct_sparse_sum(phys_mapped);
    } else {
      htsum = construct_dense_sum(inner_stride, phys_mapped);
    }
// ...
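  // --- Editor's note (illustration, not part of the original source) ----------
  // phys_mapped is an nphys_dim x 2 bitmap, laid out row-major per physical
  // processor-grid dimension d:
  //
  //   phys_mapped[2*d+0] == 1  =>  some mode of A is mapped to grid dim d
  //   phys_mapped[2*d+1] == 1  =>  some mode of B is mapped to grid dim d
  //
  // The construct_{sparse,dense}_sum routines above scan for dimensions where
  // one of the two entries is 0, which is what drives their replication
  // (need_rep) logic.
  // -----------------------------------------------------------------------------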
  int summation::home_sum_tsr(bool run_diag){
    int ret, was_home_A, was_home_B;
    tensor * tnsr_A, * tnsr_B;
// ...
  #ifndef HOME_CONTRACT
// ...
      ret = sym_sum_tsr(run_diag);
// ...
      ret = sum_tensors(run_diag);
// ...
    if (A->has_zero_edge_len ||
        B->has_zero_edge_len){
      if (!B->sr->isequal(beta, B->sr->mulid()) && !B->has_zero_edge_len){
        int sub_idx_map_B[B->order];
// ...
        for (int i=0; i<B->order; i++){
          sub_idx_map_B[i] = sm_idx;
// ...
          for (int j=0; j<i; j++){
            if (idx_B[i] == idx_B[j]){
              sub_idx_map_B[i] = sub_idx_map_B[j];
// ...
    was_home_A = A->is_home;
    was_home_B = B->is_home;
// ...
      tnsr_A              = new tensor(A,0,0);
      tnsr_A->data        = A->data;
// ...
      tnsr_A->topo        = A->topo;
// ...
    } else tnsr_A = NULL;
// ...
      tnsr_B              = new tensor(B,0,0);
      tnsr_B->data        = B->data;
// ...
      tnsr_B->topo        = B->topo;
// ...
    } else tnsr_B = NULL;
// ...
    if (A->wrld->cdt.rank == 0)
      printf("Start head sum:\n");
// ...
    ret = osum.sym_sum_tsr(run_diag);
// ...
    if (A->wrld->cdt.rank == 0)
      printf("End head sum:\n");
// ...
    if (ret != SUCCESS) return ret;
    if (was_home_A) tnsr_A->unfold();
// ...
          B->nnz_blk[i] = tnsr_B->nnz_blk[i];
// ...
      B->data = tnsr_B->data;
// ...
    if (was_home_B && !tnsr_B->is_home){
      if (A->wrld->cdt.rank == 0)
        DPRINTF(1,"Migrating tensor %s back to home\n", B->name);
// ...
      B->redistribute(odst);
// ...
        B->sr->copy(B->home_buffer, B->data, B->size);
        B->sr->dealloc(B->data);
        B->data = B->home_buffer;
// ...
    } else if (was_home_B){
// ...
        if (tnsr_B->data != B->data){
          printf("Tensor %s is a copy of %s and did not leave home but buffer is %p was %p\n",
                 tnsr_B->name, B->name, tnsr_B->data, B->data);
// ...
        B->data = tnsr_B->data;
// ...
    if (was_home_A && !tnsr_A->is_home){
// ...
    } else if (was_home_A) {
// ...
        A->data = tnsr_A->data;
// ...
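  // --- Editor's note (not part of the original source) ------------------------
  // Pattern above: when HOME_CONTRACT is enabled, operands living in their
  // "home" buffer are shallow-copied (tnsr_A/tnsr_B alias the original data and
  // topology), the sum runs on the copies, and any operand that migrated is
  // redistributed back and copied into home_buffer before its data pointer is
  // restored. The was_home_*/is_home checks distinguish the migrated case from
  // the aliased, never-moved case.
  // -----------------------------------------------------------------------------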
  int summation::sym_sum_tsr(bool run_diag){
    int sidx, i, nst_B, * new_idx_map;
    int * map_A, * map_B;
    int ** dstack_map_B;
    tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
    std::vector<summation> perm_types;
    std::vector<int> signs;
// ...
    bool is_cons = check_consistency();
    if (!is_cons) return ERROR;
// ...
    if (A->has_zero_edge_len || B->has_zero_edge_len){
      if (!B->sr->isequal(beta, B->sr->mulid()) && !B->has_zero_edge_len){
        int sub_idx_map_B[B->order];
// ...
        for (int i=0; i<B->order; i++){
          sub_idx_map_B[i] = sm_idx;
// ...
          for (int j=0; j<i; j++){
            if (idx_B[i] == idx_B[j]){
              sub_idx_map_B[i] = sub_idx_map_B[j];
// ...
    int * new_idx_A, * new_idx_B;
    if (!is_custom || func->is_distributive){
      tensor * new_tsr_A = A->self_reduce(idx_A, &new_idx_A, B->order, idx_B, &new_idx_B);
      if (new_tsr_A != A) {
        summation s(new_tsr_A, new_idx_A, alpha, B, new_idx_B, beta, func);
// ...
    memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
    memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
// ...
      if (tnsr_A != A) delete tnsr_A;
// ...
      map_A = new_idx_map;
// ...
      dstack_map_B[nst_B] = map_B;
      dstack_tsr_B[nst_B] = tnsr_B;
// ...
      map_B = new_idx_map;
// ...
    memcpy(new_sum.idx_A, map_A, sizeof(int)*tnsr_A->order);
    memcpy(new_sum.idx_B, map_B, sizeof(int)*tnsr_B->order);
// ...
    if (tnsr_A == tnsr_B){
// ...
      new_sum.A = &nnew_tsr;
// ...
      new_sum.sym_sum_tsr(run_diag);
// ...
      if (ocfact != 1 || sign != 1){
// ...
          for (int i=0; i<ocfact; i++){
            tnsr_B->sr->add(new_alpha, alpha, new_alpha);
// ...
      if (new_sum.unfold_broken_sym(NULL) != -1){
        if (A->wrld->cdt.rank == 0)
          DPRINTF(1, "Permutational symmetry is broken\n");
// ...
        sidx = new_sum.unfold_broken_sym(&unfold_sum);
// ...
        int sidx2 = unfold_sum->unfold_broken_sym(NULL);
        if (sidx%2 == 0 && (A->sym[sidx/2] == SY || unfold_sum->A->sym[sidx/2] == SY)) sy = 1;
        if (sidx%2 == 1 && (B->sym[sidx/2] == SY || unfold_sum->B->sym[sidx/2] == SY)) sy = 1;
        if (!A->is_sparse && !B->is_sparse && (sidx2 != -1 ||
// ...
            if (unfold_sum->A->sym[sidx/2] == NS){
              if (A->wrld->cdt.rank == 0)
                DPRINTF(1,"Performing operand desymmetrization for summation of A idx=%d\n", sidx/2);
// ...
              if (A->wrld->cdt.rank == 0)
                DPRINTF(1,"Performing operand symmetrization for summation\n");
// ...
            unfold_sum->sym_sum_tsr(run_diag);
// ...
            if (tnsr_A != unfold_sum->A){
// ...
              delete unfold_sum->A;
// ...
            if (A->wrld->cdt.rank == 0)
              DPRINTF(1,"Performing product desymmetrization for summation\n");
// ...
            unfold_sum->sym_sum_tsr(run_diag);
            if (A->wrld->cdt.rank == 0)
              DPRINTF(1,"Performing product symmetrization for summation\n");
// ...
              int sidx_B[tnsr_B->order];
              for (int iis=0; iis<tnsr_B->order; iis++){
// ...
            if (tnsr_B != unfold_sum->B){
// ...
              delete unfold_sum->B;
// ...
          if (sidx != -1 && sidx%2 == 1){
            delete unfold_sum->B;
          } else if (sidx != -1 && sidx%2 == 0){
            delete unfold_sum->A;
// ...
          if (A->wrld->cdt.rank == 0)
            DPRINTF(1,"Performing %d summation permutations\n",
                    (int)perm_types.size());
// ...
          tensor * inv_tsr_A = NULL;
          bool need_inv = false;
// ...
          if (tnsr_B->sr->mulid() == NULL){
            for (i=0; i<(int)perm_types.size(); i++){
// ...
              inv_tsr_A = new tensor(tnsr_A);
// ...
          for (i=0; i<(int)perm_types.size(); i++){
// ...
            if (signs[i] == -1 && need_inv){
              perm_types[i].A = inv_tsr_A;
// ...
              perm_types[i].alpha = new_alpha;
// ...
            perm_types[i].beta = dbeta;
            perm_types[i].sum_tensors(run_diag);
// ...
        new_sum.alpha = alpha;
// ...
    if (tnsr_A != A) delete tnsr_A;
    for (i=nst_B-1; i>=0; i--){
// ...
      dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
// ...
      tnsr_B = dstack_tsr_B[i];
// ...
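  // --- Editor's illustration (not part of the original source) ----------------
  // perm_types/signs come from get_sym_perms (see the symmetrization header
  // included above), which enumerates the permutations required once a symmetry
  // is broken; the loop above runs each one, negating alpha (or, when the
  // algstrct lacks a multiplicative identity, summing a pre-inverted copy of A)
  // for signs[i] == -1. In dense terms an order-2 antisymmetric case reduces to
  // accumulating both permutations with opposite signs, e.g.:
  static void signed_perm_sum_sketch(int n, double const * A, double * B){
    for (int i = 0; i < n; i++)
      for (int j = 0; j < n; j++)
        B[i*n + j] += A[i*n + j] - A[j*n + i]; // permutations (i,j)/(j,i), signs +1/-1
  }
  // -----------------------------------------------------------------------------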
  int summation::sum_tensors(bool run_diag){
    int stat, * new_idx_map;
    int * map_A, * map_B;
// ...
    int ** dstack_map_B;
    tensor * tnsr_A, * tnsr_B, * new_tsr, ** dstack_tsr_B;
// ...
    check_consistency();
// ...
    if (A->has_zero_edge_len || B->has_zero_edge_len){
      if (!B->sr->isequal(beta, B->sr->mulid()) && !B->has_zero_edge_len){
// ...
        int sub_idx_map_B[B->order];
// ...
        for (int i=0; i<B->order; i++){
          sub_idx_map_B[i] = sm_idx;
// ...
          for (int j=0; j<i; j++){
            if (idx_B[i] == idx_B[j]){
              sub_idx_map_B[i] = sub_idx_map_B[j];
// ...
    memcpy(map_A, idx_A, tnsr_A->order*sizeof(int));
    memcpy(map_B, idx_B, tnsr_B->order*sizeof(int));
// ...
      if (tnsr_A != A) delete tnsr_A;
// ...
      map_A = new_idx_map;
// ...
      dstack_map_B[nst_B] = map_B;
      dstack_tsr_B[nst_B] = tnsr_B;
// ...
      map_B = new_idx_map;
// ...
      tensor * stnsr_A = tnsr_A;
      tnsr_A = new tensor(stnsr_A);
// ...
      if (A != stnsr_A) delete stnsr_A;
// ...
    if (tnsr_A == tnsr_B){
// ...
      new_sum.A = nnew_tsr;
// ...
      int order_A, order_B, i;
      int * edge_len_A, * edge_len_B;
      int * sym_A, * sym_B;
      stat = allread_tsr(ntid_A, &nsA, &sA);
// ...
      stat = allread_tsr(ntid_B, &nsB, &sB);
// ...
      if (new_sum.check_mapping() == 0) {
// ...
        if (A->wrld->cdt.rank == 0){
          printf("Remapping tensors for sum:\n");
// ...
        stat = new_sum.map();
        if (stat == ERROR) {
          printf("Failed to map tensors to physical grid\n");
// ...
        if (A->wrld->cdt.rank == 0){
          printf("Keeping mappings:\n");
// ...
      ASSERT(new_sum.check_mapping());
// ...
      if (is_custom == false && new_sum.can_fold()){
// ...
        double est_time_nofold = 4.*(A->size + B->size)*COST_MEMBW;
        if (new_sum.est_time_fold() + (A->size + B->size)*COST_MEMBW < est_time_nofold){
// ...
          inner_stride = new_sum.map_fold();
// ...
          sumf = new_sum.construct_sum(inner_stride);
// ...
          sumf = new_sum.construct_sum();
// ...
        if (A->wrld->cdt.rank == 0){
          printf("Could not fold summation, is_custom = %d, new_sum.can_fold = %d\n",
                 is_custom, new_sum.can_fold());
// ...
        sumf = new_sum.construct_sum();
// ...
      sumf = new_sum.construct_sum();
// ...
      DEBUG_PRINTF("[%d] performing tensor sum\n", A->wrld->cdt.rank);
// ...
      stat = allread_tsr(ntid_A, &nA, &uA);
// ...
      stat = get_info(ntid_A, &order_A, &edge_len_A, &sym_A);
// ...
      stat = allread_tsr(ntid_B, &nB, &uB);
// ...
      stat = get_info(ntid_B, &order_B, &edge_len_B, &sym_B);
// ...
      if (nsA != nA) { printf("nsA = %" PRId64 ", nA = %" PRId64 "\n", nsA, nA); ABORT; }
      if (nsB != nB) { printf("nsB = %" PRId64 ", nB = %" PRId64 "\n", nsB, nB); ABORT; }
      for (i=0; (int64_t)i<nA; i++){
        if (fabs(uA[i] - sA[i]) > 1.E-6){
          printf("A[i] = %lf, sA[i] = %lf\n", uA[i], sA[i]);
// ...
      cpy_sym_sum(alpha, uA, order_A, edge_len_A, edge_len_A, sym_A, map_A,
                  beta,  sB, order_B, edge_len_B, edge_len_B, sym_B, map_B);
// ...
      for (i=0; (int64_t)i<nB; i++){
        if (fabs(uB[i] - sB[i]) > 1.E-6){
          printf("B[%d] = %lf, sB[%d] = %lf\n", i, uB[i], i, sB[i]);
// ...
        assert(fabs(sB[i] - uB[i]) < 1.E-6);
// ...
      for (int i=nst_B-1; i>=0; i--){
        int ret = dstack_tsr_B[i]->extract_diag(dstack_map_B[i], 0, tnsr_B, &new_idx_map);
// ...
        tnsr_B = dstack_tsr_B[i];
// ...
  int summation::unfold_broken_sym(summation ** nnew_sum){
    int sidx, i, num_tot, iA, iA2, iB;
// ...
    if (nnew_sum != NULL){
// ...
      *nnew_sum = new_sum;
    } else new_sum = NULL;
// ...
    inv_idx(A->order, idx_A, B->order, idx_B,
            &num_tot, &idx_arr);
// ...
    for (i=0; i<A->order; i++){
      if (A->sym[i] != NS){
// ...
        if (idx_arr[2*iA+1] != -1){
          if (B->sym[idx_arr[2*iA+1]] == NS ||
              ((B->sym[idx_arr[2*iA+1]] == AS) ^ (A->sym[i] == AS)) ||
              idx_arr[2*idx_A[i+1]+1] == -1 ||
              idx_A[i+1] != idx_B[idx_arr[2*iA+1]+1]){
// ...
        } else if (idx_arr[2*idx_A[i+1]+1] != -1){
// ...
      for (i=0; i<B->order; i++){
        if (B->sym[i] != NS){
// ...
          if (idx_arr[2*iB+0] != -1){
            if (A->sym[idx_arr[2*iB+0]] == NS ||
                ((A->sym[idx_arr[2*iB+0]] == AS) ^ (B->sym[i] == AS)) ||
                idx_arr[2*idx_B[i+1]+0] == -1 ||
                idx_B[i+1] != idx_A[idx_arr[2*iB+0]+1]){
// ...
            } else if (A->sym[idx_arr[2*iB+0]] == NS){
// ...
          } else if (idx_arr[2*idx_B[i+1]+0] != -1){
// ...
      for (i=0; i<A->order; i++){
        if (A->sym[i] == SY){
// ...
          if (idx_arr[2*iA+1] == -1 &&
              idx_arr[2*iA2+1] == -1){
// ...
    if (nnew_sum != NULL && sidx != -1){
// ...
        new_sum->A = new tensor(A, 0, 0);
        int nA_sym[A->order];
        memcpy(nA_sym, new_sum->A->sym, sizeof(int)*new_sum->A->order);
        nA_sym[sidx/2] = bsym;
// ...
        new_sum->B = new tensor(B, 0, 0);
        int nB_sym[B->order];
        memcpy(nB_sym, new_sum->B->sym, sizeof(int)*new_sum->B->order);
        nB_sym[sidx/2] = NS;
// ...
  bool summation::check_consistency(){
    int i, num_tot, len;
// ...
    inv_idx(A->order, idx_A, B->order, idx_B,
            &num_tot, &idx_arr);
// ...
    for (i=0; i<num_tot; i++){
// ...
      iA = idx_arr[2*i+0];
      iB = idx_arr[2*i+1];
// ...
      if (len != -1 && iB != -1 && len != B->lens[iB]){
        if (A->wrld->cdt.rank == 0){
          printf("i = %d Error in sum call: The %dth edge length (%d) of tensor %s does not ",
                 i, iA, len, A->name);
          printf("match the %dth edge length (%d) of tensor %s.\n",
                 iB, B->lens[iB], B->name);
// ...
  int summation::is_equal(summation const & os){
// ...
    if (A != os.A) return 0;
    if (B != os.B) return 0;
// ...
    for (i=0; i<A->order; i++){
      if (idx_A[i] != os.idx_A[i]) return 0;
// ...
    for (i=0; i<B->order; i++){
      if (idx_B[i] != os.idx_B[i]) return 0;
// ...
  int summation::check_mapping(){
    int i, pass, order_tot, iA, iB;
    int * idx_arr, * phys_map;
// ...
    if (A->is_mapped == 0) pass = 0;
    if (B->is_mapped == 0) pass = 0;
// ...
    if (A->is_folded == 1) pass = 0;
    if (B->is_folded == 1) pass = 0;
// ...
    if (A->topo != B->topo) pass = 0;
// ...
      DPRINTF(4,"failed confirmation here\n");
// ...
    memset(phys_map, 0, sizeof(int)*A->topo->order);
// ...
    inv_idx(A->order, idx_A, B->order, idx_B,
            &order_tot, &idx_arr);
// ...
      DPRINTF(4,"failed confirmation here\n");
// ...
    for (i=0; i<order_tot; i++){
// ...
      iB = idx_arr[2*i+1];
      if (iA != -1 && iB != -1) {
        if (!comp_dim_map(&A->edge_map[iA], &B->edge_map[iB])){
// ...
          DPRINTF(4,"failed confirmation here i=%d\n", i);
// ...
      if (iA != -1 && iB == -1) {
        mapping * map = &A->edge_map[iA];
// ...
          phys_map[map->cdt]++;
// ...
      if (iB != -1 && iA == -1){
        mapping * map = &B->edge_map[iB];
// ...
          phys_map[map->cdt]++;
// ...
    for (i=0; i<A->topo->order; i++){
      if (phys_map[i] > 1) {
// ...
        DPRINTF(3,"failed confirmation here i=%d\n", i);
// ...
    if (B->sr->addid() == NULL || B->is_sparse){
      int ndim_mapped = 0;
      for (int j=0; j<B->order; j++){
        mapping * map = &B->edge_map[j];
// ...
      if (ndim_mapped < B->topo->order) pass = 0;
// ...
  int summation::map_sum_indices(topology const * topo){
    int tsr_order, isum, iA, iB, i, j, jsum, jX, stat;
    int * tsr_edge_len, * tsr_sym_table, * restricted;
    int * idx_arr, * idx_sum;
    int num_sum, num_tot, idx_num;
// ...
    inv_idx(A->order, idx_A, B->order, idx_B,
            &num_tot, &idx_arr);
// ...
    for (i=0; i<num_tot; i++){
      if (idx_arr[2*i] != -1 && idx_arr[2*i+1] != -1){
        idx_sum[num_sum] = i;
// ...
    tsr_order = num_sum;
// ...
    memset(tsr_sym_table, 0, tsr_order*tsr_order*sizeof(int));
    memset(restricted, 0, tsr_order*sizeof(int));
// ...
    for (i=0; i<tsr_order; i++){
// ...
    for (i=0; i<num_sum; i++){
// ...
      iA = idx_arr[isum*2+0];
      iB = idx_arr[isum*2+1];
// ...
      } else if (B->edge_map[iB].type != NOT_MAPPED){
// ...
    for (i=0; i<num_sum; i++){
// ...
      iA = idx_arr[isum*idx_num+0];
      iB = idx_arr[isum*idx_num+1];
// ...
      tsr_edge_len[i] = A->pad_edge_len[iA];
// ...
      if (A->sym[iA] != NS){
        for (j=0; j<num_sum; j++){
// ...
          jX = idx_arr[jsum*idx_num+0];
// ...
            tsr_sym_table[i*tsr_order+j] = 1;
            tsr_sym_table[j*tsr_order+i] = 1;
// ...
      if (B->sym[iB] != NS){
        for (j=0; j<num_sum; j++){
// ...
          jX = idx_arr[jsum*idx_num+1];
// ...
            tsr_sym_table[i*tsr_order+j] = 1;
            tsr_sym_table[j*tsr_order+i] = 1;
// ...
    stat = map_tensor(/* ... */
                      tsr_edge_len, tsr_sym_table,
                      /* ... */);
// ...
      for (i=0; i<num_sum; i++){
// ...
        iA = idx_arr[isum*idx_num+0];
        iB = idx_arr[isum*idx_num+1];
// ...
    for (i=0; i<num_sum; i++){
// ...
  int summation::map(){
    int i, ret, need_remap;
    int need_remap_A, need_remap_B;
// ...
    topology * old_topo_A, * old_topo_B;
// ...
    ASSERT(A->wrld->cdt.cm == B->wrld->cdt.cm);
    World * wrld = A->wrld;
// ...
    if (wrld->rank == 0)
      printf("Initial mappings:\n");
    A->print_map(stdout);
    B->print_map(stdout);
// ...
    old_topo_A = A->topo;
    old_topo_B = B->topo;
// ...
    int64_t min_size = INT64_MAX;
// ...
    for (i=A->wrld->cdt.rank; i<2*(int)A->wrld->topovec.size(); i+=A->wrld->cdt.np){
// ...
        else if (ret != SUCCESS) return ret;
// ...
        else if (ret != SUCCESS) return ret;
// ...
      ret = map_sum_indices(A->topo);
// ...
        else if (ret != SUCCESS) return ret;
// ...
        else if (ret != SUCCESS) return ret;
// ...
        else if (ret != SUCCESS) return ret;
        ret = A->map_tensor_rem(A->topo->order, /* ... */);
// ...
        else if (ret != SUCCESS) return ret;
// ...
        copy_mapping(/* ... */ idx_B, B->edge_map, 0);
        ret = B->map_tensor_rem(B->topo->order, /* ... */);
// ...
        else if (ret != SUCCESS) return ret;
// ...
        else if (ret != SUCCESS) return ret;
        ret = B->map_tensor_rem(B->topo->order, /* ... */);
// ...
        else if (ret != SUCCESS) return ret;
// ...
        copy_mapping(/* ... */ idx_A, A->edge_map, 0);
        ret = A->map_tensor_rem(A->topo->order, /* ... */);
// ...
        else if (ret != SUCCESS) return ret;
// ...
        else if (ret != SUCCESS) return ret;
// ...
        else if (ret != SUCCESS) return ret;
// ...
      A->print_map(stdout,0);
      B->print_map(stdout,0);
// ...
      if (!check_mapping()) continue;
// ...
      size = A->size + B->size;
// ...
      if (A->topo == old_topo_A){
        for (d=0; d<A->order; d++){
// ...
          size += A->size*std::max(1.0,log2(wrld->cdt.np));
// ...
            double nnz_frac_A = std::min(2,(int)A->calc_npe())*((double)A->nnz_tot)/(A->size*A->calc_npe());
            size += 25.*nnz_frac_A*A->size*std::max(1.0,log2(wrld->cdt.np));
// ...
            size += 5.*A->size*std::max(1.0,log2(wrld->cdt.np));
// ...
      if (B->topo == old_topo_B){
        for (d=0; d<B->order; d++){
// ...
          size += B->size*std::max(1.0,log2(wrld->cdt.np));
// ...
            double nnz_frac_A = std::min(2,(int)A->calc_npe())*((double)A->nnz_tot)/(A->size*A->calc_npe());
            double nnz_frac_B = std::min(2,(int)B->calc_npe())*((double)B->nnz_tot)/(B->size*B->calc_npe());
            nnz_frac_B = std::max(nnz_frac_B, nnz_frac_A);
            size += 25.*pref*nnz_frac_B*B->size*std::max(1.0,log2(wrld->cdt.np));
// ...
            size += 5.*pref*B->size*std::max(1.0,log2(wrld->cdt.np));
// ...
      if (btopo == -1 || size < min_size){
// ...
      min_size = INT64_MAX;
// ...
      printf("CTF ERROR: Failed to map pair!\n");
// ...
    A->topo = wrld->topovec[gtopo/2];
    B->topo = wrld->topovec[gtopo/2];
// ...
    ret = map_sum_indices(A->topo);
// ...
      ret = A->map_tensor_rem(A->topo->order, /* ... */);
// ...
      copy_mapping(/* ... */ idx_B, B->edge_map, 0);
      ret = B->map_tensor_rem(B->topo->order, /* ... */);
// ...
      ret = B->map_tensor_rem(B->topo->order, /* ... */);
// ...
      copy_mapping(/* ... */ idx_A, A->edge_map, 0);
      ret = A->map_tensor_rem(A->topo->order, /* ... */);
// ...
      printf("New mappings:\n");
    A->print_map(stdout);
    B->print_map(stdout);
// ...
    if (A->topo == old_topo_A){
      for (d=0; d<A->order; d++){
// ...
      A->redistribute(dA);
// ...
    if (B->topo == old_topo_B){
      for (d=0; d<B->order; d++){
// ...
      B->redistribute(dB);
// ...
    delete [] old_map_A;
    delete [] old_map_B;
// ...
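  // --- Editor's illustration (not part of the original source) ----------------
  // The search above is round-robin parallel: rank r evaluates candidate
  // mappings r, r+np, r+2*np, ... over 2*topovec.size() candidates (the
  // gtopo/2 indexing suggests two mapping variants per topology), keeps the one
  // minimizing the modeled size, and a global reduction picks the winner (cf.
  // get_best_topo). A self-contained sketch of such a reduction with MPI's
  // MINLOC, using hypothetical names; ranks with no valid candidate would pass
  // a size of INT64_MAX-like magnitude and cand = -1:
  static int best_candidate_sketch(MPI_Comm comm, long local_size, int local_cand){
    struct { long size; int cand; } in, out;
    in.size = local_size;   // modeled cost of this rank's best local candidate
    in.cand = local_cand;   // its candidate id
    MPI_Allreduce(&in, &out, 1, MPI_LONG_INT, MPI_MINLOC, comm);
    return out.cand;        // globally cheapest candidate id
  }
  // -----------------------------------------------------------------------------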
// ...
  void summation::print(){
// ...
    CommData global_comm = A->wrld->cdt;
    MPI_Barrier(global_comm.cm);
    if (global_comm.rank == 0){
// ...
      sprintf(sname, "%s", B->name);
      sprintf(sname+strlen(sname), "[");
      for (i=0; i<B->order; i++){
// ...
          sprintf(sname+strlen(sname), " %d", idx_B[i]);
// ...
          sprintf(sname+strlen(sname), "%d", idx_B[i]);
// ...
      sprintf(sname+strlen(sname), "] <- ");
      sprintf(sname+strlen(sname), "%s", A->name);
      sprintf(sname+strlen(sname), "[");
      for (i=0; i<A->order; i++){
// ...
          sprintf(sname+strlen(sname), " %d", idx_A[i]);
// ...
          sprintf(sname+strlen(sname), "%d", idx_A[i]);
// ...
      sprintf(sname+strlen(sname), "]");
      printf("CTF: Summation %s\n", sname);
  void summation::sp_sum(){
// ...
    bool is_idx_matched = true;
    if (A->order != B->order)
      is_idx_matched = false;
// ...
      for (int o=0; o<A->order; o++){
        if (idx_A[o] != idx_B[o]){
          is_idx_matched = false;
// ...
    A->read_local_nnz(&num_pair, &mapped_data);
// ...
    if (!is_idx_matched){
      int64_t lda_A[A->order];
      int64_t lda_B[B->order];
// ...
      for (int o=1; o<A->order; o++){
        lda_A[o] = lda_A[o-1]*A->lens[o-1];
      }
// ...
      for (int o=1; o<B->order; o++){
        lda_B[o] = lda_B[o-1]*B->lens[o-1];
      }
// ...
      #pragma omp parallel for
// ...
      for (int i=0; i<num_pair; i++){
        int64_t k = pi[i].k();
// ...
        for (int o=0; o<A->order; o++){
          int64_t kpart = (k/lda_A[o])%A->lens[o];
// ...
          for (int q=0; q<B->order; q++){
            if (idx_A[o] == idx_B[q]){
              k_new += kpart*lda_B[q];
// ...
        ((int64_t*)(pi[i].ptr))[0] = k_new;
// ...
      bool is_reduce = false;
      for (int oA=0; oA<A->order; oA++){
// ...
        for (int oB=0; oB<B->order; oB++){
          if (idx_A[oA] == idx_B[oB]){
// ...
        if (!inB) is_reduce = true;
// ...
      if (is_reduce && num_pair > 0){
// ...
        for (int64_t i=1; i<num_pair; i++){
          if (pi[i].k() != pi[i-1].k()) nuniq++;
// ...
        if (nuniq != num_pair){
          char * swap_data = mapped_data;
          alloc_ptr(A->sr->pair_size()*nuniq, (void**)&mapped_data);
// ...
          int64_t acc_st = -1;
// ...
          for (int64_t i=1; i<num_pair; i++){
            if (pi[i].k() == pi[i-1].k()){
// ...
                memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(i-cp_st));
// ...
              if (acc_st == -1) acc_st = i;
// ...
                for (int64_t j=acc_st; j<i; j++){
                  A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
// ...
          if (cp_st < num_pair)
            memcpy(pi_new[pfx].ptr, pi[cp_st].ptr, A->sr->pair_size()*(num_pair-cp_st));
// ...
            for (int64_t j=acc_st; j<num_pair; j++){
              A->sr->add(pi_new[pfx-1].d(), pi[j].d(), pi_new[pfx-1].d());
// ...
      if (is_custom && !func->is_accumulator()){
        char * swap_data = mapped_data;
        alloc_ptr(B->sr->pair_size()*num_pair, (void**)&mapped_data);
// ...
        #pragma omp parallel for
// ...
        for (int64_t i=0; i<num_pair; i++){
// ...
            func->apply_f(pi[i].d(), pi_new[i].d());
// ...
            char tmp_A[A->sr->el_size];
            A->sr->mul(pi[i].d(), alpha, tmp_A);
            func->apply_f(tmp_A, pi_new[i].d());
// ...
      int64_t map_idx_len[B->order];
      int64_t map_idx_lda[B->order];
      int map_idx_rev[B->order];
      for (int oB=0; oB<B->order; oB++){
// ...
        for (int oA=0; oA<A->order; oA++){
          if (idx_A[oA] == idx_B[oB]){
// ...
          for (int ooB=0; ooB<oB; ooB++){
            if (idx_B[ooB] == idx_B[oB]){
// ...
              map_idx_lda[map_idx_rev[ooB]] += lda_B[oB];
// ...
            map_idx_len[nmap_idx] = B->lens[oB];
            map_idx_lda[nmap_idx] = lda_B[oB];
            map_idx_rev[nmap_idx] = oB;
// ...
        for (int midx=0; midx<nmap_idx; midx++){
          tot_rep *= map_idx_len[midx];
// ...
        char * swap_data = mapped_data;
        alloc_ptr(A->sr->pair_size()*num_pair*tot_rep, (void**)&mapped_data);
// ...
        #pragma omp parallel for
// ...
        for (int64_t i=0; i<num_pair; i++){
          for (int64_t r=0; r<tot_rep; r++){
            memcpy(pi_new[i*tot_rep+r].ptr, pi[i].ptr, A->sr->pair_size());
// ...
        #pragma omp parallel for
// ...
        for (int64_t i=0; i<num_pair; i++){
// ...
          for (int midx=0; midx<nmap_idx; midx++){
            int64_t stride = phase;
            phase *= map_idx_len[midx];
            for (int64_t r=0; r<tot_rep/phase; r++){
              for (int64_t m=1; m<map_idx_len[midx]; m++){
                for (int64_t s=0; s<stride; s++){
                  ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
// ...
        num_pair *= tot_rep;
// ...
    B->write(num_pair, alpha, beta, mapped_data, 'w');
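  // --- Editor's illustration (not part of the original source) ----------------
  // The key remap at the top of sp_sum in plain form: a global key k of A
  // encodes one coordinate per mode in mixed radix (strides built from
  // A->lens); each coordinate is decoded and re-accumulated with the stride of
  // the like-labeled mode of B. A self-contained sketch (names hypothetical):
  #include <cstdint>
  static int64_t remap_key_sketch(int64_t k,
                                  int order_A, int64_t const * lens_A, int const * idx_A,
                                  int order_B, int64_t const * lda_B,  int const * idx_B){
    int64_t k_new = 0;
    int64_t lda   = 1;                              // stride of mode o in A
    for (int o = 0; o < order_A; o++){
      int64_t kpart = (k / lda) % lens_A[o];        // coordinate of mode o
      lda *= lens_A[o];
      for (int q = 0; q < order_B; q++)
        if (idx_A[o] == idx_B[q]) k_new += kpart * lda_B[q]; // same label in B
    }
    return k_new;
  }
  // -----------------------------------------------------------------------------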
 