2 #include "../interface/common.h"     3 #include "../interface/timer.h"     4 #include "../interface/idx_tensor.h"     5 #include "../summation/summation.h"     6 #include "../contraction/contraction.h"     8 #include "../shared/util.h"     9 #include "../shared/memcontrol.h"    10 #include "../redistribution/sparse_rw.h"    11 #include "../redistribution/pad.h"    12 #include "../redistribution/nosym_transp.h"    13 #include "../redistribution/redist.h"    14 #include "../redistribution/cyclic_reshuffle.h"    15 #include "../redistribution/glb_cyclic_reshuffle.h"    16 #include "../redistribution/dgtog_redist.h"    25     double ps[] = {1.0, (double)log2(np), (double)size*log2(np)};
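  // free_self() releases every buffer this tensor owns: dense element buffers
  // go through sr->dealloc(), sparse key-value pair buffers through
  // sr->pair_dealloc(), and a home buffer is freed separately only when the
  // tensor has a home that it is not currently living in (has_home && !is_home).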
  void tensor::free_self(){
      if (wrld->rank == 0) DPRINTF(3,"Deleted order %d tensor %s\n",order,name);
      if (is_folded) unfold();
      if (!is_data_aliased){
          if (!is_sparse) sr->dealloc(home_buffer);
            sr->pair_dealloc(data);
            if (!is_sparse) sr->dealloc(data);
            else sr->pair_dealloc(data);
        if (has_home && !is_home){
          if (!is_sparse) sr->dealloc(home_buffer);
          else sr->pair_dealloc(home_buffer);
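  // Both constructors delegate to init(); the second additionally pins the
  // tensor to a user-specified processor grid and blocking via
  // set_distribution() and then allocates and zeroes the local data, which
  // also becomes the home buffer.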
    this->init(sr, order,edge_len,sym,wrld,alloc_data,name,profile,is_sparse);

    this->init(sr, order,edge_len,sym,wrld,0,name,profile,is_sparse);
    set_distribution(idx, prl, blk);
      nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
      std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
      this->home_buffer = NULL;
      this->data = sr->alloc(this->size);
      this->sr->set(this->data, this->sr->addid(), this->size);
      this->home_size = this->size;
      register_size(home_size*sr->el_size);
      this->home_buffer = this->data;
  tensor::tensor(tensor const * other, bool copy, bool alloc_data){
    char * nname = (char*)alloc(strlen(other->name) + 2);
    strcpy(nname, other->name);
      DPRINTF(2,"Cloning tensor %s into %s copy=%d alloc_data=%d\n",other->name, nname,copy, alloc_data);
               other->sym, other->wrld, (!copy & alloc_data), nname,
      copy_tensor_data(other);
    } else if (!alloc_data) data = NULL;
  tensor::tensor(tensor * other, int const * new_sym){
    char * nname = (char*)alloc(strlen(other->name) + 2);
    strcpy(nname, other->name);
      DPRINTF(2,"Repacking tensor %s into %s\n",other->name,nname);
    bool has_chng=false, less_sym=false, more_sym=false;
    for (int i=0; i<other->order; i++){
      if (other->sym[i] != new_sym[i]){
          DPRINTF(2,"sym[%d] was %d now %d\n",i,other->sym[i],new_sym[i]);
        if (other->sym[i] == NS){
        if (new_sym[i] == NS){
    if (!less_sym && !more_sym){
                 new_sym, other->wrld, 0, nname,
      copy_tensor_data(other);
                 new_sym, other->wrld, 1, nname,
      for (int j=0; j<order; j++){
      summation ts(other, idx, sr->mulid(), this, idx, sr->addid());
  void tensor::copy_tensor_data(tensor const * other){
        register_size(this->home_size*sr->el_size);
        this->home_buffer = sr->alloc(other->home_size);
          this->data = this->home_buffer;
          this->data = sr->alloc(other->size);
        this->data = sr->alloc(other->size);
      this->data = sr->alloc(other->size);
      sr->copy(this->data, other->data, other->size);
      if (data!=NULL)    sr->dealloc(this->data);
      this->data = sr->pair_alloc(other->nnz_loc);
      this->set_new_nnz_glb(other->nnz_blk);
      sr->copy_pairs(this->data, other->data, other->nnz_loc);
    if (this->is_folded){
      delete this->rec_tsr;
                       (void**)&this->inner_ordering);
      for (int i=0; i<other->order; i++){
      this->rec_tsr = rtsr;
    this->order = other->order;
    memcpy(this->padding, other->padding, sizeof(int)*other->order);
    this->topo      = other->topo;
    this->size = other->size;
    this->nnz_loc = other->nnz_loc;
    this->nnz_tot = other->nnz_tot;
    if (wrld->rank == 0){
        printf("New sparse tensor %s copied from %s of size %ld nonzeros (%ld bytes) locally, %ld nonzeros total:\n",name, other->name, this->nnz_loc,this->nnz_loc*sr->el_size,nnz_tot);
        printf("New tensor %s copied from %s of size %ld elms (%ld bytes):\n",name, other->name, this->size,this->size*sr->el_size);
                    int const *      edge_len,
    this->sr                = sr_->clone();
    this->order             = order_;
    this->is_scp_padded     = 0;
    this->is_data_aliased   = 0;
    this->has_zero_edge_len = 0;
    this->profile           = profile_;
    this->is_sparse         = is_sparse_;
    this->nnz_blk           = NULL;
    this->is_csr            = false;
    this->left_home_transp  = 0;
    this->registered_alloc_size = 0;
      this->name = (char*)alloc(strlen(name_)+1);
      strcpy(this->name, name_);
      this->name = (char*)alloc(7*sizeof(char));
      for (int i=0; i<4; i++){
        this->name[i] = 'A'+(wrld->glob_wrld_rng()%26);
      this->name[4] = '0'+(order_/10);
      this->name[5] = '0'+(order_%10);
      this->name[6] = '\0';
    this->home_buffer = NULL;
      DPRINTF(3,"Created order %d tensor %s, is_sparse = %d, allocated = %d\n",order,name,is_sparse,alloc_data);
    memset(this->padding, 0, order*sizeof(int));
    memcpy(this->lens, edge_len, order*sizeof(int));
    memcpy(this->pad_edge_len, lens, order*sizeof(int));
    this->set_sym (sym_);
    this->edge_map  = new mapping[order];
    for (int i=0; i<order; i++){
      if (this->lens[i] <= 0) this->has_zero_edge_len = 1;
      this->edge_map[i].has_child  = 0;
      this->edge_map[i].np         = 1;
      int ret = set_zero();
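  // Mapping arithmetic helpers: calc_phase() gives the per-dimension total
  // phase (physical times virtual), calc_tot_phase() their product over all
  // dimensions, calc_nvirt() the number of local virtual blocks (the
  // tnvirt < nvirt check guards against overflow), and calc_npe() the number
  // of physical processes spanned by the mapping.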
  int * tensor::calc_phase() const {
    for (i=0; i<this->order; i++){
      map = this->edge_map + i;

  int tensor::calc_tot_phase() const {
    int * phase = this->calc_phase();
    for (i=0 ; i<this->order; i++){
      tot_phase *= phase[i];

  int64_t tensor::calc_nvirt() const {
    int64_t nvirt, tnvirt;
    for (j=0; j<this->order; j++){
      map = &this->edge_map[j];
        tnvirt = nvirt*(int64_t)map->np;
        if (tnvirt < nvirt) return UINT64_MAX;

  int64_t tensor::calc_npe() const {
    for (j=0; j<this->order; j++){
      map = &this->edge_map[j];
  void tensor::set_padding(){
    int * new_phase, * sub_edge_len;
    for (j=0; j<this->order; j++){
      map = this->edge_map + j;
      pad = this->lens[j]%new_phase[j];
        pad = new_phase[j]-pad;
      this->padding[j] = pad;
    for (i=0; i<this->order; i++){
      this->pad_edge_len[i] = this->lens[i] + this->padding[i];
      sub_edge_len[i] = this->pad_edge_len[i]/new_phase[i];
    this->size = calc_nvirt()*sy_packed_size(this->order, sub_edge_len, this->sym);
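  // Example: a dimension of length 5 with total phase 3 gets padding
  // 3 - (5 % 3) = 1, so pad_edge_len = 6 and sub_edge_len = 6/3 = 2; the local
  // size is then nvirt times the (symmetric-)packed size of one such block.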
  int tensor::set(char const * val) {
    sr->set(this->data, val, this->size);
    return zero_out_padding();
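  // set_zero(): if already mapped, just zero (or, for sparse tensors, empty)
  // the local data. Otherwise, each rank scans a strided subset of
  // wrld->topovec for the feasible mapping with the lowest estimated memory
  // footprint, preferring mappings whose physical phases cover all of
  // wrld->np; the winner is chosen collectively via get_best_topo() and the
  // tensor is then mapped and allocated on it.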
  int tensor::set_zero() {
    int i, map_success, btopo;
    int64_t memuse, bmemuse;
    if (this->is_mapped){
        sr->pair_dealloc(this->data);
        this->home_buffer = NULL;
        memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt());
        this->set_new_nnz_glb(this->nnz_blk);
        sr->set(this->data, sr->addid(), this->size);
      bool fully_distributed = false;
      for (i=wrld->rank; i<(int64_t)wrld->topovec.size(); i+=wrld->np){
        this->clear_mapping();
        memset(restricted, 0, this->order*sizeof(int));
        map_success = map_tensor(wrld->topovec[i]->order, this->order, this->pad_edge_len,
                                 this->sym_table, restricted,
                                 wrld->topovec[i]->dim_comm, NULL, 0,
        if (map_success == ERROR) {
        } else if (map_success == SUCCESS){
          this->topo = wrld->topovec[i];
          memuse = (int64_t)this->size;
            DPRINTF(1,"Not enough memory %E to map tensor (size %E) on topo %d\n", (double)proc_bytes_available(),(double)memuse*sr->el_size,i);
          int64_t sum_phases = 0;
          int64_t prod_phys_phases = 1;
          for (int j=0; j<this->order; j++){
            int phase = this->edge_map[j].calc_phase();
            prod_phys_phases *= this->edge_map[j].calc_phys_phase();
            int max_lcm_phase = phase;
            for (int k=0; k<this->order; k++){
              max_lcm_phase = std::max(max_lcm_phase,lcm(phase,this->edge_map[k].calc_phase()));
            sum_phases += max_lcm_phase + phase;
          memuse = memuse*(1.+((double)sum_phases)/(4.*wrld->topovec[i]->glb_comm.np));
            fully_distributed = prod_phys_phases == wrld->np;
          } else if ((memuse < bmemuse && !fully_distributed) || (memuse < bmemuse && prod_phys_phases == wrld->np)){
            fully_distributed = prod_phys_phases == wrld->np;
          DPRINTF(1,"Unsuccessful in map_tensor() in set_zero()\n");
      int btopo1 = get_best_topo((1-fully_distributed)*INT64_MAX+fully_distributed*bmemuse, btopo, wrld->cdt);
      if (btopo != btopo1 && btopo1 != -1) btopo = btopo1;
      if (btopo == -1 || btopo == INT_MAX) {
          printf("ERROR: FAILED TO MAP TENSOR\n");
        MPI_Barrier(MPI_COMM_WORLD);
      memset(restricted, 0, this->order*sizeof(int));
      this->clear_mapping();
      map_success = map_tensor(wrld->topovec[btopo]->order, this->order,
                               this->pad_edge_len, this->sym_table, restricted,
                               wrld->topovec[btopo]->dim_comm, NULL, 0,
      this->topo = wrld->topovec[btopo];
      if (!is_sparse && this->size > INT_MAX && wrld->rank == 0)
        printf("CTF WARNING: Tensor %s has local size %ld, which is greater than INT_MAX=%d, so MPI could run into problems\n", name, size, INT_MAX);
        nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
        std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
        this->home_buffer = NULL;
        if (this->order > 0){
          this->home_size = this->size;
          this->home_buffer = sr->alloc(this->home_size);
          if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
          register_size(this->size*sr->el_size);
          this->data = this->home_buffer;
          this->data = sr->alloc(this->size);
        this->data = sr->alloc(this->size);
          printf("New tensor %s defined of size %ld elms (%ld bytes):\n",name, this->size,this->size*sr->el_size);
        this->print_map(stdout);
        sr->init(this->size, this->data);
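  // print_map() renders the mapping compactly, e.g. A[p2(0)v2,p4(1)]: pK(d)
  // is a physical phase of K along topology communicator d, vK a virtual
  // phase of K, and a bare 1 an unmapped dimension.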
  void tensor::print_map(FILE * stream, bool allcall) const {
    if (!allcall || wrld->rank == 0){
        printf("printing mapping of sparse tensor %s\n",name);
        printf("printing mapping of dense tensor %s\n",name);
        printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
          printf(" %d ",topo->lens[dim]);
      sprintf(tname, "%s[", name);
          sprintf(tname+strlen(tname), ",");
        int tp = edge_map[dim].calc_phase();
        int pp = edge_map[dim].calc_phys_phase();
        if (tp==1) sprintf(tname+strlen(tname),"1");
            sprintf(tname+strlen(tname),"p%d(%d)",edge_map[dim].np,edge_map[dim].cdt);
              sprintf(tname+strlen(tname),"p%d(%d)",edge_map[dim].child->np,edge_map[dim].child->cdt);
          if (vp > 1) sprintf(tname+strlen(tname),"v%d",vp);
      sprintf(tname+strlen(tname), "]");
  void tensor::set_name(char const * name_){
    this->name = (char*)alloc(strlen(name_)+1);
    strcpy(this->name, name_);

  char const * tensor::get_name() const {

  void tensor::profile_on(){

  void tensor::profile_off(){

  void tensor::get_raw_data(char ** data_, int64_t * size_) const {

                      int * const * permutation_A,
                      int * const * permutation_B,
    int64_t sz_A, blk_sz_A, sz_B, blk_sz_B;
    if (permutation_B != NULL){
      ASSERT(permutation_A == NULL);
        if (wrld->rank == 0 && tsr_B->is_sparse) printf("CTF ERROR: please use other variant of permute function when the output is sparse\n");
      ret = tsr_A->write(blk_sz_B, sr->mulid(), sr->addid(), all_data_B, 'r');
      all_data_A = all_data_B;
        ASSERT(permutation_A != NULL);
        ASSERT(permutation_B == NULL);
    ret = tsr_B->write(blk_sz_A, alpha, beta, all_data_A, 'w');
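  // permute() is built on sparse bulk read/write: with permutation_B given,
  // the permuted target keys are first read back from tsr_A (rw == 'r'), and
  // the resulting pairs are then written into tsr_B (rw == 'w') with
  // alpha/beta accumulation.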
                               int &           bw_mirror_rank,
                               int &           fw_mirror_rank,
                               char **         sub_buffer_){
    if (order != -1) is_sub = 1;
    greater_world->cdt.allred(&is_sub, &tot_sub, 1, MPI_INT, MPI_SUM);
    if (order != -1) ASSERT(tot_sub == wrld->np);
    greater_world->cdt.allred(&order, &aorder, 1, MPI_INT, MPI_MAX);
    int sub_root_rank = 0;
    if (order >= 0 && wrld->rank == 0){
      greater_world->cdt.allred(&greater_world->rank, &sub_root_rank, 1, MPI_INT, MPI_SUM);
      greater_world->cdt.bcast(buffer, buf_sz, MPI_CHAR, sub_root_rank);
      greater_world->cdt.allred(MPI_IN_PLACE, &sub_root_rank, 1, MPI_INT, MPI_SUM);
      greater_world->cdt.bcast(buffer, buf_sz, MPI_CHAR, sub_root_rank);
      fw_mirror_rank = wrld->rank;
      MPI_Isend(&(greater_world->rank), 1, MPI_INT, wrld->rank, 13, greater_world->comm, &req);
    if (greater_world->rank < tot_sub){
      MPI_Recv(&bw_mirror_rank, 1, MPI_INT, MPI_ANY_SOURCE, 13, greater_world->comm, &stat);
    if (fw_mirror_rank >= 0){
      MPI_Wait(&req, &stat);
    MPI_Request req1, req2;
    char * sub_buffer = sr->alloc(odst->size);
    char * rbuffer = NULL;
    if (bw_mirror_rank >= 0){
      MPI_Irecv(rbuffer, buf_sz, MPI_CHAR, bw_mirror_rank, 0, greater_world->comm, &req1);
      MPI_Irecv(sub_buffer, odst->size*sr->el_size, MPI_CHAR, bw_mirror_rank, 1, greater_world->comm, &req2);
    if (fw_mirror_rank >= 0){
      MPI_Send(sbuffer, buf_sz, MPI_CHAR, fw_mirror_rank, 0, greater_world->comm);
      MPI_Send(this->data, odst->size*sr->el_size, MPI_CHAR, fw_mirror_rank, 1, greater_world->comm);
    if (bw_mirror_rank >= 0){
      MPI_Wait(&req1, &stat);
      MPI_Wait(&req2, &stat);
      sr->set(sub_buffer, sr->addid(), odst->size);
    *sub_buffer_ = sub_buffer;
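  // orient_subworld() reconciles a subworld tensor with the enclosing world:
  // the subworld root broadcasts the tensor's distribution, each subworld
  // rank advertises its global rank to its mirror (the tag-13 messages), and
  // the data is then exchanged with the forward/backward mirror ranks so that
  // every participant ends up with a consistently initialized sub_buffer.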
  void tensor::slice(int const *  offsets_B,
                     int const *  offsets_A,
    int64_t i, j, sz_A, blk_sz_A, sz_B, blk_sz_B;
    char * all_data_A, * blk_data_A;
    char * all_data_B, * blk_data_B;
    for (i=0,j=0; i<this->order && j<A->order; i++, j++){
      if (ends_A[j] - offsets_A[j] != ends_B[i] - offsets_B[i]){
        if (ends_B[i] - offsets_B[i] == 1){ j--; continue; }
        if (ends_A[j] - offsets_A[j] == 1){ i--; continue; }
        printf("CTF ERROR: slice dimensions inconsistent 1\n");
    while (A->order != 0 && i < this->order){
      if (ends_B[i] - offsets_B[i] == 1){ i++; continue; }
      printf("CTF ERROR: slice dimensions inconsistent 2\n");
    while (this->order != 0 && j < A->order){
      if (ends_A[j] - offsets_A[j] == 1){ j++; continue; }
      printf("CTF ERROR: slice dimensions inconsistent 3\n");
        for (i=0; i<tsr_B->order; i++){
          padding_B[i] = tsr_B->lens[i] - ends_B[i];
                  all_data_B, blk_data_B, &blk_sz_B, sr);
          sr->pair_dealloc(all_data_B);
        for (i=0; i<tsr_B->order; i++){
          toffset_B[i] = -offsets_B[i];
          padding_B[i] = ends_B[i]-offsets_B[i]-tsr_B->lens[i];
                padding_B, pblk_data_B, sr, toffset_B);
        for (i=0; i<tsr_A->order; i++){
          toffset_A[i] = ends_A[i] - offsets_A[i];
          padding_A[i] = tsr_A->lens[i] - toffset_A[i];
                padding_A, pblk_data_B, sr, offsets_A);
      tsr_A->write(blk_sz_B, sr->mulid(), sr->addid(), blk_data_B, 'r');
      all_data_A = blk_data_B;
      for (i=0; i<tsr_A->order; i++){
        padding_A[i] = tsr_A->lens[i] - ends_A[i];
      int nosym[tsr_A->order];
      std::fill(nosym, nosym+tsr_A->order, NS);
      depad_tsr(tsr_A->order, sz_A, ends_A, nosym, padding_A, offsets_A,
                all_data_A, blk_data_A, &blk_sz_A, sr);
      for (i=0; i<tsr_A->order; i++){
        toffset_A[i] = -offsets_A[i];
        padding_A[i] = ends_A[i]-offsets_A[i]-tsr_A->lens[i];
              padding_A, pblk_data_A, sr, toffset_A);
      for (i=0; i<tsr_B->order; i++){
        toffset_B[i] = ends_B[i] - offsets_B[i];
        padding_B[i] = tsr_B->lens[i] - toffset_B[i];
              padding_B, pblk_data_A, sr, offsets_B);
    tsr_B->write(blk_sz_A, alpha, beta, blk_data_A, 'w');
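  // slice() also reduces to sparse reads and writes: the source block is read
  // out of tsr_A (depadded, then re-padded and offset into the coordinate
  // system of tsr_B), and finally written into tsr_B with alpha/beta
  // accumulation.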
  void tensor::add_to_subworld(tensor *     tsr_sub,
  #ifdef USE_SLICE_FOR_SUBWORLD
    int offsets[this->order];
    memset(offsets, 0, this->order*sizeof(int));
    if (tsr_sub->order == -1){
      stsr.slice(NULL, NULL, beta, this, offsets, offsets, alpha);
      tsr_sub->slice(offsets, lens, beta, this, offsets, lens, alpha);
    int fw_mirror_rank, bw_mirror_rank;
    tsr_sub->orient_subworld(wrld, bw_mirror_rank, fw_mirror_rank, odst, &sub_buffer);
    cyclic_reshuffle(sym, idst, NULL, NULL, *odst, NULL, NULL, &this->data, &sub_buffer, sr, wrld->cdt, 0, alpha, beta);
    if (fw_mirror_rank >= 0){
      MPI_Irecv(tsr_sub->data, odst->size, tsr_sub->sr->mdtype(), fw_mirror_rank, 0, wrld->cdt.cm, &req);
    if (bw_mirror_rank >= 0)
      MPI_Send(sub_buffer, odst->size, sr->mdtype(), bw_mirror_rank, 0, wrld->cdt.cm);
    if (fw_mirror_rank >= 0){
      MPI_Wait(&req, &stat);
    sr->dealloc(sub_buffer);
  void tensor::add_from_subworld(tensor *     tsr_sub,
  #ifdef USE_SLICE_FOR_SUBWORLD
    int offsets[this->order];
    memset(offsets, 0, this->order*sizeof(int));
    if (tsr_sub->order == -1){
      slice(offsets, offsets, beta, &stsr, NULL, NULL, alpha);
      slice(offsets, lens, alpha, tsr_sub, offsets, lens, beta);
    int fw_mirror_rank, bw_mirror_rank;
    tsr_sub->orient_subworld(wrld, bw_mirror_rank, fw_mirror_rank, odst, &sub_buffer);
    cyclic_reshuffle(sym, *odst, NULL, NULL, idst, NULL, NULL, &sub_buffer, &this->data, sr, wrld->cdt, 0, alpha, beta);
    sr->dealloc(sub_buffer);
  void tensor::write(int64_t        num_pair,
                     int64_t const *inds,
    char * pairs = sr->pair_alloc(num_pair);
    for (int64_t i=0; i<num_pair; i++){
    this->write(num_pair,alpha,beta,pairs,'w');
    sr->pair_dealloc(pairs);

  int tensor::write(int64_t      num_pair,
    int * phase, * phys_phase, * virt_phase, * bucket_lda;
    int * virt_phys_rank;
    if (wrld->rank == 0){
      this->print_map(stdout);
      for (i=0; i<tsr->order; i++){
        virt_phase[i]     = phase[i]/phys_phase[i];
        num_virt          = num_virt*virt_phase[i];
      if (4*num_pair*sr->pair_size() >= max_memuse){
        npart = 1 + (6*num_pair*sr->pair_size())/max_memuse;
      MPI_Allreduce(MPI_IN_PLACE, &npart, 1, MPI_INT, MPI_MAX, wrld->cdt.cm);
      int64_t part_size = num_pair/npart;
      for (int part = 0; part<npart; part++){
        int64_t nnz_loc_new;
        if (part == npart-1) pnum_pair = num_pair - part_size*part;
        else pnum_pair = part_size;
        char * buf_ptr = mapped_data + sr->pair_size()*part_size*part;
        if (is_sparse && rw == 'w'){
          this->set_new_nnz_glb(nnz_blk);
          if (tsr->data != NULL) sr->pair_dealloc(tsr->data);
          tsr->data = new_pairs;
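  // write() is the workhorse for both directions: rw == 'w' scatters the
  // caller's pairs into the distributed layout (in up to npart batches sized
  // to fit in memory), while rw == 'r' returns the values stored at the
  // requested keys; for sparse writes the local pair store is replaced and
  // the global nonzero counts are refreshed via set_new_nnz_glb().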
  void tensor::read(int64_t      num_pair,
                    int64_t const * inds,
    char * pairs = sr->pair_alloc(num_pair);
    for (int64_t i=0; i<num_pair; i++){
    write(num_pair, alpha, beta, pairs, 'r');
    for (int64_t i=0; i<num_pair; i++){
      pr[i].read_val(data+i*sr->el_size);
    sr->pair_dealloc(pairs);

  int tensor::read(int64_t      num_pair,
                   char *       mapped_data){
    return write(num_pair, alpha, beta, mapped_data, 'r');

  int tensor::read(int64_t num_pair,
                   char *  mapped_data){
    return write(num_pair, NULL, NULL, mapped_data, 'r');
  void tensor::set_distribution(char const *          idx,
      itopo = wrld->topovec.size();
      wrld->topovec.push_back(top);
    assert(itopo != -1);
    this->clear_mapping();
    this->topo = wrld->topovec[itopo];
    for (int i=0; i<order; i++){
      mapping * map = this->edge_map+i;
        if (idx[i] == prl.idx[j]){
          map->np = this->topo->dim_comm[j].np;
        if (idx[i] == blk.idx[j]){
    this->is_mapped = true;
    this->set_padding();
    conv_idx(this->order, idx, &idx_A);
      if (wrld->rank == 0)
        printf("CTF ERROR: invalid distribution in read() call, aborting.\n");
  char * tensor::read(char const *          idx,
      for (int i=0; i<order; i++){
          std::fill(new_sym, new_sym+order, NS);
          tensor tsr(sr, order, lens, new_sym, wrld, true);
          tsr[idx] += (*this)[idx];
          return tsr.read(idx, prl, blk, unpack);
    tensor tsr_ali(this, 1, 1);
    if (wrld->rank == 0)
      printf("Redistributing via read() starting from distribution:\n");
    if (tsr_ali.has_home) deregister_size();
    return tsr_ali.data;
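  // sparsify(threshold): with no threshold, keep exactly the entries that
  // differ from the additive identity; with a threshold, keep entries c for
  // which max(c, threshold) != threshold under the semiring order (the second
  // overload applies the same test to a value first transformed into tmp).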
  int tensor::sparsify(char const * threshold,
    if ((threshold == NULL && sr->addid() == NULL) ||
        (threshold != NULL && !sr->is_ordered())){
    if (threshold == NULL)
      return sparsify([&](char const* c){ return !sr->isequal(c, sr->addid()); });
      return sparsify([&](char const* c){
        char tmp[sr->el_size];
        sr->max(c,threshold,tmp);
        return !sr->isequal(tmp, threshold);
      return sparsify([&](char const* c){
        char tmp[sr->el_size];
        sr->max(tmp,threshold,tmp);
        return !sr->isequal(tmp, threshold);
  int tensor::sparsify(std::function<bool(char const*)> f){
      int64_t nnz_loc_new = 0;
      int64_t nnz_blk_old[calc_nvirt()];
      memcpy(nnz_blk_old, nnz_blk, calc_nvirt()*sizeof(int64_t));
      memset(nnz_blk, 0, calc_nvirt()*sizeof(int64_t));
      for (int v=0; v<calc_nvirt(); v++){
        for (int64_t j=0; j<nnz_blk_old[v]; j++,i++){
          keep_vals[i] = f(pi[i].d());
      if (nnz_loc_new != nnz_loc){
        char * old_data = data;
        data = sr->pair_alloc(nnz_loc_new);
        for (int64_t i=0; i<nnz_loc; i++){
            pi_new[nnz_loc_new].write(pi[i].ptr);
        sr->dealloc(old_data);
      this->set_new_nnz_glb(nnz_blk);
      ASSERT(!has_home || is_home);
      int nvirt = calc_nvirt();
      this->nnz_blk = (int64_t*)alloc(sizeof(int64_t)*nvirt);
      int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
      char * old_data = this->data;
      int idx_lyr = wrld->rank;
      for (int i=0; i<this->order; i++){
        if (i == 0) edge_lda[0] = 1;
        else edge_lda[i]     = edge_lda[i-1]*lens[i-1];
        mapping const * map  = this->edge_map + i;
        virt_phase[i]        = phase[i]/phys_phase[i];
        nvirt          = nvirt*virt_phase[i];
          idx_lyr -= this->topo->lda[map->cdt]
        for (int i=0; i<this->order; i++){
          if (i == 0) edge_lda[0] = 1;
          else edge_lda[i] = edge_lda[i-1]*this->pad_edge_len[i-1];
          depadding[i] = -padding[i];
        memset(prepadding, 0, sizeof(int)*order);
        spsfy_tsr(this->order, this->size, nvirt,
                  this->pad_edge_len, this->sym, phase,
                  phys_phase, virt_phase, virt_phys_rank,
                  this->data, this->data, this->nnz_blk, this->sr, edge_lda, f);
        char * new_pairs[nvirt];
        char const * data_ptr = this->data;
        int64_t new_nnz_tot = 0;
        for (int v=0; v<nvirt; v++){
          if (nnz_blk[v] > 0){
            int64_t old_nnz = nnz_blk[v];
            new_pairs[v] = (char*)sr->pair_alloc(nnz_blk[v]);
            depad_tsr(order, nnz_blk[v], this->lens, this->sym, this->padding, prepadding,
                      data_ptr, new_pairs[v], nnz_blk+v, sr);
            pad_key(order, nnz_blk[v], this->pad_edge_len, depadding, PairIterator(sr,new_pairs[v]), sr);
            data_ptr += old_nnz*sr->pair_size();
            new_nnz_tot += nnz_blk[v];
          } else new_pairs[v] = NULL;
        sr->pair_dealloc(this->data);
        this->data = sr->pair_alloc(new_nnz_tot);
        char * new_data_ptr = this->data;
        for (int v=0; v<nvirt; v++){
          if (new_pairs[v] != NULL){
            sr->copy_pairs(new_data_ptr, new_pairs[v], nnz_blk[v]);
            sr->pair_dealloc(new_pairs[v]);
        memset(nnz_blk, 0, sizeof(int64_t)*nvirt);
      sr->dealloc(old_data);
      if (has_home) deregister_size();
      this->set_new_nnz_glb(this->nnz_blk);
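  // read_local()/read_local_nnz() return the locally owned pairs. Sparse
  // tensors are first written into a dense intermediate, and packed symmetric
  // tensors are desymmetrized into a nonsymmetric intermediate when
  // unpack_sym is requested; only the fully dense, nonsymmetric case reads
  // the local blocks directly.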
  void tensor::read_local(int64_t * num_pair,
                          bool      unpack_sym) const {
    read_local(num_pair, &pairs, unpack_sym);
    for (int64_t i=0; i<*num_pair; i++){
      (*inds)[i] = pr[i].k();
      pr[i].read_val(*data+i*sr->el_size);
    sr->pair_dealloc(pairs);

  void tensor::read_local_nnz(int64_t * num_pair,
                              bool      unpack_sym) const {
    read_local_nnz(num_pair, &pairs, unpack_sym);
    for (int64_t i=0; i<*num_pair; i++){
      (*inds)[i] = pr[i].k();
      pr[i].read_val(*data+i*sr->el_size);
    sr->pair_dealloc(pairs);
  int tensor::read_local_nnz(int64_t * num_pair,
                             char **   mapped_data,
                             bool      unpack_sym) const {
    if (sr->isequal(sr->addid(), NULL) && !is_sparse)
      return read_local(num_pair, mapped_data, unpack_sym);
    *mapped_data = tsr_cpy.data;
  int tensor::read_local(int64_t * num_pair,
                         char **   mapped_data,
                         bool      unpack_sym) const {
    int i, num_virt, idx_lyr;
    int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
      *mapped_data = NULL;
    bool has_sym = false;
    for (i=0; i<this->order; i++){
      if (this->sym[i] != NS) has_sym = true;
      read_local_nnz(&num_nnz, &nnz_data, unpack_sym);
      tensor dense_tsr(sr, order, lens, sym, wrld);
      dense_tsr.write(num_nnz, sr->mulid(), sr->addid(), nnz_data);
      sr->pair_dealloc(nnz_data);
      dense_tsr.read_local(num_pair, mapped_data, unpack_sym);
    } else if (has_sym && unpack_sym) {
      int nosym[this->order];
      std::fill(nosym, nosym+this->order, NS);
      tensor nosym_tsr(sr, order, lens, nosym, wrld);
      int idx[this->order];
      for (i=0; i<this->order; i++){
      summation s((tensor*)this, idx, sr->mulid(), &nosym_tsr, idx, sr->mulid());
      return nosym_tsr.read_local(num_pair, mapped_data);
      idx_lyr = wrld->rank;
      for (i=0; i<tsr->order; i++){
        virt_phase[i]     = phase[i]/phys_phase[i];
        num_virt          = num_virt*virt_phase[i];
                       phase, phys_phase, virt_phase, virt_phys_rank, num_pair,
                       tsr->data, &pairs, sr);
        *mapped_data = pairs;
        *mapped_data = NULL;
    char * my_pairs, * all_pairs;
    if (has_zero_edge_len){
    alloc_ptr(numPes*sizeof(int), (void**)&nXs);
    alloc_ptr(numPes*sizeof(int), (void**)&pXs);
    read_local(&ntt, &my_pairs, unpack);
    MPI_Allgather(&n, 1, MPI_INT, nXs, 1, MPI_INT, wrld->comm);
    for (i=1; i<numPes; i++){
      pXs[i] = pXs[i-1]+nXs[i-1];
    nval = pXs[numPes-1] + nXs[numPes-1];
    nval = nval/sr->pair_size();
    all_pairs = sr->pair_alloc(nval);
    MPI_Allgatherv(my_pairs, n, MPI_CHAR,
                   all_pairs, nXs, pXs, MPI_CHAR, wrld->comm);
      sr->pair_dealloc(my_pairs);
  int64_t tensor::get_tot_size(bool packed=false){
      for (int i=0; i<order; i++){

  int tensor::allread(int64_t * num_pair,
    char * ball_data = sr->alloc((*num_pair));
    for (int64_t i=0; i<*num_pair; i++){
      ipr[i].read_val(ball_data+i*sr->el_size);
    if (ipr.ptr != NULL)
      sr->pair_dealloc(ipr.ptr);
    *all_data = ball_data;

  int tensor::allread(int64_t * num_pair,
    for (int64_t i=0; i<*num_pair; i++){
      ipr[i].read_val(all_data+i*sr->el_size);
    bool is_changed = false;
    if (topo != B->topo) is_changed = true;
    for (int i=0; i<order; i++){
        edge_map[i].clear();
      return redistribute(old_dist);
  int tensor::reduce_sum(char * result) {
    return reduce_sum(result, sr);

  int tensor::reduce_sum(char * result, algstrct const * sr_other) {
    ASSERT(is_mapped && !is_folded);
    for (int i=0; i<order; i++){
    sr->copy(result, sc.data);
    wrld->cdt.bcast(result, sr_other->el_size, MPI_CHAR, 0);

  int tensor::reduce_sumabs(char * result) {
    return reduce_sumabs(result, sr);

  int tensor::reduce_sumabs(char * result, algstrct const * sr_other){
    ASSERT(is_mapped && !is_folded);
    for (int i=0; i<order; i++){
    sr->copy(result, sc.data);
    wrld->cdt.bcast(result, sr->el_size, MPI_CHAR, 0);

  int tensor::reduce_sumsq(char * result) {
    ASSERT(is_mapped && !is_folded);
    for (int i=0; i<order; i++){
    sr->copy(result, sc.data);
    wrld->cdt.bcast(result, sr->el_size, MPI_CHAR, 0);
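  // print() gathers all local pairs to rank 0 with MPI_Gatherv over raw pair
  // bytes, sorts them by global key, decodes each key into per-dimension
  // indices by repeated modulo/division with lens[j], and prints every entry
  // (optionally only those above the cutoff).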
  void tensor::prnt() const {

  void tensor::print(FILE * fp, char const * cutoff) const {
    int64_t imy_sz, tot_sz =0;
    int * recvcnts, * displs, * idx_arr;
    char * pmy_data, * pall_data;
    if (wrld->rank == 0)
      printf("Printing tensor %s\n",name);
    if (cutoff != NULL){
      read_local_nnz(&imy_sz, &pmy_data, true);
    if (wrld->rank == 0){
      alloc_ptr(wrld->np*sizeof(int), (void**)&recvcnts);
      alloc_ptr(wrld->np*sizeof(int), (void**)&displs);
      alloc_ptr(order*sizeof(int), (void**)&idx_arr);
    MPI_Gather(&my_sz, 1, MPI_INT, recvcnts, 1, MPI_INT, 0, wrld->cdt.cm);
    if (wrld->rank == 0){
      for (int i=0; i<wrld->np; i++){
        recvcnts[i] *= sr->pair_size();
      for (int i=1; i<wrld->np; i++){
        displs[i] = displs[i-1] + recvcnts[i-1];
      tot_sz = (displs[wrld->np-1] + recvcnts[wrld->np-1])/sr->pair_size();
      pall_data = sr->pair_alloc(tot_sz);
    if (wrld->cdt.np == 1)
      pall_data = pmy_data;
      MPI_Gatherv(pmy_data, my_sz*sr->pair_size(), MPI_CHAR,
                  pall_data, recvcnts, displs, MPI_CHAR, 0, wrld->cdt.cm);
    if (wrld->rank == 0){
      all_data.sort(tot_sz);
      for (int64_t i=0; i<tot_sz; i++){
        k = all_data[i].k();
        for (int j=0; j<order; j++){
            idx_arr[j] = k%lens[j];
        for (int j=0; j<order; j++){
          fprintf(fp,"[%d]",idx_arr[j]);
        fprintf(fp,"(%ld, <",all_data[i].k());
        sr->print(all_data[i].d(), fp);
      if (pall_data != pmy_data) sr->pair_dealloc(pall_data);
    if (pmy_data != NULL) sr->pair_dealloc(pmy_data);
  void tensor::compare(const tensor * A, FILE * fp, char const * cutoff){
    int64_t imy_sz, tot_sz =0, my_sz_B;
    int * recvcnts, * displs, * idx_arr;
    assert(my_sz == my_sz_B);
    if (global_comm.rank == 0){
      alloc_ptr(global_comm.np*sizeof(int), (void**)&recvcnts);
      alloc_ptr(global_comm.np*sizeof(int), (void**)&displs);
    MPI_Gather(&my_sz, 1, MPI_INT, recvcnts, 1, MPI_INT, 0, global_comm.cm);
    if (global_comm.rank == 0){
      for (i=0; i<global_comm.np; i++){
      for (i=1; i<global_comm.np; i++){
        displs[i] = displs[i-1] + recvcnts[i-1];
      tot_sz = (displs[global_comm.np-1]
    if (my_sz == 0) my_data_A = my_data_B = NULL;
    MPI_Gatherv(my_data_A, my_sz*A->sr->pair_size(), MPI_CHAR,
                all_data_A, recvcnts, displs, MPI_CHAR, 0, global_comm.cm);
    MPI_Gatherv(my_data_B, my_sz*A->sr->pair_size(), MPI_CHAR,
                all_data_B, recvcnts, displs, MPI_CHAR, 0, global_comm.cm);
    if (global_comm.rank == 0){
      pall_data_A.sort(tot_sz);
      pall_data_B.sort(tot_sz);
      for (i=0; i<tot_sz; i++){
        A->sr->abs(pall_data_A[i].d(), aA);
        A->sr->min(aA, cutoff, aA);
        B->sr->abs(pall_data_B[i].d(), aB);
        B->sr->min(aB, cutoff, aB);
          k = pall_data_A[i].k();
          for (j=0; j<A->order; j++){
            idx_arr[j] = k%A->lens[j];
          for (j=0; j<A->order; j++){
            fprintf(fp,"[%d]",idx_arr[j]);
          A->sr->print(pall_data_A[i].d(),fp);
          A->sr->print(pall_data_B[i].d(),fp);
      sr->pair_dealloc(all_data_A);
      sr->pair_dealloc(all_data_B);
  void tensor::unfold(bool was_mod){
    int i, j, allfold_dim;
    int * all_edge_len, * sub_edge_len;
    if (this->is_folded){
      calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
               NULL, sub_edge_len, NULL);
      for (i=0; i<this->order; i++){
        if (this->sym[i] == NS){
          while (i-j >= 0 && this->sym[i-j] != NS) j++;
          all_edge_len[allfold_dim] = sy_packed_size(j, sub_edge_len+i-j+1,
        nosym_transpose(this, allfold_dim, all_edge_len, this->inner_ordering, 0);
        assert(!left_home_transp);
        ASSERT(this->nrow_idx != -1);
          despmatricize(this->nrow_idx, this->is_csr);
      this->rec_tsr->is_data_aliased=1;
      delete this->rec_tsr;
    this->is_folded = 0;
  void tensor::remove_fold(){
    delete this->rec_tsr;
    this->is_folded = 0;

  double tensor::est_time_unfold(){
    int i, j, allfold_dim;
    int * all_edge_len, * sub_edge_len;
    if (!this->is_folded) return 0.0;
    calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
             NULL, sub_edge_len, NULL);
    for (i=0; i<this->order; i++){
      if (this->sym[i] == NS){
        while (i-j >= 0 && this->sym[i-j] != NS) j++;
        all_edge_len[allfold_dim] = sy_packed_size(j, sub_edge_len+i-j+1,
    est_time = this->calc_nvirt()*est_time_transp(allfold_dim, this->inner_ordering, all_edge_len, 0, sr);
  void tensor::fold(int         nfold,
                    int const * fold_idx,
                    int const * idx_map,
    int i, j, k, fdim, allfold_dim, is_fold, fold_dim;
    int * sub_edge_len, * fold_edge_len, * all_edge_len, * dim_order;
    if (this->is_folded != 0) this->unfold();
    allfold_dim = 0, fold_dim = 0;
    for (j=0; j<this->order; j++){
      if (this->sym[j] == NS){
        for (i=0; i<nfold; i++){
          if (fold_idx[i] == idx_map[j])
    calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
       NULL, sub_edge_len, NULL);
    allfold_dim = 0, fdim = 0;
    for (j=0; j<this->order; j++){
      if (this->sym[j] == NS){
        while (j-k >= 0 && this->sym[j-k] != NS) k++;
        all_edge_len[allfold_dim] = sy_packed_size(k, sub_edge_len+j-k+1,
        for (i=0; i<nfold; i++){
          if (fold_idx[i] == idx_map[j]){
            while (j-k >= 0 && this->sym[j-k] != NS) k++;
          dim_order[fdim] = allfold_dim;
          dim_order[fold_dim+allfold_dim-fdim] = allfold_dim;
    std::fill(fold_sym, fold_sym+fold_dim, NS);
    fold_tsr = new tensor(sr, fold_dim, fold_edge_len, fold_sym, wrld, 0);
    this->is_folded      = 1;
    this->rec_tsr        = fold_tsr;
    this->inner_ordering = dim_order;
    *all_fdim = allfold_dim;
    *all_flen = all_edge_len;
  void tensor::pull_alias(tensor const * other){
      this->topo = other->topo;
      this->data = other->data;
      this->is_home = other->is_home;
      this->set_padding();

  void tensor::clear_mapping(){
    for (j=0; j<this->order; j++){
      map = this->edge_map + j;
    this->is_mapped = 0;
    this->is_folded = 0;
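  // redistribute() picks the cheapest applicable strategy: block_reshuffle()
  // when old and new distributions are phase-compatible and no offsets or
  // permutations are requested, a bulk sparse write of the old pairs into the
  // new mapping for sparse tensors, and dgtog_reshuffle() otherwise; under
  // debugging, the result is checked element-by-element against
  // padded_reshuffle().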
                           int const *  old_offsets,
                           int * const * old_permutation,
                           int const *  new_offsets,
                           int * const * new_permutation){
    int can_block_shuffle;
    char * shuffled_data;
    char * shuffled_data_corr;
    if (is_sparse) can_block_shuffle = 0;
  #ifdef USE_BLOCK_RESHUFFLE
      can_block_shuffle = 0;
      if (old_offsets != NULL || old_permutation != NULL ||
          new_offsets != NULL || new_permutation != NULL){
        can_block_shuffle = 0;
    if (size > INT_MAX && !is_sparse && wrld->cdt.rank == 0)
      printf("CTF WARNING: Tensor %s is being redistributed to a mapping where its size is %ld, which is greater than INT_MAX=%d, so MPI could run into problems\n", name, size, INT_MAX);
  #ifdef HOME_CONTRACT
      if (wrld->cdt.rank == 0)
        DPRINTF(2,"Tensor %s leaving home %d\n", name, is_sparse);
        if (this->has_home){
          this->home_buffer = sr->pair_alloc(nnz_loc);
          sr->copy_pairs(this->home_buffer, this->data, nnz_loc);
        this->data = sr->alloc(old_dist.size);
        sr->copy(this->data, this->home_buffer, old_dist.size);
    if (this->profile) {
      strcpy(spf,"redistribute_");
      strcat(spf,this->name);
      if (wrld->cdt.rank == 0){
    if (wrld->cdt.rank == 0){
      if (can_block_shuffle) VPRINTF(1,"Remapping tensor %s via block_reshuffle to mapping\n",this->name);
      else if (is_sparse) VPRINTF(1,"Remapping tensor %s via sparse reshuffle to mapping\n",this->name);
      else VPRINTF(1,"Remapping tensor %s via cyclic_reshuffle to mapping\n",this->name);
    this->print_map(stdout);
      padded_reshuffle(sym, old_dist, new_dist, this->data, &shuffled_data_corr, sr, wrld->cdt);
    if (can_block_shuffle){
      block_reshuffle(old_dist, new_dist, this->data, shuffled_data, sr, wrld->cdt);
      sr->dealloc(this->data);
        double nnz_frac_ = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np);
        double tps_[] = {0.0, 1.0, (double)log2(wrld->cdt.np),  (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac_};
        double st_time = MPI_Wtime();
        char * old_data = this->data;
        int64_t old_nnz = nnz_loc;
        nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
        std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
        this->write(old_nnz, sr->mulid(), sr->addid(), old_data);
        shuffled_data = this->data;
        if (old_data != NULL){
          sr->pair_dealloc(old_data);
        double exe_time = MPI_Wtime()-st_time;
        double nnz_frac = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np);
        double tps[] = {exe_time, 1.0, (double)log2(wrld->cdt.np),  (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac};
        dgtog_reshuffle(sym, lens, old_dist, new_dist, &this->data, &shuffled_data, sr, wrld->cdt);
    this->data = shuffled_data;
    if (!is_sparse && sr->addid() != NULL){
      bool abortt = false;
      for (int64_t j=0; j<this->size; j++){
        if (!sr->isequal(this->data+j*sr->el_size, shuffled_data_corr+j*sr->el_size)){
          printf("data element %ld/%ld not received correctly on process %d\n",
                  j, this->size, wrld->cdt.rank);
          printf("element received was ");
          sr->print(this->data+j*sr->el_size);
          printf(", correct ");
          sr->print(shuffled_data_corr+j*sr->el_size);
      sr->dealloc(shuffled_data_corr);
    if (this->profile) {
      strcpy(spf,"redistribute_");
      strcat(spf,this->name);
      if (wrld->cdt.rank == 0){
  double tensor::est_redist_time(distribution const & old_dist, double nnz_frac){
    int nvirt = (int64_t)calc_nvirt();
    if (is_sparse) can_blres = 0;
  #ifdef USE_BLOCK_RESHUFFLE
    for (int i=0; i<order; i++){
    double est_time = 0.0;
      est_time += blres_est_time(std::max(old_dist.size,this->size)*this->sr->el_size*nnz_frac, nvirt, old_nvirt);
      if (this->is_sparse)
        est_time += spredist_est_time(this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac, wrld->cdt.np);
        est_time += dgtog_est_time(this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac, wrld->cdt.np);
  int64_t tensor::get_redist_mem(distribution const & old_dist, double nnz_frac){
    if (is_sparse) can_blres = 0;
  #ifdef USE_BLOCK_RESHUFFLE
      return (int64_t)this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac;
        return (int64_t)this->sr->pair_size()*std::max(this->size,old_dist.size)*nnz_frac*3;
        return (int64_t)this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac*2.5;
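  // map_tensor_rem() completes a partial mapping: already-mapped tensor
  // dimensions are marked restricted, physical grid dimensions already in use
  // are excluded, and map_tensor() is re-run over the remaining physical
  // communicators collected in sub_phys_comm/comm_idx.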
  int tensor::map_tensor_rem(int        num_phys_dims,
    int i, num_sub_phys_dims, stat;
    int * restricted, * phys_mapped, * comm_idx;
    memset(phys_mapped, 0, num_phys_dims*sizeof(int));
    for (i=0; i<this->order; i++){
      restricted[i] = (this->edge_map[i].type != NOT_MAPPED);
      map = &this->edge_map[i];
        phys_mapped[map->cdt] = 1;
    num_sub_phys_dims = 0;
    for (i=0; i<num_phys_dims; i++){
      if (phys_mapped[i] == 0){
        num_sub_phys_dims++;
    num_sub_phys_dims = 0;
    for (i=0; i<num_phys_dims; i++){
      if (phys_mapped[i] == 0){
        sub_phys_comm[num_sub_phys_dims] = phys_comm[i];
        comm_idx[num_sub_phys_dims] = i;
        num_sub_phys_dims++;
    stat = map_tensor(num_sub_phys_dims,  this->order,
                      this->pad_edge_len,  this->sym_table,
                      restricted,   sub_phys_comm,
  int tensor::extract_diag(int const * idx_map,
                           int **      idx_map_new){
    int i, j, k, * edge_len, * nsym, * ex_idx_map, * diag_idx_map;
    for (i=0; i<this->order; i++){
      for (j=i+1; j<this->order; j++){
        if (idx_map[i] == idx_map[j]){
          for (k=0; k<this->order; k++){
              diag_idx_map[k]    = k;
              edge_len[k]        = this->pad_edge_len[k]-this->padding[k];
              (*idx_map_new)[k]  = idx_map[k];
                if (this->sym[k] == this->sym[j]) nsym[k] = this->sym[k];
                nsym[k] = this->sym[k];
              ex_idx_map[k]       = k-1;
              diag_idx_map[k-1]   = k-1;
              edge_len[k-1]       = this->pad_edge_len[k]-this->padding[k];
              nsym[k-1]           = this->sym[k];
              (*idx_map_new)[k-1] = idx_map[k];
            int64_t lda_i=1, lda_j=1;
            for (int ii=0; ii<i; ii++){
            for (int jj=0; jj<j; jj++){
              new_tsr = new tensor(sr, this->order-1, edge_len, nsym, wrld, 1, name, 1, is_sparse);
              for (int p=0; p<nnz_loc; p++){
                int64_t k = pi[p].k();
                if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]) nw++;
              char * pwdata = sr->pair_alloc(nw);
              for (int p=0; p<nnz_loc; p++){
                int64_t k = pi[p].k();
                if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]){
                  int64_t k_new = (k%lda_j)+(k/(lda_j*lens[j])*lda_j);
                  ((int64_t*)(wdata[nw].ptr))[0] = k_new;
              new_tsr->write(nw, sr->mulid(), sr->addid(), pwdata);
              sr->pair_dealloc(pwdata);
              #pragma omp parallel for
              for (int p=0; p<nw; p++){
                int64_t k = wdata[p].k();
                int64_t kpart = (k/lda_i)%lens[i];
                int64_t k_new = (k%lda_j)+((k/lda_j)*lens[j]+kpart)*lda_j;
                ((int64_t*)(wdata[p].ptr))[0] = k_new;
              for (int p=0; p<nnz_loc; p++){
                int64_t k = pi[p].k();
                if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]){
              this->write(nw, NULL, NULL, pwdata);
              sr->pair_dealloc(pwdata);
              new_tsr = new tensor(sr, this->order-1, edge_len, nsym, wrld, 1, name, 1, is_sparse);
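  // zero_out_padding() recomputes the per-dimension phases of the current
  // mapping and zeroes the padded region of each local block; it is a no-op
  // for sparse tensors, tensors with a zero edge length, and algebraic
  // structures without an additive identity.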
  int tensor::zero_out_padding(){
    int i, num_virt, idx_lyr;
    int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
    if (this->has_zero_edge_len || is_sparse){
    this->set_padding();
    if (!this->is_mapped || sr->addid() == NULL){
      idx_lyr = wrld->rank;
      for (i=0; i<this->order; i++){
        map               = this->edge_map + i;
        virt_phase[i]     = phase[i]/phys_phase[i];
        num_virt          = num_virt*virt_phase[i];
          idx_lyr -= topo->lda[map->cdt]
                     this->pad_edge_len, this->sym, this->padding,
                     phase, phys_phase, virt_phase, virt_phys_rank, this->data, sr);
        std::fill(this->data, this->data+np, 0.0);
  void tensor::scale_diagonals(int const * sym_mask){
    int i, num_virt, idx_lyr;
    int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
    this->set_padding();
    if (!this->is_mapped){
      idx_lyr = wrld->rank;
      for (i=0; i<this->order; i++){
        map               = this->edge_map + i;
        virt_phase[i]     = phase[i]/phys_phase[i];
        num_virt          = num_virt*virt_phase[i];
          idx_lyr -= topo->lda[map->cdt]
                  this->pad_edge_len, this->sym, this->padding,
                  phase, phys_phase, virt_phase, virt_phys_rank, this->data, sr, sym_mask);
  void tensor::addinv(){
      #pragma omp parallel for
      for (int64_t i=0; i<nnz_loc; i++){
        sr->addinv(pi[i].d(), pi[i].d());
      #pragma omp parallel for
      for (int64_t i=0; i<size; i++){
        sr->addinv(data+i*sr->el_size,data+i*sr->el_size);
  void tensor::set_sym(int const * sym_){
      std::fill(this->sym, this->sym+order, NS);
      memcpy(this->sym, sym_, order*sizeof(int));
    memset(sym_table, 0, order*order*sizeof(int));
    for (int i=0; i<order; i++){
      if (this->sym[i] != NS) {
        sym_table[(i+1)+i*order] = 1;
        sym_table[(i+1)*order+i] = 1;
  void tensor::set_new_nnz_glb(int64_t const * nnz_blk_){
      for (int i=0; i<calc_nvirt(); i++){
        nnz_blk[i] = nnz_blk_[i];
        nnz_loc += nnz_blk[i];
      wrld->cdt.allred(&nnz_loc, &nnz_tot, 1, MPI_INT64_T, MPI_SUM);
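  // spmatricize()/despmatricize() convert between the multidimensional sparse
  // pair layout and per-virtual-block m-by-n COO or CSR matrices (rows split
  // out at nrow_idx); rec_tsr->nnz_blk records each block's matrix size in
  // bytes, as computed by get_csr_size()/get_coo_size().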
  void tensor::spmatricize(int m, int n, int nrow_idx, bool csr){
    MPI_Barrier(this->wrld->comm);
    int64_t new_sz_A = 0;
    this->rec_tsr->is_sparse = 1;
    int nvirt_A = calc_nvirt();
    this->rec_tsr->nnz_blk = (int64_t*)alloc(nvirt_A*sizeof(int64_t));
    for (int i=0; i<nvirt_A; i++){
        this->rec_tsr->nnz_blk[i] = get_csr_size(this->nnz_blk[i], m, this->sr->el_size);
        this->rec_tsr->nnz_blk[i] = get_coo_size(this->nnz_blk[i], this->sr->el_size);
      new_sz_A += this->rec_tsr->nnz_blk[i];
    this->rec_tsr->is_data_aliased = false;
    int phase[this->order];
    for (int i=0; i<this->order; i++){
      phase[i] = this->edge_map[i].calc_phase();
    char * data_ptr_out = this->rec_tsr->data;
    char const * data_ptr_in = this->data;
    for (int i=0; i<nvirt_A; i++){
        cm.set_data(this->nnz_blk[i], this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_in, this->sr, phase);
        CSR_Matrix cs(cm, m, n, this->sr, data_ptr_out);
        cm.set_data(this->nnz_blk[i], this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_in, this->sr, phase);
      data_ptr_in += this->nnz_blk[i]*this->sr->pair_size();
      data_ptr_out += this->rec_tsr->nnz_blk[i];
    this->nrow_idx = nrow_idx;
    MPI_Barrier(this->wrld->comm);
  void tensor::despmatricize(int nrow_idx, bool csr){
    MPI_Barrier(this->wrld->comm);
    this->rec_tsr->is_sparse = 1;
    int nvirt = calc_nvirt();
    for (int i=0; i<nvirt; i++){
      if (this->rec_tsr->nnz_blk[i]>0){
      offset += this->rec_tsr->nnz_blk[i];
    this->data = sr->pair_alloc(new_sz);
    int phase[this->order];
    int phys_phase[this->order];
    int phase_rank[this->order];
    for (int i=0; i<this->order; i++){
      phase[i] = this->edge_map[i].calc_phase();
      phys_phase[i]     = this->edge_map[i].calc_phys_phase();
      phase_rank[i] = this->edge_map[i].calc_phys_rank(this->topo);
    char * data_ptr_out = this->data;
    char const * data_ptr_in = this->rec_tsr->data;
    for (int i=0; i<nvirt; i++){
      if (this->rec_tsr->nnz_blk[i]>0){
          cm.get_data(cs.nnz(), this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_out, this->sr, phase, phase_rank);
          this->nnz_blk[i] = cm.nnz();
          cm.get_data(cm.nnz(), this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_out, this->sr, phase, phase_rank);
          this->nnz_blk[i] = cm.nnz();
      } else this->nnz_blk[i] = 0;
      data_ptr_out += this->nnz_blk[i]*this->sr->pair_size();
      data_ptr_in += this->rec_tsr->nnz_blk[i];
          phase_rank[j]+=phys_phase[j];
          if (phase_rank[j]>=phase[j])
            phase_rank[j]=phase_rank[j]%phase[j];
    set_new_nnz_glb(this->nnz_blk);
    this->rec_tsr->is_csr = csr;
    MPI_Barrier(this->wrld->comm);
  void tensor::leave_home_with_buffer(){
  #ifdef HOME_CONTRACT
    if (this->has_home){
      if (!this->is_home){
          sr->pair_dealloc(this->home_buffer);
          this->home_buffer = NULL;
          sr->dealloc(this->home_buffer);
          this->home_buffer = this->data;
      if (wrld->rank == 0) DPRINTF(2,"Deleting home (leave) of %s\n",name);
  void tensor::register_size(int64_t sz){
    registered_alloc_size = sz;

  void tensor::deregister_size(){
    registered_alloc_size = 0;
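  // write_dense_to_file() splits the dense global index space evenly across
  // ranks (tot_els/np, remainder to the first ranks), reads the local chunk
  // as key-value pairs, compacts the values in place, and writes them at
  // byte offset my_chnk_st*el_size with MPI_File_write_at().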
  void tensor::write_dense_to_file(MPI_File & file, int64_t offset){
    bool need_unpack = is_sparse;
    for (int i=0; i<order; i++){
      if (sym[i] != NS) need_unpack = true;
      std::fill(nsym, nsym+order, NS);
      tensor t_dns(sr, order, lens, nsym, wrld);
      t_dns["ij"] = (*this)["ij"];
      int64_t chnk_sz = tot_els/wrld->np;
      int64_t my_chnk_sz = chnk_sz;
      if (wrld->rank < tot_els%wrld->np) my_chnk_sz++;
      int64_t my_chnk_st = chnk_sz*wrld->rank + std::min((int64_t)wrld->rank, tot_els%wrld->np);
      char * my_pairs = (char*)alloc(sr->pair_size()*my_chnk_sz);
      for (int64_t i=0; i<my_chnk_sz; i++){
      this->read(my_chnk_sz, my_pairs);
      for (int64_t i=0; i<my_chnk_sz; i++){
        char val[sr->el_size];
        memcpy(my_pairs+i*sr->el_size, val, sr->el_size);
      MPI_Offset off = my_chnk_st*sr->el_size+offset;
      MPI_File_write_at(file, off, my_pairs, my_chnk_sz, sr->mdtype(), &stat);

  void tensor::write_dense_to_file(char const * filename){
    MPI_File_open(this->wrld->comm, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &file);
    this->write_dense_to_file(file, 0);
    MPI_File_close(&file);
  void tensor::read_dense_from_file(MPI_File & file, int64_t offset){
    bool need_unpack = is_sparse;
    for (int i=0; i<order; i++){
      if (sym[i] != NS) need_unpack = true;
    }
    if (need_unpack){
      //read into a dense, nonsymmetric intermediate, then fold back into this tensor
      int nsym[order];
      std::fill(nsym, nsym+order, NS);
      tensor t_dns(sr, order, lens, nsym, wrld);
      t_dns.read_dense_from_file(file, offset);
      summation ts(&t_dns, "ij", sr->mulid(), this, "ij", sr->addid());
      ts.execute();
      if (is_sparse) this->sparsify();
    } else {
      //partition the global element range evenly over processors
      int64_t tot_els = packed_size(order, lens, sym);
      int64_t chnk_sz = tot_els/wrld->np;
      int64_t my_chnk_sz = chnk_sz;
      if (wrld->rank < tot_els%wrld->np) my_chnk_sz++;
      int64_t my_chnk_st = chnk_sz*wrld->rank + std::min((int64_t)wrld->rank, tot_els%wrld->np);

      //read raw values into the tail of the pair buffer, so that pairs can be
      //formed in place without a second buffer
      char * my_pairs = (char*)alloc(sr->pair_size()*my_chnk_sz);
      char * my_pairs_tail = my_pairs + sizeof(int64_t)*my_chnk_sz;
      MPI_Status stat;
      MPI_Offset off = my_chnk_st*sr->el_size+offset;
      MPI_File_read_at(file, off, my_pairs_tail, my_chnk_sz, sr->mdtype(), &stat);

      PairIterator pi(sr, my_pairs);
      for (int64_t i=0; i<my_chnk_sz; i++){
        char val[sr->el_size];
        memcpy(val, my_pairs_tail+i*sr->el_size, sr->el_size);
        pi[i].write_key(my_chnk_st+i);
        pi[i].write_val(val);
      }

      this->write(my_chnk_sz, sr->mulid(), sr->addid(), my_pairs);
      cdealloc(my_pairs);
    }
  }
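  //Forming pairs in place above is safe for simple element types, where
  //pair_size() == sizeof(int64_t) + el_size: pair i, written at the front of
  //the buffer, ends at (i+1)*(sizeof(int64_t)+el_size), which never reaches
  //the first still-unread raw value at sizeof(int64_t)*n + (i+1)*el_size.
  //A standalone check of that non-overlap invariant (not CTF API):
#if 0
#include <cassert>
#include <cstdint>
int main(){
  int64_t key_sz = sizeof(int64_t);
  for (int64_t el_size : {1, 8, 16}){
    for (int64_t n : {1, 5, 1000}){
      for (int64_t i=0; i<n; i++){
        int64_t pair_end = (i+1)*(key_sz+el_size);   //last byte written for pair i
        int64_t next_val = key_sz*n + (i+1)*el_size; //first raw value not yet consumed
        assert(pair_end <= next_val);
      }
    }
  }
}
#endif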
  void tensor::read_dense_from_file(char const * filename){
    MPI_File file;
    MPI_File_open(this->wrld->comm, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &file);
    this->read_dense_from_file(file, 0);
    MPI_File_close(&file);
  }
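  //Usage sketch: round-tripping a dense tensor through a binary file with the
  //two filename wrappers above. This assumes the public CTF::Tensor wrapper
  //exposes write_dense_to_file/read_dense_from_file under the same names
  //(hypothetical snippet, check ctf.hpp):
#if 0
#include <cassert>
#include <ctf.hpp>
int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  {
    CTF::World dw(MPI_COMM_WORLD);
    int lens[2] = {4, 4};
    CTF::Tensor<double> A(2, lens, dw);
    CTF::Tensor<double> B(2, lens, dw);
    A.fill_random(0.0, 1.0);
    A.write_dense_to_file("A.bin");  //collective write in element order
    B.read_dense_from_file("A.bin"); //collective read back
    B["ij"] -= A["ij"];
    assert(B.norm2() < 1e-12); //assumes norm2() returns the Frobenius norm
  }
  MPI_Finalize();
}
#endif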
  tensor * tensor::self_reduce(int const * idx_A,
                               int **      new_idx_A,
                               int         order_B,
                               int const * idx_B,
                               int **      new_idx_B,
                               int         order_C,
                               int const * idx_C,
                               int **      new_idx_C){
    //check whether this tensor is already the operand of a one-index reduction
    //(a summation that drops exactly one index), to avoid reducing it again
    if (order_C == 0 && this->order == order_B + 1){
      bool all_match_except_one = true;
      bool one_skip = false;
      int iiA = 0, iiB = 0;
      while (iiA < this->order){
        if (iiB >= order_B || idx_A[iiA] != idx_B[iiB]){
          if (one_skip) all_match_except_one = false;
          else one_skip = true;
          iiA++;
        } else {
          iiA++;
          iiB++;
        }
      }
      if (all_match_except_one && one_skip) return this;
    }
    //look for an index of A that matches no other index of A, B, or C
    for (int i=0; i<this->order; i++){
      int iA = idx_A[i];
      bool has_match = false;
      for (int j=0; j<this->order; j++){
        if (j != i && idx_A[j] == iA) has_match = true;
      }
      for (int j=0; j<order_B; j++){
        if (idx_B[j] == iA) has_match = true;
      }
      for (int j=0; j<order_C; j++){
        if (idx_C[j] == iA) has_match = true;
      }
      //index i appears nowhere else, so it can be summed away up front
      if (!has_match){
        int new_len[this->order-1];
        int new_sym[this->order-1];
        int sum_A_idx[this->order];
        int sum_B_idx[this->order-1];
        *new_idx_A = (int*)alloc(sizeof(int)*(this->order-1));
        *new_idx_B = (int*)alloc(sizeof(int)*order_B);
        if (order_C > 0) *new_idx_C = (int*)alloc(sizeof(int)*order_C);
        int max_idx = 0;
        for (int j=0; j<this->order; j++){
          max_idx = std::max(max_idx, idx_A[j]);
          sum_A_idx[j] = j;
          if (j < i){
            new_len[j] = this->lens[j];
            (*new_idx_A)[j] = idx_A[j];
            new_sym[j] = this->sym[j];
            if (j == i-1){
              if (this->sym[i] == NS) new_sym[j] = NS;
            }
            sum_B_idx[j] = j;
          } else if (j > i){
            new_len[j-1] = this->lens[j];
            new_sym[j-1] = this->sym[j];
            (*new_idx_A)[j-1] = idx_A[j];
            sum_B_idx[j-1] = j;
          }
        }
        for (int j=0; j<this->order; j++){
          max_idx = std::max(max_idx, idx_A[j]);
        }
        for (int j=0; j<order_B; j++){
          (*new_idx_B)[j] = idx_B[j];
          max_idx = std::max(max_idx, idx_B[j]);
        }
        for (int j=0; j<order_C; j++){
          (*new_idx_C)[j] = idx_C[j];
          max_idx = std::max(max_idx, idx_C[j]);
        }
        //keep the index space compact: move the largest index into the slot
        //freed by the reduced index iA
        if (max_idx != iA){
          for (int j=0; j<this->order-1; j++){
            if ((*new_idx_A)[j] == max_idx)
              (*new_idx_A)[j] = iA;
          }
          for (int j=0; j<order_B; j++){
            if ((*new_idx_B)[j] == max_idx)
              (*new_idx_B)[j] = iA;
          }
          for (int j=0; j<order_C; j++){
            if ((*new_idx_C)[j] == max_idx)
              (*new_idx_C)[j] = iA;
          }
        }
        tensor * new_tsr = new tensor(this->sr, this->order-1, new_len, new_sym, this->wrld, 1, this->name, 1, this->is_sparse);
        summation s(this, sum_A_idx, this->sr->mulid(), new_tsr, sum_B_idx, this->sr->mulid());
        s.execute();
        return new_tsr;
      }
    }
    return this;
  }
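  //What self_reduce exploits, in a small example: if index k of A appears
  //neither elsewhere in A nor in B or C, then
  //  c["i"] += A["ik"] * b["i"]
  //can be computed from the cheaper, lower-order reduction r["i"] = sum_k A["ik"].
  //A hedged illustration through the public API (hypothetical snippet):
#if 0
#include <ctf.hpp>
int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  {
    CTF::World dw(MPI_COMM_WORLD);
    int lens2[2] = {4, 8};
    CTF::Tensor<double> A(2, lens2, dw);
    CTF::Vector<double> b(4, dw), c(4, dw), r(4, dw);
    A.fill_random(0.0, 1.0);
    b.fill_random(0.0, 1.0);
    c["i"] += A["ik"] * b["i"]; //k is unmatched, a self-reduction candidate
    r["i"] += A["ik"];          //the reduction self_reduce would form
    c["i"] -= r["i"] * b["i"];  //same contribution from the reduced operand, c is now ~0
  }
  MPI_Finalize();
}
#endif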
 