2 #include "../interface/common.h" 3 #include "../interface/timer.h" 4 #include "../interface/idx_tensor.h" 5 #include "../summation/summation.h" 6 #include "../contraction/contraction.h" 8 #include "../shared/util.h" 9 #include "../shared/memcontrol.h" 10 #include "../redistribution/sparse_rw.h" 11 #include "../redistribution/pad.h" 12 #include "../redistribution/nosym_transp.h" 13 #include "../redistribution/redist.h" 14 #include "../redistribution/cyclic_reshuffle.h" 15 #include "../redistribution/glb_cyclic_reshuffle.h" 16 #include "../redistribution/dgtog_redist.h" 25 double ps[] = {1.0, (double)log2(np), (double)size*log2(np)};
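  // The ps[] initializer above is presumably the parameter vector of one of the
  // linear performance models (constant, log2(np), and size*log2(np) terms);
  // the enclosing model-initialization routine is not part of this excerpt, and
  // np/size are assumed to be the process count and message size.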
void tensor::free_self(){
  if (wrld->rank == 0) DPRINTF(3,"Deleted order %d tensor %s\n",order,name);
  if (is_folded) unfold();
  if (!is_data_aliased){
    if (!is_sparse) sr->dealloc(home_buffer);
    sr->pair_dealloc(data);
    if (!is_sparse) sr->dealloc(data);
    else sr->pair_dealloc(data);
    if (has_home && !is_home){
      if (!is_sparse) sr->dealloc(home_buffer);
      else sr->pair_dealloc(home_buffer);
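// free_self(): releases the buffers owned by this tensor. Folded data is first
// unfolded; aliased data is never freed. Dense buffers are released with
// sr->dealloc() and sparse (key,value) pair buffers with sr->pair_dealloc();
// a home buffer distinct from the current data buffer is freed separately.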
  this->init(sr, order,edge_len,sym,wrld,alloc_data,name,profile,is_sparse);

  this->init(sr, order,edge_len,sym,wrld,0,name,profile,is_sparse);
  set_distribution(idx, prl, blk);
  nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
  std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
  this->home_buffer = NULL;
  this->data = sr->alloc(this->size);
  this->sr->set(this->data, this->sr->addid(), this->size);
  this->home_size = this->size;
  register_size(home_size*sr->el_size);
  this->home_buffer = this->data;
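// The constructor bodies above delegate to init(). The distribution-specifying
// variant additionally applies the user-provided mapping via set_distribution(),
// allocates and zeroes the per-block nonzero counters (nnz_blk) for sparse
// tensors, and allocates/initializes the data buffer, registering it as the
// home buffer.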
tensor::tensor(tensor const * other, bool copy, bool alloc_data){
  char * nname = (char*)alloc(strlen(other->name) + 2);
  strcpy(nname, other->name);
  DPRINTF(2,"Cloning tensor %s into %s copy=%d alloc_data=%d\n",
          other->name, nname, copy, alloc_data);
             other->sym, other->wrld, (!copy & alloc_data), nname,
    copy_tensor_data(other);
  } else if (!alloc_data) data = NULL;
tensor::tensor(tensor * other, int const * new_sym){
  char * nname = (char*)alloc(strlen(other->name) + 2);
  strcpy(nname, other->name);
  DPRINTF(2,"Repacking tensor %s into %s\n",other->name,nname);
  bool has_chng=false, less_sym=false, more_sym=false;
  for (int i=0; i<other->order; i++){
    if (other->sym[i] != new_sym[i]){
      DPRINTF(2,"sym[%d] was %d now %d\n",i,other->sym[i],new_sym[i]);
      if (other->sym[i] == NS){
      if (new_sym[i] == NS){
  if (!less_sym && !more_sym){
               new_sym, other->wrld, 0, nname,
    copy_tensor_data(other);
               new_sym, other->wrld, 1, nname,
    for (int j=0; j<order; j++){
    summation ts(other, idx, sr->mulid(), this, idx, sr->addid());
void tensor::copy_tensor_data(tensor const * other){
  register_size(this->home_size*sr->el_size);
  this->home_buffer = sr->alloc(other->home_size);
  this->data = this->home_buffer;
  this->data = sr->alloc(other->size);
  this->data = sr->alloc(other->size);
  this->data = sr->alloc(other->size);
  sr->copy(this->data, other->data, other->size);
  if (data!=NULL) sr->dealloc(this->data);
  this->data = sr->pair_alloc(other->nnz_loc);
  this->set_new_nnz_glb(other->nnz_blk);
  sr->copy_pairs(this->data, other->data, other->nnz_loc);
  if (this->is_folded){
    delete this->rec_tsr;
            (void**)&this->inner_ordering);
  for (int i=0; i<other->order; i++){
  this->rec_tsr = rtsr;
  this->order = other->order;
  memcpy(this->padding, other->padding, sizeof(int)*other->order);
  this->topo = other->topo;
  this->size = other->size;
  this->nnz_loc = other->nnz_loc;
  this->nnz_tot = other->nnz_tot;
  if (wrld->rank == 0){
    printf("New sparse tensor %s copied from %s of size %ld nonzeros (%ld bytes) locally, %ld nonzeros total:\n",
           name, other->name, this->nnz_loc, this->nnz_loc*sr->el_size, nnz_tot);
    printf("New tensor %s copied from %s of size %ld elms (%ld bytes):\n",
           name, other->name, this->size, this->size*sr->el_size);
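// copy_tensor_data(): clones another tensor's buffers. Dense data (and, when
// present, the home buffer) is allocated and copied with sr->copy(); sparse
// (key,value) pairs are copied with sr->copy_pairs() and the nonzero counts are
// propagated via set_new_nnz_glb(). Mapping metadata (order, padding, topo,
// size, nnz_loc/nnz_tot) is copied as well.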
                  int const * edge_len,
  this->sr = sr_->clone();
  this->order = order_;
  this->is_scp_padded = 0;
  this->is_data_aliased = 0;
  this->has_zero_edge_len = 0;
  this->profile = profile_;
  this->is_sparse = is_sparse_;
  this->nnz_blk = NULL;
  this->is_csr = false;
  this->left_home_transp = 0;
  this->registered_alloc_size = 0;
  this->name = (char*)alloc(strlen(name_)+1);
  strcpy(this->name, name_);
  this->name = (char*)alloc(7*sizeof(char));
  for (int i=0; i<4; i++){
    this->name[i] = 'A'+(wrld->glob_wrld_rng()%26);
  this->name[4] = '0'+(order_/10);
  this->name[5] = '0'+(order_%10);
  this->name[6] = '\0';
  this->home_buffer = NULL;
  DPRINTF(3,"Created order %d tensor %s, is_sparse = %d, allocated = %d\n",
          order,name,is_sparse,alloc_data);
  memset(this->padding, 0, order*sizeof(int));
  memcpy(this->lens, edge_len, order*sizeof(int));
  memcpy(this->pad_edge_len, lens, order*sizeof(int));
  this->set_sym(sym_);
  this->edge_map = new mapping[order];
  for (int i=0; i<order; i++){
    if (this->lens[i] <= 0) this->has_zero_edge_len = 1;
    this->edge_map[i].has_child = 0;
    this->edge_map[i].np = 1;
  int ret = set_zero();
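// init(): sets the default flags of a fresh tensor, clones the algebraic
// structure (sr_->clone()), copies edge lengths and symmetry, builds the
// edge_map array, and either adopts the caller-provided name or generates a
// short random one. It ends by calling set_zero(), which selects a mapping and
// allocates the data buffer.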
int * tensor::calc_phase() const {
  for (i=0; i<this->order; i++){
    map = this->edge_map + i;

int tensor::calc_tot_phase() const {
  int * phase = this->calc_phase();
  for (i=0; i<this->order; i++){
    tot_phase *= phase[i];

int64_t tensor::calc_nvirt() const {
  int64_t nvirt, tnvirt;
  for (j=0; j<this->order; j++){
    map = &this->edge_map[j];
    tnvirt = nvirt*(int64_t)map->np;
    if (tnvirt < nvirt) return UINT64_MAX;

int64_t tensor::calc_npe() const {
  for (j=0; j<this->order; j++){
    map = &this->edge_map[j];
void tensor::set_padding(){
  int * new_phase, * sub_edge_len;
  for (j=0; j<this->order; j++){
    map = this->edge_map + j;
    pad = this->lens[j]%new_phase[j];
    pad = new_phase[j]-pad;
    this->padding[j] = pad;
  for (i=0; i<this->order; i++){
    this->pad_edge_len[i] = this->lens[i] + this->padding[i];
    sub_edge_len[i] = this->pad_edge_len[i]/new_phase[i];
  this->size = calc_nvirt()*sy_packed_size(this->order, sub_edge_len, this->sym);

int tensor::set(char const * val) {
  sr->set(this->data, val, this->size);
  return zero_out_padding();
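// set_padding(): pads each edge length up to a multiple of the phase of its
// mapping and recomputes the local size as calc_nvirt() times the packed size
// of one virtual block. set(val): fills the local data with a constant and
// then re-zeroes the padded region.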
int tensor::set_zero() {
  int i, map_success, btopo;
  int64_t memuse, bmemuse;
  if (this->is_mapped){
    sr->pair_dealloc(this->data);
    this->home_buffer = NULL;
    memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt());
    this->set_new_nnz_glb(this->nnz_blk);
    sr->set(this->data, sr->addid(), this->size);
  bool fully_distributed = false;
  for (i=wrld->rank; i<(int64_t)wrld->topovec.size(); i+=wrld->np){
    this->clear_mapping();
    memset(restricted, 0, this->order*sizeof(int));
    map_success = map_tensor(wrld->topovec[i]->order, this->order, this->pad_edge_len,
                             this->sym_table, restricted,
                             wrld->topovec[i]->dim_comm, NULL, 0,
    if (map_success == ERROR) {
    } else if (map_success == SUCCESS){
      this->topo = wrld->topovec[i];
      memuse = (int64_t)this->size;
      DPRINTF(1,"Not enough memory %E to map tensor (size %E) on topo %d\n",
              (double)proc_bytes_available(),(double)memuse*sr->el_size,i);
      int64_t sum_phases = 0;
      int64_t prod_phys_phases = 1;
      for (int j=0; j<this->order; j++){
        int phase = this->edge_map[j].calc_phase();
        prod_phys_phases *= this->edge_map[j].calc_phys_phase();
        int max_lcm_phase = phase;
        for (int k=0; k<this->order; k++){
          max_lcm_phase = std::max(max_lcm_phase, lcm(phase,this->edge_map[k].calc_phase()));
        sum_phases += max_lcm_phase + phase;
      memuse = memuse*(1.+((double)sum_phases)/(4.*wrld->topovec[i]->glb_comm.np));
        fully_distributed = prod_phys_phases == wrld->np;
      } else if ((memuse < bmemuse && !fully_distributed) ||
                 (memuse < bmemuse && prod_phys_phases == wrld->np)){
        fully_distributed = prod_phys_phases == wrld->np;
    DPRINTF(1,"Unsuccessful in map_tensor() in set_zero()\n");
  int btopo1 = get_best_topo((1-fully_distributed)*INT64_MAX+fully_distributed*bmemuse, btopo, wrld->cdt);
  if (btopo != btopo1 && btopo1 != -1) btopo = btopo1;
  if (btopo == -1 || btopo == INT_MAX) {
    printf("ERROR: FAILED TO MAP TENSOR\n");
    MPI_Barrier(MPI_COMM_WORLD);
  memset(restricted, 0, this->order*sizeof(int));
  this->clear_mapping();
  map_success = map_tensor(wrld->topovec[btopo]->order, this->order,
                           this->pad_edge_len, this->sym_table, restricted,
                           wrld->topovec[btopo]->dim_comm, NULL, 0,
  this->topo = wrld->topovec[btopo];
  if (!is_sparse && this->size > INT_MAX && wrld->rank == 0)
    printf("CTF WARNING: Tensor %s has local size %ld, which is greater than INT_MAX=%d, so MPI could run into problems\n",
           name, size, INT_MAX);
  nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
  std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
  this->home_buffer = NULL;
  if (this->order > 0){
    this->home_size = this->size;
    this->home_buffer = sr->alloc(this->home_size);
    if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
    register_size(this->size*sr->el_size);
    this->data = this->home_buffer;
    this->data = sr->alloc(this->size);
    this->data = sr->alloc(this->size);
    printf("New tensor %s defined of size %ld elms (%ld bytes):\n",
           name, this->size, this->size*sr->el_size);
    this->print_map(stdout);
  sr->init(this->size, this->data);
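// set_zero(): if the tensor is already mapped, it simply resets its data (or
// clears its sparse blocks). Otherwise it searches the available processor
// topologies (each rank tries a round-robin subset), estimates the memory
// footprint of each candidate mapping, selects the best one collectively via
// get_best_topo(), maps the tensor onto it, allocates the (home) buffer, and
// initializes it with sr->init().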
void tensor::print_map(FILE * stream, bool allcall) const {
  if (!allcall || wrld->rank == 0){
    printf("printing mapping of sparse tensor %s\n",name);
    printf("printing mapping of dense tensor %s\n",name);
    printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
    printf(" %d ",topo->lens[dim]);
    sprintf(tname, "%s[", name);
    sprintf(tname+strlen(tname), ",");
    int tp = edge_map[dim].calc_phase();
    int pp = edge_map[dim].calc_phys_phase();
    if (tp==1) sprintf(tname+strlen(tname),"1");
    sprintf(tname+strlen(tname),"p%d(%d)",edge_map[dim].np,edge_map[dim].cdt);
    sprintf(tname+strlen(tname),"p%d(%d)",edge_map[dim].child->np,edge_map[dim].child->cdt);
    if (vp > 1) sprintf(tname+strlen(tname),"v%d",vp);
    sprintf(tname+strlen(tname),"]");
void tensor::set_name(char const * name_){
  this->name = (char*)alloc(strlen(name_)+1);
  strcpy(this->name, name_);

char const * tensor::get_name() const {

void tensor::profile_on(){

void tensor::profile_off(){

void tensor::get_raw_data(char ** data_, int64_t * size_) const {

                     int * const * permutation_A,
                     int * const * permutation_B,
  int64_t sz_A, blk_sz_A, sz_B, blk_sz_B;
  if (permutation_B != NULL){
    ASSERT(permutation_A == NULL);
    if (wrld->rank == 0 && tsr_B->is_sparse)
      printf("CTF ERROR: please use other variant of permute function when the output is sparse\n");
    ret = tsr_A->write(blk_sz_B, sr->mulid(), sr->addid(), all_data_B, 'r');
    all_data_A = all_data_B;
    ASSERT(permutation_A != NULL);
    ASSERT(permutation_B == NULL);
    ret = tsr_B->write(blk_sz_A, alpha, beta, all_data_A, 'w');
                              int & bw_mirror_rank,
                              int & fw_mirror_rank,
                              char ** sub_buffer_){
  if (order != -1) is_sub = 1;
  greater_world->cdt.allred(&is_sub, &tot_sub, 1, MPI_INT, MPI_SUM);
  if (order != -1) ASSERT(tot_sub == wrld->np);
  greater_world->cdt.allred(&order, &aorder, 1, MPI_INT, MPI_MAX);
  int sub_root_rank = 0;
  if (order >= 0 && wrld->rank == 0){
    greater_world->cdt.allred(&greater_world->rank, &sub_root_rank, 1, MPI_INT, MPI_SUM);
    greater_world->cdt.bcast(buffer, buf_sz, MPI_CHAR, sub_root_rank);
    greater_world->cdt.allred(MPI_IN_PLACE, &sub_root_rank, 1, MPI_INT, MPI_SUM);
    greater_world->cdt.bcast(buffer, buf_sz, MPI_CHAR, sub_root_rank);
  fw_mirror_rank = wrld->rank;
  MPI_Isend(&(greater_world->rank), 1, MPI_INT, wrld->rank, 13, greater_world->comm, &req);
  if (greater_world->rank < tot_sub){
    MPI_Recv(&bw_mirror_rank, 1, MPI_INT, MPI_ANY_SOURCE, 13, greater_world->comm, &stat);
  if (fw_mirror_rank >= 0){
    MPI_Wait(&req, &stat);
  MPI_Request req1, req2;
  char * sub_buffer = sr->alloc(odst->size);
  char * rbuffer = NULL;
  if (bw_mirror_rank >= 0){
    MPI_Irecv(rbuffer, buf_sz, MPI_CHAR, bw_mirror_rank, 0, greater_world->comm, &req1);
    MPI_Irecv(sub_buffer, odst->size*sr->el_size, MPI_CHAR, bw_mirror_rank, 1, greater_world->comm, &req2);
  if (fw_mirror_rank >= 0){
    MPI_Send(sbuffer, buf_sz, MPI_CHAR, fw_mirror_rank, 0, greater_world->comm);
    MPI_Send(this->data, odst->size*sr->el_size, MPI_CHAR, fw_mirror_rank, 1, greater_world->comm);
  if (bw_mirror_rank >= 0){
    MPI_Wait(&req1, &stat);
    MPI_Wait(&req2, &stat);
  sr->set(sub_buffer, sr->addid(), odst->size);
  *sub_buffer_ = sub_buffer;
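// orient_subworld(): matches the processes of this tensor's (sub)world with
// processes of a larger parent world. The subworld root broadcasts the
// serialized distribution, each rank exchanges forward/backward "mirror" ranks
// via point-to-point messages (tag 13), and a buffer of the appropriate size is
// allocated (and set to the additive identity) to receive the remapped data.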
void tensor::slice(int const * offsets_B,
                   int const * offsets_A,
  int64_t i, j, sz_A, blk_sz_A, sz_B, blk_sz_B;
  char * all_data_A, * blk_data_A;
  char * all_data_B, * blk_data_B;
  for (i=0,j=0; i<this->order && j<A->order; i++, j++){
    if (ends_A[j] - offsets_A[j] != ends_B[i] - offsets_B[i]){
      if (ends_B[i] - offsets_B[i] == 1){ j--; continue; }
      if (ends_A[j] - offsets_A[j] == 1){ i--; continue; }
      printf("CTF ERROR: slice dimensions inconsistent 1\n");
  while (A->order != 0 && i < this->order){
    if (ends_B[i] - offsets_B[i] == 1){ i++; continue; }
    printf("CTF ERROR: slice dimensions inconsistent 2\n");
  while (this->order != 0 && j < A->order){
    if (ends_A[j] - offsets_A[j] == 1){ j++; continue; }
    printf("CTF ERROR: slice dimensions inconsistent 3\n");
  for (i=0; i<tsr_B->order; i++){
    padding_B[i] = tsr_B->lens[i] - ends_B[i];
            all_data_B, blk_data_B, &blk_sz_B, sr);
  sr->pair_dealloc(all_data_B);
  for (i=0; i<tsr_B->order; i++){
    toffset_B[i] = -offsets_B[i];
    padding_B[i] = ends_B[i]-offsets_B[i]-tsr_B->lens[i];
          padding_B, pblk_data_B, sr, toffset_B);
  for (i=0; i<tsr_A->order; i++){
    toffset_A[i] = ends_A[i] - offsets_A[i];
    padding_A[i] = tsr_A->lens[i] - toffset_A[i];
          padding_A, pblk_data_B, sr, offsets_A);
  tsr_A->write(blk_sz_B, sr->mulid(), sr->addid(), blk_data_B, 'r');
  all_data_A = blk_data_B;
  for (i=0; i<tsr_A->order; i++){
    padding_A[i] = tsr_A->lens[i] - ends_A[i];
  int nosym[tsr_A->order];
  std::fill(nosym, nosym+tsr_A->order, NS);
  depad_tsr(tsr_A->order, sz_A, ends_A, nosym, padding_A, offsets_A,
            all_data_A, blk_data_A, &blk_sz_A, sr);
  for (i=0; i<tsr_A->order; i++){
    toffset_A[i] = -offsets_A[i];
    padding_A[i] = ends_A[i]-offsets_A[i]-tsr_A->lens[i];
          padding_A, pblk_data_A, sr, toffset_A);
  for (i=0; i<tsr_B->order; i++){
    toffset_B[i] = ends_B[i] - offsets_B[i];
    padding_B[i] = tsr_B->lens[i] - toffset_B[i];
          padding_B, pblk_data_A, sr, offsets_B);
  tsr_B->write(blk_sz_A, alpha, beta, blk_data_A, 'w');
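// slice(): accumulates a block of tensor A into a block of this tensor,
// B[offsets_B,ends_B) = beta*B[...] + alpha*A[offsets_A,ends_A). The code reads
// the relevant pairs, strips padding (depad_tsr), shifts keys by the offset
// difference, and writes them into the destination with the requested scaling.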
void tensor::add_to_subworld(tensor * tsr_sub,
#ifdef USE_SLICE_FOR_SUBWORLD
  int offsets[this->order];
  memset(offsets, 0, this->order*sizeof(int));
  if (tsr_sub->order == -1){
    stsr.slice(NULL, NULL, beta, this, offsets, offsets, alpha);
    tsr_sub->slice(offsets, lens, beta, this, offsets, lens, alpha);
  int fw_mirror_rank, bw_mirror_rank;
  tsr_sub->orient_subworld(wrld, bw_mirror_rank, fw_mirror_rank, odst, &sub_buffer);
  cyclic_reshuffle(sym, idst, NULL, NULL, *odst, NULL, NULL, &this->data, &sub_buffer, sr, wrld->cdt, 0, alpha, beta);
  if (fw_mirror_rank >= 0){
    MPI_Irecv(tsr_sub->data, odst->size, tsr_sub->sr->mdtype(), fw_mirror_rank, 0, wrld->cdt.cm, &req);
  if (bw_mirror_rank >= 0)
    MPI_Send(sub_buffer, odst->size, sr->mdtype(), bw_mirror_rank, 0, wrld->cdt.cm);
  if (fw_mirror_rank >= 0){
    MPI_Wait(&req, &stat);
  sr->dealloc(sub_buffer);

void tensor::add_from_subworld(tensor * tsr_sub,
#ifdef USE_SLICE_FOR_SUBWORLD
  int offsets[this->order];
  memset(offsets, 0, this->order*sizeof(int));
  if (tsr_sub->order == -1){
    slice(offsets, offsets, beta, &stsr, NULL, NULL, alpha);
    slice(offsets, lens, alpha, tsr_sub, offsets, lens, beta);
  int fw_mirror_rank, bw_mirror_rank;
  tsr_sub->orient_subworld(wrld, bw_mirror_rank, fw_mirror_rank, odst, &sub_buffer);
  cyclic_reshuffle(sym, *odst, NULL, NULL, idst, NULL, NULL, &sub_buffer, &this->data, sr, wrld->cdt, 0, alpha, beta);
  sr->dealloc(sub_buffer);
void tensor::write(int64_t num_pair,
                   int64_t const * inds,
  char * pairs = sr->pair_alloc(num_pair);
  for (int64_t i=0; i<num_pair; i++){
  this->write(num_pair,alpha,beta,pairs,'w');
  sr->pair_dealloc(pairs);

int tensor::write(int64_t num_pair,
  int * phase, * phys_phase, * virt_phase, * bucket_lda;
  int * virt_phys_rank;
  if (wrld->rank == 0){
    this->print_map(stdout);
  for (i=0; i<tsr->order; i++){
    virt_phase[i] = phase[i]/phys_phase[i];
    num_virt = num_virt*virt_phase[i];
  if (4*num_pair*sr->pair_size() >= max_memuse){
    npart = 1 + (6*num_pair*sr->pair_size())/max_memuse;
  MPI_Allreduce(MPI_IN_PLACE, &npart, 1, MPI_INT, MPI_MAX, wrld->cdt.cm);
  int64_t part_size = num_pair/npart;
  for (int part = 0; part<npart; part++){
    int64_t nnz_loc_new;
    if (part == npart-1) pnum_pair = num_pair - part_size*part;
    else pnum_pair = part_size;
    char * buf_ptr = mapped_data + sr->pair_size()*part_size*part;
  if (is_sparse && rw == 'w'){
    this->set_new_nnz_glb(nnz_blk);
    if (tsr->data != NULL) sr->pair_dealloc(tsr->data);
    tsr->data = new_pairs;
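// write(): bulk-writes (or, with rw=='r', reads back) a list of global-index
// (key,value) pairs into the mapped tensor, bucketing pairs by owning process.
// Large writes are split into npart chunks when the pair buffer would exceed
// the available memory, and the chunk count is agreed upon via MPI_Allreduce.
// For sparse tensors a write replaces the local pair store and updates the
// nonzero counts.
//
// A minimal usage sketch (hypothetical values, assuming a dense tensor T of
// doubles; alpha/beta are the scaling factors applied to new and old values):
//   int64_t inds[] = {0, 7};
//   double  vals[] = {1.0, 2.0};
//   T.write(2, T.sr->mulid(), T.sr->addid(), inds, (char*)vals);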
void tensor::read(int64_t num_pair,
                  int64_t const * inds,
  char * pairs = sr->pair_alloc(num_pair);
  for (int64_t i=0; i<num_pair; i++){
  write(num_pair, alpha, beta, pairs, 'r');
  for (int64_t i=0; i<num_pair; i++){
    pr[i].read_val(data+i*sr->el_size);
  sr->pair_dealloc(pairs);

int tensor::read(int64_t num_pair,
                 char * mapped_data){
  return write(num_pair, alpha, beta, mapped_data, 'r');

int tensor::read(int64_t num_pair,
                 char * mapped_data){
  return write(num_pair, NULL, NULL, mapped_data, 'r');
void tensor::set_distribution(char const * idx,
  itopo = wrld->topovec.size();
  wrld->topovec.push_back(top);
  assert(itopo != -1);
  this->clear_mapping();
  this->topo = wrld->topovec[itopo];
  for (int i=0; i<order; i++){
    mapping * map = this->edge_map+i;
    if (idx[i] == prl.idx[j]){
      map->np = this->topo->dim_comm[j].np;
    if (idx[i] == blk.idx[j]){
  this->is_mapped = true;
  this->set_padding();
  conv_idx(this->order, idx, &idx_A);
  if (wrld->rank == 0)
    printf("CTF ERROR: invalid distribution in read() call, aborting.\n");

char * tensor::read(char const * idx,
  for (int i=0; i<order; i++){
  std::fill(new_sym, new_sym+order, NS);
  tensor tsr(sr, order, lens, new_sym, wrld, true);
  tsr[idx] += (*this)[idx];
  return tsr.read(idx, prl, blk, unpack);
  tensor tsr_ali(this, 1, 1);
  if (wrld->rank == 0)
    printf("Redistributing via read() starting from distribution:\n");
  if (tsr_ali.has_home) deregister_size();
  return tsr_ali.data;
int tensor::sparsify(char const * threshold,
  if ((threshold == NULL && sr->addid() == NULL) ||
      (threshold != NULL && !sr->is_ordered())){
  if (threshold == NULL)
    return sparsify([&](char const* c){ return !sr->isequal(c, sr->addid()); });
  return sparsify([&](char const* c){
    char tmp[sr->el_size];
    sr->max(c,threshold,tmp);
    return !sr->isequal(tmp, threshold);
  return sparsify([&](char const* c){
    char tmp[sr->el_size];
    sr->max(tmp,threshold,tmp);
    return !sr->isequal(tmp, threshold);

int tensor::sparsify(std::function<bool(char const*)> f){
  int64_t nnz_loc_new = 0;
  int64_t nnz_blk_old[calc_nvirt()];
  memcpy(nnz_blk_old, nnz_blk, calc_nvirt()*sizeof(int64_t));
  memset(nnz_blk, 0, calc_nvirt()*sizeof(int64_t));
  for (int v=0; v<calc_nvirt(); v++){
    for (int64_t j=0; j<nnz_blk_old[v]; j++,i++){
      keep_vals[i] = f(pi[i].d());
  if (nnz_loc_new != nnz_loc){
    char * old_data = data;
    data = sr->pair_alloc(nnz_loc_new);
    for (int64_t i=0; i<nnz_loc; i++){
      pi_new[nnz_loc_new].write(pi[i].ptr);
    sr->dealloc(old_data);
  this->set_new_nnz_glb(nnz_blk);
  ASSERT(!has_home || is_home);
  int nvirt = calc_nvirt();
  this->nnz_blk = (int64_t*)alloc(sizeof(int64_t)*nvirt);
  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
  char * old_data = this->data;
  int idx_lyr = wrld->rank;
  for (int i=0; i<this->order; i++){
    if (i == 0) edge_lda[0] = 1;
    else edge_lda[i] = edge_lda[i-1]*lens[i-1];
    mapping const * map = this->edge_map + i;
    virt_phase[i] = phase[i]/phys_phase[i];
    nvirt = nvirt*virt_phase[i];
    idx_lyr -= this->topo->lda[map->cdt]
  for (int i=0; i<this->order; i++){
    if (i == 0) edge_lda[0] = 1;
    else edge_lda[i] = edge_lda[i-1]*this->pad_edge_len[i-1];
    depadding[i] = -padding[i];
  memset(prepadding, 0, sizeof(int)*order);
  spsfy_tsr(this->order, this->size, nvirt,
            this->pad_edge_len, this->sym, phase,
            phys_phase, virt_phase, virt_phys_rank,
            this->data, this->data, this->nnz_blk, this->sr, edge_lda, f);
  char * new_pairs[nvirt];
  char const * data_ptr = this->data;
  int64_t new_nnz_tot = 0;
  for (int v=0; v<nvirt; v++){
    if (nnz_blk[v] > 0){
      int64_t old_nnz = nnz_blk[v];
      new_pairs[v] = (char*)sr->pair_alloc(nnz_blk[v]);
      depad_tsr(order, nnz_blk[v], this->lens, this->sym, this->padding, prepadding,
                data_ptr, new_pairs[v], nnz_blk+v, sr);
      pad_key(order, nnz_blk[v], this->pad_edge_len, depadding, PairIterator(sr,new_pairs[v]), sr);
      data_ptr += old_nnz*sr->pair_size();
      new_nnz_tot += nnz_blk[v];
    } else new_pairs[v] = NULL;
  sr->pair_dealloc(this->data);
  this->data = sr->pair_alloc(new_nnz_tot);
  char * new_data_ptr = this->data;
  for (int v=0; v<nvirt; v++){
    if (new_pairs[v] != NULL){
      sr->copy_pairs(new_data_ptr, new_pairs[v], nnz_blk[v]);
      sr->pair_dealloc(new_pairs[v]);
  memset(nnz_blk, 0, sizeof(int64_t)*nvirt);
  sr->dealloc(old_data);
  if (has_home) deregister_size();
  this->set_new_nnz_glb(this->nnz_blk);
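// sparsify(threshold, take_abs) / sparsify(f): keeps only the elements for
// which the predicate holds (by default, elements not equal to the additive
// identity, or elements above the given threshold). A dense tensor is first
// converted to (key,value) pairs per virtual block (spsfy_tsr), padding keys
// are removed, blocks are repacked contiguously, and the global nonzero counts
// are refreshed with set_new_nnz_glb().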
void tensor::read_local(int64_t * num_pair,
                        bool unpack_sym) const {
  read_local(num_pair, &pairs, unpack_sym);
  for (int64_t i=0; i<*num_pair; i++){
    (*inds)[i] = pr[i].k();
    pr[i].read_val(*data+i*sr->el_size);
  sr->pair_dealloc(pairs);

void tensor::read_local_nnz(int64_t * num_pair,
                            bool unpack_sym) const {
  read_local_nnz(num_pair, &pairs, unpack_sym);
  for (int64_t i=0; i<*num_pair; i++){
    (*inds)[i] = pr[i].k();
    pr[i].read_val(*data+i*sr->el_size);
  sr->pair_dealloc(pairs);
int tensor::read_local_nnz(int64_t * num_pair,
                           char ** mapped_data,
                           bool unpack_sym) const {
  if (sr->isequal(sr->addid(), NULL) && !is_sparse)
    return read_local(num_pair, mapped_data, unpack_sym);
  *mapped_data = tsr_cpy.data;
int tensor::read_local(int64_t * num_pair,
                       char ** mapped_data,
                       bool unpack_sym) const {
  int i, num_virt, idx_lyr;
  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
  *mapped_data = NULL;
  bool has_sym = false;
  for (i=0; i<this->order; i++){
    if (this->sym[i] != NS) has_sym = true;
  read_local_nnz(&num_nnz, &nnz_data, unpack_sym);
  tensor dense_tsr(sr, order, lens, sym, wrld);
  dense_tsr.write(num_nnz, sr->mulid(), sr->addid(), nnz_data);
  sr->pair_dealloc(nnz_data);
  dense_tsr.read_local(num_pair, mapped_data, unpack_sym);
  } else if (has_sym && unpack_sym) {
    int nosym[this->order];
    std::fill(nosym, nosym+this->order, NS);
    tensor nosym_tsr(sr, order, lens, nosym, wrld);
    int idx[this->order];
    for (i=0; i<this->order; i++){
    summation s((tensor*)this, idx, sr->mulid(), &nosym_tsr, idx, sr->mulid());
    return nosym_tsr.read_local(num_pair, mapped_data);
  idx_lyr = wrld->rank;
  for (i=0; i<tsr->order; i++){
    virt_phase[i] = phase[i]/phys_phase[i];
    num_virt = num_virt*virt_phase[i];
                 phase, phys_phase, virt_phase, virt_phys_rank, num_pair,
                 tsr->data, &pairs, sr);
  *mapped_data = pairs;
  *mapped_data = NULL;
  char * my_pairs, * all_pairs;
  if (has_zero_edge_len){
  alloc_ptr(numPes*sizeof(int), (void**)&nXs);
  alloc_ptr(numPes*sizeof(int), (void**)&pXs);
  read_local(&ntt, &my_pairs, unpack);
  MPI_Allgather(&n, 1, MPI_INT, nXs, 1, MPI_INT, wrld->comm);
  for (i=1; i<numPes; i++){
    pXs[i] = pXs[i-1]+nXs[i-1];
  nval = pXs[numPes-1] + nXs[numPes-1];
  nval = nval/sr->pair_size();
  all_pairs = sr->pair_alloc(nval);
  MPI_Allgatherv(my_pairs, n, MPI_CHAR,
                 all_pairs, nXs, pXs, MPI_CHAR, wrld->comm);
  sr->pair_dealloc(my_pairs);
int64_t tensor::get_tot_size(bool packed=false){
  for (int i=0; i<order; i++){

int tensor::allread(int64_t * num_pair,
  char * ball_data = sr->alloc((*num_pair));
  for (int64_t i=0; i<*num_pair; i++){
    ipr[i].read_val(ball_data+i*sr->el_size);
  if (ipr.ptr != NULL)
    sr->pair_dealloc(ipr.ptr);
  *all_data = ball_data;

int tensor::allread(int64_t * num_pair,
  for (int64_t i=0; i<*num_pair; i++){
    ipr[i].read_val(all_data+i*sr->el_size);

  bool is_changed = false;
  if (topo != B->topo) is_changed = true;
  for (int i=0; i<order; i++){
    edge_map[i].clear();
  return redistribute(old_dist);
int tensor::reduce_sum(char * result) {
  return reduce_sum(result, sr);

int tensor::reduce_sum(char * result, algstrct const * sr_other) {
  ASSERT(is_mapped && !is_folded);
  for (int i=0; i<order; i++){
  sr->copy(result, sc.data);
  wrld->cdt.bcast(result, sr_other->el_size, MPI_CHAR, 0);

int tensor::reduce_sumabs(char * result) {
  return reduce_sumabs(result, sr);

int tensor::reduce_sumabs(char * result, algstrct const * sr_other){
  ASSERT(is_mapped && !is_folded);
  for (int i=0; i<order; i++){
  sr->copy(result, sc.data);
  wrld->cdt.bcast(result, sr->el_size, MPI_CHAR, 0);

int tensor::reduce_sumsq(char * result) {
  ASSERT(is_mapped && !is_folded);
  for (int i=0; i<order; i++){
  sr->copy(result, sc.data);
  wrld->cdt.bcast(result, sr->el_size, MPI_CHAR, 0);

void tensor::prnt() const {
void tensor::print(FILE * fp, char const * cutoff) const {
  int64_t imy_sz, tot_sz =0;
  int * recvcnts, * displs, * idx_arr;
  char * pmy_data, * pall_data;
  if (wrld->rank == 0)
    printf("Printing tensor %s\n",name);
  if (cutoff != NULL){
    read_local_nnz(&imy_sz, &pmy_data, true);
  if (wrld->rank == 0){
    alloc_ptr(wrld->np*sizeof(int), (void**)&recvcnts);
    alloc_ptr(wrld->np*sizeof(int), (void**)&displs);
    alloc_ptr(order*sizeof(int), (void**)&idx_arr);
  MPI_Gather(&my_sz, 1, MPI_INT, recvcnts, 1, MPI_INT, 0, wrld->cdt.cm);
  if (wrld->rank == 0){
    for (int i=0; i<wrld->np; i++){
      recvcnts[i] *= sr->pair_size();
    for (int i=1; i<wrld->np; i++){
      displs[i] = displs[i-1] + recvcnts[i-1];
    tot_sz = (displs[wrld->np-1] + recvcnts[wrld->np-1])/sr->pair_size();
    pall_data = sr->pair_alloc(tot_sz);
  if (wrld->cdt.np == 1)
    pall_data = pmy_data;
  MPI_Gatherv(pmy_data, my_sz*sr->pair_size(), MPI_CHAR,
              pall_data, recvcnts, displs, MPI_CHAR, 0, wrld->cdt.cm);
  if (wrld->rank == 0){
    all_data.sort(tot_sz);
    for (int64_t i=0; i<tot_sz; i++){
      k = all_data[i].k();
      for (int j=0; j<order; j++){
        idx_arr[j] = k%lens[j];
      for (int j=0; j<order; j++){
        fprintf(fp,"[%d]",idx_arr[j]);
      fprintf(fp,"(%ld, <",all_data[i].k());
      sr->print(all_data[i].d(), fp);
  if (pall_data != pmy_data) sr->pair_dealloc(pall_data);
  if (pmy_data != NULL) sr->pair_dealloc(pmy_data);
void tensor::compare(const tensor * A, FILE * fp, char const * cutoff){
  int64_t imy_sz, tot_sz =0, my_sz_B;
  int * recvcnts, * displs, * idx_arr;
  assert(my_sz == my_sz_B);
  if (global_comm.rank == 0){
    alloc_ptr(global_comm.np*sizeof(int), (void**)&recvcnts);
    alloc_ptr(global_comm.np*sizeof(int), (void**)&displs);
  MPI_Gather(&my_sz, 1, MPI_INT, recvcnts, 1, MPI_INT, 0, global_comm.cm);
  if (global_comm.rank == 0){
    for (i=0; i<global_comm.np; i++){
    for (i=1; i<global_comm.np; i++){
      displs[i] = displs[i-1] + recvcnts[i-1];
    tot_sz = (displs[global_comm.np-1]
  if (my_sz == 0) my_data_A = my_data_B = NULL;
  MPI_Gatherv(my_data_A, my_sz*A->sr->pair_size(), MPI_CHAR,
              all_data_A, recvcnts, displs, MPI_CHAR, 0, global_comm.cm);
  MPI_Gatherv(my_data_B, my_sz*A->sr->pair_size(), MPI_CHAR,
              all_data_B, recvcnts, displs, MPI_CHAR, 0, global_comm.cm);
  if (global_comm.rank == 0){
    pall_data_A.sort(tot_sz);
    pall_data_B.sort(tot_sz);
    for (i=0; i<tot_sz; i++){
      A->sr->abs(pall_data_A[i].d(), aA);
      A->sr->min(aA, cutoff, aA);
      B->sr->abs(pall_data_B[i].d(), aB);
      B->sr->min(aB, cutoff, aB);
      k = pall_data_A[i].k();
      for (j=0; j<A->order; j++){
        idx_arr[j] = k%A->lens[j];
      for (j=0; j<A->order; j++){
        fprintf(fp,"[%d]",idx_arr[j]);
      A->sr->print(pall_data_A[i].d(),fp);
      A->sr->print(pall_data_B[i].d(),fp);
  sr->pair_dealloc(all_data_A);
  sr->pair_dealloc(all_data_B);
void tensor::unfold(bool was_mod){
  int i, j, allfold_dim;
  int * all_edge_len, * sub_edge_len;
  if (this->is_folded){
    calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
             NULL, sub_edge_len, NULL);
    for (i=0; i<this->order; i++){
      if (this->sym[i] == NS){
        while (i-j >= 0 && this->sym[i-j] != NS) j++;
        all_edge_len[allfold_dim] = sy_packed_size(j, sub_edge_len+i-j+1,
    nosym_transpose(this, allfold_dim, all_edge_len, this->inner_ordering, 0);
    assert(!left_home_transp);
    ASSERT(this->nrow_idx != -1);
    despmatricize(this->nrow_idx, this->is_csr);
    this->rec_tsr->is_data_aliased=1;
    delete this->rec_tsr;
  this->is_folded = 0;

void tensor::remove_fold(){
  delete this->rec_tsr;
  this->is_folded = 0;

double tensor::est_time_unfold(){
  int i, j, allfold_dim;
  int * all_edge_len, * sub_edge_len;
  if (!this->is_folded) return 0.0;
  calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
           NULL, sub_edge_len, NULL);
  for (i=0; i<this->order; i++){
    if (this->sym[i] == NS){
      while (i-j >= 0 && this->sym[i-j] != NS) j++;
      all_edge_len[allfold_dim] = sy_packed_size(j, sub_edge_len+i-j+1,
  est_time = this->calc_nvirt()*est_time_transp(allfold_dim, this->inner_ordering, all_edge_len, 0, sr);
void tensor::fold(int nfold,
                  int const * fold_idx,
                  int const * idx_map,
  int i, j, k, fdim, allfold_dim, is_fold, fold_dim;
  int * sub_edge_len, * fold_edge_len, * all_edge_len, * dim_order;
  if (this->is_folded != 0) this->unfold();
  allfold_dim = 0, fold_dim = 0;
  for (j=0; j<this->order; j++){
    if (this->sym[j] == NS){
      for (i=0; i<nfold; i++){
        if (fold_idx[i] == idx_map[j])
  calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
           NULL, sub_edge_len, NULL);
  allfold_dim = 0, fdim = 0;
  for (j=0; j<this->order; j++){
    if (this->sym[j] == NS){
      while (j-k >= 0 && this->sym[j-k] != NS) k++;
      all_edge_len[allfold_dim] = sy_packed_size(k, sub_edge_len+j-k+1,
      for (i=0; i<nfold; i++){
        if (fold_idx[i] == idx_map[j]){
          while (j-k >= 0 && this->sym[j-k] != NS) k++;
          dim_order[fdim] = allfold_dim;
      dim_order[fold_dim+allfold_dim-fdim] = allfold_dim;
  std::fill(fold_sym, fold_sym+fold_dim, NS);
  fold_tsr = new tensor(sr, fold_dim, fold_edge_len, fold_sym, wrld, 0);
  this->is_folded = 1;
  this->rec_tsr = fold_tsr;
  this->inner_ordering = dim_order;
  *all_fdim = allfold_dim;
  *all_flen = all_edge_len;
void tensor::pull_alias(tensor const * other){
  this->topo = other->topo;
  this->data = other->data;
  this->is_home = other->is_home;
  this->set_padding();

void tensor::clear_mapping(){
  for (j=0; j<this->order; j++){
    map = this->edge_map + j;
  this->is_mapped = 0;
  this->is_folded = 0;
                         int const * old_offsets,
                         int * const * old_permutation,
                         int const * new_offsets,
                         int * const * new_permutation){
  int can_block_shuffle;
  char * shuffled_data;
  char * shuffled_data_corr;
  if (is_sparse) can_block_shuffle = 0;
#ifdef USE_BLOCK_RESHUFFLE
  can_block_shuffle = 0;
  if (old_offsets != NULL || old_permutation != NULL ||
      new_offsets != NULL || new_permutation != NULL){
    can_block_shuffle = 0;
  if (size > INT_MAX && !is_sparse && wrld->cdt.rank == 0)
    printf("CTF WARNING: Tensor %s is being redistributed to a mapping where its size is %ld, which is greater than INT_MAX=%d, so MPI could run into problems\n",
           name, size, INT_MAX);
#ifdef HOME_CONTRACT
  if (wrld->cdt.rank == 0)
    DPRINTF(2,"Tensor %s leaving home %d\n", name, is_sparse);
  if (this->has_home){
    this->home_buffer = sr->pair_alloc(nnz_loc);
    sr->copy_pairs(this->home_buffer, this->data, nnz_loc);
    this->data = sr->alloc(old_dist.size);
    sr->copy(this->data, this->home_buffer, old_dist.size);
  if (this->profile) {
    strcpy(spf,"redistribute_");
    strcat(spf,this->name);
    if (wrld->cdt.rank == 0){
  if (wrld->cdt.rank == 0){
    if (can_block_shuffle) VPRINTF(1,"Remapping tensor %s via block_reshuffle to mapping\n",this->name);
    else if (is_sparse) VPRINTF(1,"Remapping tensor %s via sparse reshuffle to mapping\n",this->name);
    else VPRINTF(1,"Remapping tensor %s via cyclic_reshuffle to mapping\n",this->name);
    this->print_map(stdout);
  padded_reshuffle(sym, old_dist, new_dist, this->data, &shuffled_data_corr, sr, wrld->cdt);
  if (can_block_shuffle){
    block_reshuffle(old_dist, new_dist, this->data, shuffled_data, sr, wrld->cdt);
    sr->dealloc(this->data);
    double nnz_frac_ = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np);
    double tps_[] = {0.0, 1.0, (double)log2(wrld->cdt.np),
                     (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac_};
    double st_time = MPI_Wtime();
    char * old_data = this->data;
    int64_t old_nnz = nnz_loc;
    nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
    std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
    this->write(old_nnz, sr->mulid(), sr->addid(), old_data);
    shuffled_data = this->data;
    if (old_data != NULL){
      sr->pair_dealloc(old_data);
    double exe_time = MPI_Wtime()-st_time;
    double nnz_frac = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np);
    double tps[] = {exe_time, 1.0, (double)log2(wrld->cdt.np),
                    (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac};
    dgtog_reshuffle(sym, lens, old_dist, new_dist, &this->data, &shuffled_data, sr, wrld->cdt);
  this->data = shuffled_data;
  if (!is_sparse && sr->addid() != NULL){
    bool abortt = false;
    for (int64_t j=0; j<this->size; j++){
      if (!sr->isequal(this->data+j*sr->el_size, shuffled_data_corr+j*sr->el_size)){
        printf("data element %ld/%ld not received correctly on process %d\n",
               j, this->size, wrld->cdt.rank);
        printf("element received was ");
        sr->print(this->data+j*sr->el_size);
        printf(", correct ");
        sr->print(shuffled_data_corr+j*sr->el_size);
    sr->dealloc(shuffled_data_corr);
  if (this->profile) {
    strcpy(spf,"redistribute_");
    strcat(spf,this->name);
    if (wrld->cdt.rank == 0){
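// redistribute(): moves the tensor data from an old distribution to the
// currently set mapping. Sparse tensors are reshuffled by rewriting their
// pairs; dense tensors use block_reshuffle() when the global phases permit a
// pure block permutation and dgtog_reshuffle() otherwise. When the verification
// path is compiled in, the result is checked element by element against
// padded_reshuffle(); timing samples feed the redistribution performance models.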
double tensor::est_redist_time(distribution const & old_dist, double nnz_frac){
  int nvirt = (int64_t)calc_nvirt();
  if (is_sparse) can_blres = 0;
#ifdef USE_BLOCK_RESHUFFLE
  for (int i=0; i<order; i++){
  double est_time = 0.0;
  est_time += blres_est_time(std::max(old_dist.size,this->size)*this->sr->el_size*nnz_frac, nvirt, old_nvirt);
  if (this->is_sparse)
    est_time += spredist_est_time(this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac, wrld->cdt.np);
  est_time += dgtog_est_time(this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac, wrld->cdt.np);

int64_t tensor::get_redist_mem(distribution const & old_dist, double nnz_frac){
  if (is_sparse) can_blres = 0;
#ifdef USE_BLOCK_RESHUFFLE
  return (int64_t)this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac;
  return (int64_t)this->sr->pair_size()*std::max(this->size,old_dist.size)*nnz_frac*3;
  return (int64_t)this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac*2.5;
int tensor::map_tensor_rem(int num_phys_dims,
  int i, num_sub_phys_dims, stat;
  int * restricted, * phys_mapped, * comm_idx;
  memset(phys_mapped, 0, num_phys_dims*sizeof(int));
  for (i=0; i<this->order; i++){
    restricted[i] = (this->edge_map[i].type != NOT_MAPPED);
    map = &this->edge_map[i];
    phys_mapped[map->cdt] = 1;
  num_sub_phys_dims = 0;
  for (i=0; i<num_phys_dims; i++){
    if (phys_mapped[i] == 0){
      num_sub_phys_dims++;
  num_sub_phys_dims = 0;
  for (i=0; i<num_phys_dims; i++){
    if (phys_mapped[i] == 0){
      sub_phys_comm[num_sub_phys_dims] = phys_comm[i];
      comm_idx[num_sub_phys_dims] = i;
      num_sub_phys_dims++;
  stat = map_tensor(num_sub_phys_dims, this->order,
                    this->pad_edge_len, this->sym_table,
                    restricted, sub_phys_comm,
int tensor::extract_diag(int const * idx_map,
                         int ** idx_map_new){
  int i, j, k, * edge_len, * nsym, * ex_idx_map, * diag_idx_map;
  for (i=0; i<this->order; i++){
    for (j=i+1; j<this->order; j++){
      if (idx_map[i] == idx_map[j]){
        for (k=0; k<this->order; k++){
          diag_idx_map[k] = k;
          edge_len[k] = this->pad_edge_len[k]-this->padding[k];
          (*idx_map_new)[k] = idx_map[k];
          if (this->sym[k] == this->sym[j]) nsym[k] = this->sym[k];
          nsym[k] = this->sym[k];
          ex_idx_map[k] = k-1;
          diag_idx_map[k-1] = k-1;
          edge_len[k-1] = this->pad_edge_len[k]-this->padding[k];
          nsym[k-1] = this->sym[k];
          (*idx_map_new)[k-1] = idx_map[k];
        int64_t lda_i=1, lda_j=1;
        for (int ii=0; ii<i; ii++){
        for (int jj=0; jj<j; jj++){
        new_tsr = new tensor(sr, this->order-1, edge_len, nsym, wrld, 1, name, 1, is_sparse);
        for (int p=0; p<nnz_loc; p++){
          int64_t k = pi[p].k();
          if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]) nw++;
        char * pwdata = sr->pair_alloc(nw);
        for (int p=0; p<nnz_loc; p++){
          int64_t k = pi[p].k();
          if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]){
            int64_t k_new = (k%lda_j)+(k/(lda_j*lens[j])*lda_j);
            ((int64_t*)(wdata[nw].ptr))[0] = k_new;
        new_tsr->write(nw, sr->mulid(), sr->addid(), pwdata);
        sr->pair_dealloc(pwdata);
#pragma omp parallel for
        for (int p=0; p<nw; p++){
          int64_t k = wdata[p].k();
          int64_t kpart = (k/lda_i)%lens[i];
          int64_t k_new = (k%lda_j)+((k/lda_j)*lens[j]+kpart)*lda_j;
          ((int64_t*)(wdata[p].ptr))[0] = k_new;
        for (int p=0; p<nnz_loc; p++){
          int64_t k = pi[p].k();
          if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]){
        this->write(nw, NULL, NULL, pwdata);
        sr->pair_dealloc(pwdata);
        new_tsr = new tensor(sr, this->order-1, edge_len, nsym, wrld, 1, name, 1, is_sparse);
int tensor::zero_out_padding(){
  int i, num_virt, idx_lyr;
  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
  if (this->has_zero_edge_len || is_sparse){
  this->set_padding();
  if (!this->is_mapped || sr->addid() == NULL){
  idx_lyr = wrld->rank;
  for (i=0; i<this->order; i++){
    map = this->edge_map + i;
    virt_phase[i] = phase[i]/phys_phase[i];
    num_virt = num_virt*virt_phase[i];
    idx_lyr -= topo->lda[map->cdt]
               this->pad_edge_len, this->sym, this->padding,
               phase, phys_phase, virt_phase, virt_phys_rank, this->data, sr);
  std::fill(this->data, this->data+np, 0.0);

void tensor::scale_diagonals(int const * sym_mask){
  int i, num_virt, idx_lyr;
  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
  this->set_padding();
  if (!this->is_mapped){
  idx_lyr = wrld->rank;
  for (i=0; i<this->order; i++){
    map = this->edge_map + i;
    virt_phase[i] = phase[i]/phys_phase[i];
    num_virt = num_virt*virt_phase[i];
    idx_lyr -= topo->lda[map->cdt]
               this->pad_edge_len, this->sym, this->padding,
               phase, phys_phase, virt_phase, virt_phys_rank, this->data, sr, sym_mask);

void tensor::addinv(){
#pragma omp parallel for
  for (int64_t i=0; i<nnz_loc; i++){
    sr->addinv(pi[i].d(), pi[i].d());
#pragma omp parallel for
  for (int64_t i=0; i<size; i++){
    sr->addinv(data+i*sr->el_size,data+i*sr->el_size);
void tensor::set_sym(int const * sym_){
  std::fill(this->sym, this->sym+order, NS);
  memcpy(this->sym, sym_, order*sizeof(int));
  memset(sym_table, 0, order*order*sizeof(int));
  for (int i=0; i<order; i++){
    if (this->sym[i] != NS) {
      sym_table[(i+1)+i*order] = 1;
      sym_table[(i+1)*order+i] = 1;

void tensor::set_new_nnz_glb(int64_t const * nnz_blk_){
  for (int i=0; i<calc_nvirt(); i++){
    nnz_blk[i] = nnz_blk_[i];
    nnz_loc += nnz_blk[i];
  wrld->cdt.allred(&nnz_loc, &nnz_tot, 1, MPI_INT64_T, MPI_SUM);
void tensor::spmatricize(int m, int n, int nrow_idx, bool csr){
  MPI_Barrier(this->wrld->comm);
  int64_t new_sz_A = 0;
  this->rec_tsr->is_sparse = 1;
  int nvirt_A = calc_nvirt();
  this->rec_tsr->nnz_blk = (int64_t*)alloc(nvirt_A*sizeof(int64_t));
  for (int i=0; i<nvirt_A; i++){
    this->rec_tsr->nnz_blk[i] = get_csr_size(this->nnz_blk[i], m, this->sr->el_size);
    this->rec_tsr->nnz_blk[i] = get_coo_size(this->nnz_blk[i], this->sr->el_size);
    new_sz_A += this->rec_tsr->nnz_blk[i];
  this->rec_tsr->is_data_aliased = false;
  int phase[this->order];
  for (int i=0; i<this->order; i++){
    phase[i] = this->edge_map[i].calc_phase();
  char * data_ptr_out = this->rec_tsr->data;
  char const * data_ptr_in = this->data;
  for (int i=0; i<nvirt_A; i++){
    cm.set_data(this->nnz_blk[i], this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_in, this->sr, phase);
    CSR_Matrix cs(cm, m, n, this->sr, data_ptr_out);
    cm.set_data(this->nnz_blk[i], this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_in, this->sr, phase);
    data_ptr_in += this->nnz_blk[i]*this->sr->pair_size();
    data_ptr_out += this->rec_tsr->nnz_blk[i];
  this->nrow_idx = nrow_idx;
  MPI_Barrier(this->wrld->comm);
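// spmatricize()/despmatricize(): convert the folded tensor blocks between the
// internal (key,value) pair representation and serialized COO/CSR matrices
// (COO_Matrix / CSR_Matrix) of dimension m x n, using nrow_idx to split the
// tensor modes into row and column modes. despmatricize() performs the inverse
// unpacking and restores the per-block nonzero counts.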
void tensor::despmatricize(int nrow_idx, bool csr){
  MPI_Barrier(this->wrld->comm);
  this->rec_tsr->is_sparse = 1;
  int nvirt = calc_nvirt();
  for (int i=0; i<nvirt; i++){
    if (this->rec_tsr->nnz_blk[i]>0){
    offset += this->rec_tsr->nnz_blk[i];
  this->data = sr->pair_alloc(new_sz);
  int phase[this->order];
  int phys_phase[this->order];
  int phase_rank[this->order];
  for (int i=0; i<this->order; i++){
    phase[i] = this->edge_map[i].calc_phase();
    phys_phase[i] = this->edge_map[i].calc_phys_phase();
    phase_rank[i] = this->edge_map[i].calc_phys_rank(this->topo);
  char * data_ptr_out = this->data;
  char const * data_ptr_in = this->rec_tsr->data;
  for (int i=0; i<nvirt; i++){
    if (this->rec_tsr->nnz_blk[i]>0){
      cm.get_data(cs.nnz(), this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_out, this->sr, phase, phase_rank);
      this->nnz_blk[i] = cm.nnz();
      cm.get_data(cm.nnz(), this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_out, this->sr, phase, phase_rank);
      this->nnz_blk[i] = cm.nnz();
    } else this->nnz_blk[i] = 0;
    data_ptr_out += this->nnz_blk[i]*this->sr->pair_size();
    data_ptr_in += this->rec_tsr->nnz_blk[i];
      phase_rank[j]+=phys_phase[j];
      if (phase_rank[j]>=phase[j])
        phase_rank[j]=phase_rank[j]%phase[j];
  set_new_nnz_glb(this->nnz_blk);
  this->rec_tsr->is_csr = csr;
  MPI_Barrier(this->wrld->comm);
void tensor::leave_home_with_buffer(){
#ifdef HOME_CONTRACT
  if (this->has_home){
    if (!this->is_home){
      sr->pair_dealloc(this->home_buffer);
      this->home_buffer = NULL;
      sr->dealloc(this->home_buffer);
      this->home_buffer = this->data;
    if (wrld->rank == 0) DPRINTF(2,"Deleting home (leave) of %s\n",name);

void tensor::register_size(int64_t sz){
  registered_alloc_size = sz;

void tensor::deregister_size(){
  registered_alloc_size = 0;
void tensor::write_dense_to_file(MPI_File & file, int64_t offset){
  bool need_unpack = is_sparse;
  for (int i=0; i<order; i++){
    if (sym[i] != NS) need_unpack = true;
  std::fill(nsym, nsym+order, NS);
  tensor t_dns(sr, order, lens, nsym, wrld);
  t_dns["ij"] = (*this)["ij"];
  int64_t chnk_sz = tot_els/wrld->np;
  int64_t my_chnk_sz = chnk_sz;
  if (wrld->rank < tot_els%wrld->np) my_chnk_sz++;
  int64_t my_chnk_st = chnk_sz*wrld->rank + std::min((int64_t)wrld->rank, tot_els%wrld->np);
  char * my_pairs = (char*)alloc(sr->pair_size()*my_chnk_sz);
  for (int64_t i=0; i<my_chnk_sz; i++){
  this->read(my_chnk_sz, my_pairs);
  for (int64_t i=0; i<my_chnk_sz; i++){
    char val[sr->el_size];
    memcpy(my_pairs+i*sr->el_size, val, sr->el_size);
  MPI_Offset off = my_chnk_st*sr->el_size+offset;
  MPI_File_write_at(file, off, my_pairs, my_chnk_sz, sr->mdtype(), &stat);
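// write_dense_to_file(): writes all tensor elements to a binary file in global
// element order. Sparse or symmetric tensors are first expanded into a dense
// nonsymmetric temporary; each rank then reads its contiguous chunk of elements
// by global index and writes it at the matching file offset with
// MPI_File_write_at(). read_dense_from_file() below performs the inverse.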
void tensor::write_dense_to_file(char const * filename){
  MPI_File_open(this->wrld->comm, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &file);
  this->write_dense_to_file(file, 0);
  MPI_File_close(&file);

void tensor::read_dense_from_file(MPI_File & file, int64_t offset){
  bool need_unpack = is_sparse;
  for (int i=0; i<order; i++){
    if (sym[i] != NS) need_unpack = true;
  std::fill(nsym, nsym+order, NS);
  tensor t_dns(sr, order, lens, nsym, wrld);
  summation ts(&t_dns, "ij", sr->mulid(), this, "ij", sr->addid());
  if (is_sparse) this->sparsify();
  int64_t chnk_sz = tot_els/wrld->np;
  int64_t my_chnk_sz = chnk_sz;
  if (wrld->rank < tot_els%wrld->np) my_chnk_sz++;
  int64_t my_chnk_st = chnk_sz*wrld->rank + std::min((int64_t)wrld->rank, tot_els%wrld->np);
  char * my_pairs = (char*)alloc(sr->pair_size()*my_chnk_sz);
  char * my_pairs_tail = my_pairs + sizeof(int64_t)*my_chnk_sz;
  MPI_Offset off = my_chnk_st*sr->el_size+offset;
  MPI_File_read_at(file, off, my_pairs_tail, my_chnk_sz, sr->mdtype(), &stat);
  for (int64_t i=0; i<my_chnk_sz; i++){
    char val[sr->el_size];
    memcpy(val, my_pairs_tail+i*sr->el_size, sr->el_size);
    pi[i].write_key(my_chnk_st+i);
    pi[i].write_val(val);
  this->write(my_chnk_sz, sr->mulid(), sr->addid(), my_pairs);
void tensor::read_dense_from_file(char const * filename){
  MPI_File_open(this->wrld->comm, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &file);
  this->read_dense_from_file(file, 0);
  MPI_File_close(&file);
tensor * tensor::self_reduce(int const * idx_A,
  if (order_C == 0 && this->order == order_B + 1){
    bool all_match_except_one = true;
    bool one_skip = false;
    while (iiA < this->order){
      if (iiB >= order_B || idx_A[iiA] != idx_B[iiB]){
        if (one_skip) all_match_except_one = false;
        else one_skip = true;
    if (all_match_except_one && one_skip) return this;
  for (int i=0; i<this->order; i++){
    bool has_match = false;
    for (int j=0; j<this->order; j++){
      if (j != i && idx_A[j] == iA) has_match = true;
    for (int j=0; j<order_B; j++){
      if (idx_B[j] == iA) has_match = true;
    for (int j=0; j<order_C; j++){
      if (idx_C[j] == iA) has_match = true;
    int new_len[this->order-1];
    int new_sym[this->order-1];
    int sum_A_idx[this->order];
    int sum_B_idx[this->order-1];
    for (int j=0; j<this->order; j++){
      max_idx = std::max(max_idx, idx_A[j]);
      new_len[j] = this->lens[j];
      (*new_idx_A)[j] = idx_A[j];
      new_sym[j] = this->sym[j];
      if (this->sym[i] == NS) new_sym[j] = NS;
      new_len[j-1] = this->lens[j];
      new_sym[j-1] = this->sym[j];
      (*new_idx_A)[j-1] = idx_A[j];
    for (int j=0; j<this->order; j++){
      max_idx = std::max(max_idx, idx_A[j]);
    for (int j=0; j<order_B; j++){
      (*new_idx_B)[j] = idx_B[j];
      max_idx = std::max(max_idx, idx_B[j]);
    for (int j=0; j<order_C; j++){
      (*new_idx_C)[j] = idx_C[j];
      max_idx = std::max(max_idx, idx_C[j]);
    for (int j=0; j<this->order-1; j++){
      if ((*new_idx_A)[j] == max_idx)
        (*new_idx_A)[j] = iA;
    for (int j=0; j<order_B; j++){
      if ((*new_idx_B)[j] == max_idx)
        (*new_idx_B)[j] = iA;
    for (int j=0; j<order_C; j++){
      if ((*new_idx_C)[j] == max_idx)
        (*new_idx_C)[j] = iA;
    tensor * new_tsr = new tensor(this->sr, this->order-1, new_len, new_sym, this->wrld, 1, this->name, 1, this->is_sparse);
    summation s(this, sum_A_idx, this->sr->mulid(), new_tsr, sum_B_idx, this->sr->mulid());
void permute(int order, int const *perm, int *arr)
permute an array
void read(int64_t num_pair, char const *alpha, char const *beta, int64_t const *inds, char *data)
read tensor data with <key, value> pairs where key is the global index for the value, which gets filled in with beta times the old values plus alpha times the values read from the tensor.
void write(char const *buf, int64_t n=1)
sets internal pairs to provided data
int calc_phys_rank(topology const *topo) const
compute the physical rank of a mapping
void inc_tot_mem_used(int64_t a)
int map_tensor(int num_phys_dims, int tsr_order, int const *tsr_edge_len, int const *tsr_sym_table, int *restricted, CommData *phys_comm, int const *comm_idx, int fill, mapping *tsr_edge_map)
map a tensor
double spredist_est_time(int64_t size, int np)
void zero_padding(int order, int64_t size, int nvirt, int const *edge_len, int const *sym, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int const *cphase_rank, char *vdata, algstrct const *sr)
sets to zero all values in padded region of tensor
void write_key(int64_t key)
sets key of head pair to key
char * home_buffer
buffer associated with home mapping of tensor, to which it is returned
CTF_int::CommData cdt
communicator data for MPI comm defining this world
def sum(tensor, init_A, axis=None, dtype=None, out=None, keepdims=None)
bool is_home
whether the latest tensor data is in the home buffer
int64_t * nnz_blk
nonzero elements in each block owned locally
int * sym
symmetries among tensor dimensions
void execute()
run contraction
virtual algstrct * clone() const =0
''copy constructor''
void slice(int const *offsets_B, int const *ends_B, char const *beta, tensor *A, int const *offsets_A, int const *ends_A, char const *alpha)
accumulates out a slice (block) of this tensor = B B[offsets,ends)=beta*B[offsets,ends) + alpha*A[offsets_A,ends_A)
virtual int pair_size() const
gets pair size el_size plus the key size
int calc_phase() const
compute the phase of a mapping
int64_t get_coo_size(int64_t nnz, int val_size)
virtual char * pair_alloc(int64_t n) const
allocate space for n (int64_t,dtype) pairs, necessary for object types
double dgtog_est_time(int64_t tot_sz, int np)
estimates execution time, given this processor sends a receives tot_sz across np procs ...
void calc_dim(int order, int64_t size, int const *edge_len, mapping const *edge_map, int64_t *vrt_sz, int *vrt_edge_len, int *blk_edge_len)
calculate the block-sizes of a tensor
void execute(bool run_diag=false)
run summation
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
int * pad_edge_len
padded tensor edge lengths
void serialize(char **buffer, int *size)
serialize object into contiguous data buffer
int * inner_ordering
ordering of the dimensions according to which the tensori s folded
int calc_phys_phase() const
compute the physical phase of a mapping
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm
void read_loc_pairs(int order, int64_t nval, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *phase_rank, int64_t *nread, char const *data, char **pairs, algstrct const *sr)
read tensor pairs local to processor
int sparsify(char const *threshold=NULL, bool take_abs=true)
reduce tensor to sparse format, storing only nonzero data, or data above a specified threshold...
bool has_home
whether the tensor has a home mapping/buffer
local process walltime measurement
int64_t size
current size of local tensor data chunk (mapping-dependent)
void * alloc(int64_t len)
alloc abstraction
int find_topology(topology const *topo, std::vector< topology * > &topovec)
searches for an equivalent topology in avector of topologies
int get_best_topo(int64_t nvirt, int topo, CommData global_comm, int64_t bcomm_vol, int64_t bmemuse)
get the best topologoes (least nvirt) over all procs
void set_distribution(char const *idx, CTF::Idx_Partition const &prl, CTF::Idx_Partition const &blk)
set edge mappings as specified
bool is_folded
whether the data is folded/transposed into a (lower-order) tensor
int64_t home_size
size of home buffer
untyped internal class for doubly-typed univariate function
virtual char const * addid() const
MPI datatype for pairs.
#define DEBUG_PRINTF(...)
void sort(int64_t n)
sorts set of pairs using std::sort
void copy_mapping(int order, mapping const *mapping_A, mapping *mapping_B)
copies mapping A to B
an instance of the CTF library (world) on a MPI communicator
void write_dense_to_file(MPI_File &file, int64_t offset=0)
write all tensor data to binary file in element order, unpacking from sparse or symmetric formats ...
void read_dense_from_file(MPI_File &file, int64_t offset=0)
read all tensor data from binary file in element order, which should be stored as nonsymmetric and de...
bool is_sparse
whether only the non-zero elements of the tensor are stored
int order
number of tensor dimensions
void read_val(char *buf) const
sets external value to the value pointed by the iterator
virtual char * alloc(int64_t n) const
allocate space for n items, necessary for object types
char * all_data
serialized buffer containing info and data
int align(tensor const *B)
align mapping of thisa tensor to that of B
void cyclic_reshuffle(int const *sym, distribution const &old_dist, int const *old_offsets, int *const *old_permutation, distribution const &new_dist, int const *new_offsets, int *const *new_permutation, char **ptr_tsr_data, char **ptr_tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm, bool reuse_buffers, char const *alpha, char const *beta)
Goes from any set of phases to any new set of phases.
void set_padding()
sets padding and local size of a tensor given a mapping
void padded_reshuffle(int const *sym, distribution const &old_dist, distribution const &new_dist, char *tsr_data, char **tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm)
Reshuffle elements using key-value pair read/write.
CTF::World * wrld
distributed processor context on which tensor is defined
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
int rank
rank of local processor
bool is_cyclic
whether the tensor data is cyclically distributed (blocked if false)
virtual void min(char const *a, char const *b, char *c) const
c = min(a,b)
int64_t nnz() const
retrieves number of nonzeros out of all_data
int write(int64_t num_pair, char const *alpha, char const *beta, char *mapped_data, char const rw='w')
Add tensor data new=alpha*new+beta*old with <key, value> pairs where key is the global index for the ...
class for execution distributed contraction of tensors
int * padding
padding along each edge length (less than distribution phase)
int * lens
unpadded tensor edge lengths
LinModel< 3 > spredist_mdl(spredist_mdl_init,"spredist_mdl")
void get_data(int64_t nz, int order, int const *lens, int const *rev_ordering, int nrow_idx, char *tsr_data, algstrct const *sr, int const *phase, int const *phase_rank)
unfolds tensor data from COO format based on prespecification of row and column modes ...
int64_t nnz() const
retrieves number of nonzeros out of all_data
int64_t k() const
returns key of pair at head of ptr
serialized matrix in coordinate format, meaning three arrays of dimension nnz are stored...
int can_block_reshuffle(int order, int const *old_phase, mapping const *map)
determines if tensor can be permuted by block
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
void print_map(FILE *stream=stdout, bool allcall=1) const
displays mapping information
int comp_dim_map(mapping const *map_A, mapping const *map_B)
compares two mappings
abstraction for a serialized sparse matrix stored in column-sparse-row (CSR) layout ...
bool is_data_aliased
whether the tensor data is an alias of another tensor object's data
void(* abs)(char const *a, char *b)
b = max(a,addinv(a))
int64_t k() const
returns key of pair at head of ptr
int64_t nnz_tot
maximum number of local nonzero elements over all procs
double est_time_transp(int order, int const *new_order, int const *edge_len, int dir, algstrct const *sr)
estimates time needed to transpose a non-symmetric (folded) tensor based on performance models ...
algstrct * sr
algstrct on which tensor elements and operations are defined
virtual void pair_dealloc(char *ptr) const
deallocate given pointer containing contiguous array of pairs
mapping * edge_map
mappings of each tensor dimension onto topology dimensions
tensor * rec_tsr
representation of folded tensor (shares data pointer)
bool is_mapped
whether a mapping has been selected
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
void read_val(char *buf) const
sets value to the value pointed by the iterator
void orient_subworld(CTF::World *greater_world, int &bw_mirror_rank, int &fw_mirror_rank, distribution *&odst, char **sub_buffer_)
maps data from this world (subcomm) to the correct order of processors with respect to a parent (grea...
void dgtog_reshuffle(int const *sym, int const *edge_len, distribution const &old_dist, distribution const &new_dist, char **ptr_tsr_data, char **ptr_tsr_new_data, algstrct const *sr, CommData ord_glb_comm)
void set_data(int64_t nz, int order, int const *lens, int const *ordering, int nrow_idx, char const *tsr_data, algstrct const *sr, int const *phase)
folds tensor data into COO format based on prespecification of row and column modes ...
virtual void print(char const *a, FILE *fp=stdout) const
prints the value
void depermute_keys(int order, int num_pair, int const *edge_len, int const *new_edge_len, int *const *permutation, char *pairs_buf, algstrct const *sr)
depermutes keys (apply P^T)
void block_reshuffle(distribution const &old_dist, distribution const &new_dist, char *tsr_data, char *&tsr_cyclic_data, algstrct const *sr, CommData glb_comm)
Reshuffle elements by block given the global phases stay the same.
int64_t calc_nvirt() const
calculates the virtualization factor of the tensor and returns it
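Per mode, the virtualization factor counts how many virtual blocks each physical processor holds, i.e. the total phase divided by the physical phase, and the factors multiply over all modes. A standalone sketch of that product, assuming phase[] and phys_phase[] arrays as in the signatures above (calc_nvirt_from_phases is an illustrative name):

#include <cstdint>

// Virtualization factor: product over modes of (total phase / physical phase),
// i.e. how many virtual sub-blocks each processor owns.
int64_t calc_nvirt_from_phases(int order, int const * phase, int const * phys_phase){
  int64_t nvirt = 1;
  for (int d=0; d<order; d++)
    nvirt *= phase[d]/phys_phase[d];
  return nvirt;
}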
int get_distribution_size(int order)
void permute_keys(int order, int num_pair, int const *edge_len, int const *new_edge_len, int *const *permutation, char *pairs_buf, int64_t *new_num_pair, algstrct const *sr)
permutes keys
int check_self_mapping(tensor const *tsr, int const *idx_map)
checks mapping in preparation for tensor scale, sum, or contract
int64_t nnz_loc
number of local nonzero elements
int univar_function(int n, World &dw)
int el_size
size of each element of algstrct in bytes
void pad_key(int order, int64_t num_pair, int const *edge_len, int const *padding, PairIterator pairs, algstrct const *sr, int const *offsets)
applies padding to keys
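Padding changes the stride of every mode, so a key has to be decomposed digit by digit against the unpadded lengths and re-encoded against the padded ones. A conceptual sketch of that re-encoding for a single key, ignoring the offsets parameter and the in-place pair iteration of the real routine (pad_one_key is an illustrative name):

#include <cstdint>

// Re-encode a global key from unpadded edge lengths to padded ones:
// decompose against len[d], re-accumulate with stride len[d]+padding[d].
int64_t pad_one_key(int order, int64_t key, int const * len, int const * padding){
  int64_t new_key = 0, lda = 1;
  for (int d=0; d<order; d++){
    int64_t digit = key % len[d];
    key /= len[d];
    new_key += digit*lda;
    lda *= (int64_t)(len[d] + padding[d]);
  }
  return new_key;
}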
bool profile
whether profiling should be done for contractions/sums involving this tensor
int cdealloc(void *ptr)
free abstraction
int64_t proc_bytes_available()
gives total memory available on this MPI process
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
char * data
tensor data, either the data or the key-value pairs should exist at any given time ...
internal distributed tensor class
void scal_diag(int order, int64_t size, int nvirt, int const *edge_len, int const *sym, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int const *cphase_rank, char *vdata, algstrct const *sr, int const *sym_mask)
scales each element by 1/(number of entries equivalent to it after permutation of indices for which s...
double blres_est_time(int64_t tot_sz, int nv0, int nv1)
estimates execution time, given this processor sends and receives tot_sz across np procs ...
void nosym_transpose(tensor *A, int all_fdim_A, int const *all_flen_A, int const *new_order, int dir)
void write_val(char const *buf)
sets value of head pair to what is in buf
int64_t get_csr_size(int64_t nnz, int nrow_, int val_size)
computes the size of a serialized CSR matrix
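A serialized CSR blob needs room for a small header plus the row-pointer, column-index, and value arrays. The estimate below totals those pieces under assumed field widths (a four-word header, int64_t row pointers, int column indices); the library's actual layout and alignment rules may differ, so treat csr_size_estimate purely as an illustration:

#include <cstdint>

// Rough size of a serialized CSR matrix under assumed field widths:
// header + row pointers (nrow+1) + column indices (nnz) + values (nnz).
int64_t csr_size_estimate(int64_t nnz, int nrow, int val_size){
  int64_t header = 4*(int64_t)sizeof(int64_t);   // e.g. nnz, nrow, ncol, val_size
  int64_t rowptr = (int64_t)(nrow+1)*sizeof(int64_t);
  int64_t colidx = nnz*(int64_t)sizeof(int);
  int64_t vals   = nnz*(int64_t)val_size;
  return header + rowptr + colidx + vals;
}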
void wr_pairs_layout(int order, int np, int64_t inwrite, char const *alpha, char const *beta, char rw, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *virt_phys_rank, int const *bucket_lda, char *wr_pairs_buf, char *rw_data, CommData glb_comm, algstrct const *sr, bool is_sparse, int64_t nnz_loc, int64_t *nnz_blk, char *&pprs_new, int64_t &nnz_loc_new)
read or write pairs from / to tensor
topology * topo
topology to which the tensor is mapped
int redistribute(distribution const &old_dist, int const *old_offsets=NULL, int *const *old_permutation=NULL, int const *new_offsets=NULL, int *const *new_permutation=NULL)
permutes the data of a tensor to its new layout
int64_t packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in packed symmetric (SY, SH, or AS) layout
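For a run of k mutually symmetric modes of length n, the packed count is the number of multisets C(n+k-1, k) when the diagonal is kept (SY) and the number of strict combinations C(n, k) when it is hollow (AS/SH); the full packed size is the product of such group counts with the lengths of the nonsymmetric modes. A standalone illustration of the two counts (choose and packed_group_size are illustrative helpers):

#include <cstdint>

// Binomial coefficient C(a, b), exact in int64_t for the sizes of interest.
int64_t choose(int64_t a, int64_t b){
  if (b < 0 || b > a) return 0;
  int64_t c = 1;
  for (int64_t i=1; i<=b; i++)
    c = c*(a-b+i)/i;   // stays integral: c holds C(a-b+i, i) after each step
  return c;
}

// Packed element count of k mutually symmetric modes of length n:
// multisets if the diagonal is stored (SY), combinations if hollow (AS/SH).
int64_t packed_group_size(int64_t n, int64_t k, bool keep_diagonal){
  return keep_diagonal ? choose(n+k-1, k) : choose(n, k);
}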
Idx_Partition reduce_order() const
extracts non-trivial part of partition by omitting unit dimensions
class for executing distributed summation of tensors
int read_local_nnz(int64_t *num_pair, char **mapped_data, bool unpack_sym=false) const
read tensor data pairs local to processor that have nonzero values
void spsfy_tsr(int order, int64_t size, int nvirt, int const *edge_len, int const *sym, int const *phase, int const *phys_phase, int const *virt_dim, int *phase_rank, char const *vdata, char *&vpairs, int64_t *nnz_blk, algstrct const *sr, int64_t const *edge_lda, std::function< bool(char const *)> f)
extracts all tensor values (in pair format) that pass a sparsifier function (including padded zeros i...
void depad_tsr(int order, int64_t num_pair, int const *edge_len, int const *sym, int const *padding, int const *prepadding, char const *pairsb, char *new_pairsb, int64_t *new_num_pair, algstrct const *sr)
retrieves the unpadded pairs
bool has_zero_edge_len
if true, the tensor has a zero edge length and is therefore identically zero, which short-circuits many operations
virtual char const * mulid() const
identity element for multiplication i.e. 1
a tensor with an index map associated with it (necessary for overloaded operators) ...
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
char * name
name given to tensor
int np
number of processors
double spredist_mdl_init[]
int sum_tensors(bool run_diag)
PDAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B). Treats symmetric as lower triangular...
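At the user level this PDAXPY is what an indexed summation such as B["ij"] += a*A["ij"] lowers to. A minimal driver against the public interface in ctf.hpp, sketched from the usual CTF example pattern rather than taken from this file (the fill_random calls assume the public Tensor interface):

#include <ctf.hpp>
using namespace CTF;

int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  {
    World dw(MPI_COMM_WORLD);
    int lens[2] = {4, 4};
    int sym[2]  = {NS, NS};
    Tensor<double> A(2, lens, sym, dw);
    Tensor<double> B(2, lens, sym, dw);
    A.fill_random(0.0, 1.0);
    B.fill_random(0.0, 1.0);
    // a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B), here a=2, b=1
    B["ij"] += 2.0*A["ij"];
  }  // CTF objects destruct before MPI_Finalize
  MPI_Finalize();
  return 0;
}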
MPI_Comm comm
set of processors making up this world
int read_local(int64_t *num_pair, char **mapped_data, bool unpack_sym=false) const
read tensor data pairs local to processor including those with zero values WARNING: for sparse tensor...
void clear()
resets mapping to NOT_MAPPED
int conv_idx(int order, type const *cidx, int **iidx)
virtual MPI_Datatype mdtype() const
MPI datatype.
void clear_mapping()
zeros out mapping