Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
untyped_tensor.cxx
1 
2 #include "../interface/common.h"
3 #include "../interface/timer.h"
4 #include "../interface/idx_tensor.h"
5 #include "../summation/summation.h"
6 #include "../contraction/contraction.h"
7 #include "untyped_tensor.h"
8 #include "../shared/util.h"
9 #include "../shared/memcontrol.h"
10 #include "../redistribution/sparse_rw.h"
11 #include "../redistribution/pad.h"
12 #include "../redistribution/nosym_transp.h"
13 #include "../redistribution/redist.h"
14 #include "../redistribution/cyclic_reshuffle.h"
15 #include "../redistribution/glb_cyclic_reshuffle.h"
16 #include "../redistribution/dgtog_redist.h"
17 
18 
19 using namespace CTF;
20 
21 namespace CTF_int {
22 
23  LinModel<3> spredist_mdl(spredist_mdl_init,"spredist_mdl");
24  double spredist_est_time(int64_t size, int np){
25  double ps[] = {1.0, (double)log2(np), (double)size*log2(np)};
26  return spredist_mdl.est_time(ps);
27  }
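  /* Usage sketch: spredist_est_time() evaluates a 3-parameter linear cost model,
   * roughly t ~ c0 + c1*log2(np) + c2*size*log2(np), whose coefficients are fit at
   * runtime from LinModel observations. The numbers below are purely illustrative:
   *
   *   double ps[] = {1.0, log2(64.), 1.e6*log2(64.)}; // np = 64 ranks, size = 1e6 pairs
   *   double t    = spredist_mdl.est_time(ps);        // estimated seconds under the current fit
   */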
28 
29 // static const char * SY_strings[4] = {"NS", "SY", "AS", "SH"};
30 
31  Idx_Tensor tensor::operator[](const char * idx_map_){
32  Idx_Tensor idxtsr(this, idx_map_);
33  return idxtsr;
34  }
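  /* Usage sketch: operator[] wraps a tensor and an index string into an Idx_Tensor,
   * which is what enables Einstein-notation expressions. Assuming CTF_int::tensor
   * pointers A, B, C with compatible edge lengths over the same algstrct:
   *
   *   (*C)["ij"] += (*A)["ik"] * (*B)["kj"];   // contraction over the shared index k
   */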
35 
36  tensor::tensor(){
37  order=-1;
38  }
39 
40  void tensor::free_self(){
41  if (order != -1){
42  if (wrld->rank == 0) DPRINTF(3,"Deleted order %d tensor %s\n",order,name);
43  if (is_folded) unfold();
44  cdealloc(sym);
45  cdealloc(lens);
46  cdealloc(pad_edge_len);
47  cdealloc(padding);
48  if (is_scp_padded)
49  cdealloc(scp_padding);
50  cdealloc(sym_table);
51  delete [] edge_map;
52  deregister_size();
53  if (!is_data_aliased){
54  if (is_home){
55  if (!is_sparse) sr->dealloc(home_buffer);
56  else {
57  sr->pair_dealloc(data);
58  }
59  } else {
60  if (data != NULL){
61  //if (order == 0) sr->dealloc(data);
62  if (!is_sparse) sr->dealloc(data);
63  else sr->pair_dealloc(data);
64  }
65  }
66  if (has_home && !is_home){
67  if (!is_sparse) sr->dealloc(home_buffer);
68  else sr->pair_dealloc(home_buffer);
69  }
70  }
71  if (is_sparse) cdealloc(nnz_blk);
72  order = -1;
73  delete sr;
74  cdealloc(name);
75  }
76  }
77 
78  tensor::~tensor(){
79  free_self();
80  }
81 
82  tensor::tensor(algstrct const * sr,
83  int order,
84  int const * edge_len,
85  int const * sym,
86  World * wrld,
87  bool alloc_data,
88  char const * name,
89  bool profile,
90  bool is_sparse){
91  this->init(sr, order,edge_len,sym,wrld,alloc_data,name,profile,is_sparse);
92  }
93 
94  tensor::tensor(algstrct const * sr,
95  int order,
96  bool is_sparse,
97  int const * edge_len,
98  int const * sym,
99  CTF::World * wrld,
100  char const * idx,
101  CTF::Idx_Partition const & prl,
102  CTF::Idx_Partition const & blk,
103  char const * name,
104  bool profile){
105  this->init(sr, order,edge_len,sym,wrld,0,name,profile,is_sparse);
106  set_distribution(idx, prl, blk);
107  if (is_sparse){
108  nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
109  std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
110 #ifdef HOME_CONTRACT
111  this->is_home = 1;
112  this->has_home = 1;
113  this->home_buffer = NULL;
114 #else
115  this->is_home = 0;
116  this->has_home = 0;
117 #endif
118  } else {
119 
120  this->data = sr->alloc(this->size);
121  this->sr->set(this->data, this->sr->addid(), this->size);
122 #ifdef HOME_CONTRACT
123  this->home_size = this->size;
124  register_size(home_size*sr->el_size);
125  this->has_home = 1;
126  this->is_home = 1;
127  this->home_buffer = this->data;
128 #else
129  this->is_home = 0;
130  this->has_home = 0;
131 #endif
132  }
133 
134  }
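  /* Usage sketch for the distribution-aware constructor above, via the public
   * CTF::Tensor interface (argument order assumed from interface/tensor.h; dw is a
   * hypothetical CTF::World): lay a 1000x1000 dense matrix out on a 4x2 processor
   * grid, with rows split over the 4-way dimension:
   *
   *   int lens[2]  = {1000, 1000};
   *   int syms[2]  = {NS, NS};
   *   int plens[2] = {4, 2};
   *   CTF::Partition grid(2, plens);
   *   CTF::Tensor<double> A(2, false, lens, syms, dw, "ij", grid["ij"], CTF::Idx_Partition(), "A");
   */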
135 
136  tensor::tensor(tensor const * other, bool copy, bool alloc_data){
137  char * nname = (char*)alloc(strlen(other->name) + 2);
138  char d[] = "\'";
139  strcpy(nname, other->name);
140  strcat(nname, d);
141  if (other->wrld->rank == 0) {
142  DPRINTF(2,"Cloning tensor %s into %s copy=%d alloc_data=%d\n",other->name, nname,copy, alloc_data);
143  }
144  this->init(other->sr, other->order, other->lens,
145  other->sym, other->wrld, (!copy & alloc_data), nname,
146  other->profile, other->is_sparse);
147  cdealloc(nname);
148 
149  this->has_zero_edge_len = other->has_zero_edge_len;
150 
151  if (copy) {
152  copy_tensor_data(other);
153  } else if (!alloc_data) data = NULL;
154 
155  }
156 
157  tensor::tensor(tensor * other, int const * new_sym){
158  char * nname = (char*)alloc(strlen(other->name) + 2);
159  char d[] = "\'";
160  strcpy(nname, other->name);
161  strcat(nname, d);
162  if (other->wrld->rank == 0) {
163  DPRINTF(2,"Repacking tensor %s into %s\n",other->name,nname);
164  }
165 
166  bool has_chng=false, less_sym=false, more_sym=false;
167  for (int i=0; i<other->order; i++){
168  if (other->sym[i] != new_sym[i]){
169  if (other->wrld->rank == 0) {
170  DPRINTF(2,"sym[%d] was %d now %d\n",i,other->sym[i],new_sym[i]);
171  }
172  has_chng = true;
173  if (other->sym[i] == NS){
174  assert(!less_sym);
175  more_sym = true;
176  }
177  if (new_sym[i] == NS){
178  assert(!more_sym);
179  less_sym = true;
180  }
181  }
182  }
183 
184  this->has_zero_edge_len = other->has_zero_edge_len;
185 
186 
187  if (!less_sym && !more_sym){
188  this->init(other->sr, other->order, other->lens,
189  new_sym, other->wrld, 0, nname,
190  other->profile, other->is_sparse);
191  copy_tensor_data(other);
192  if (has_chng)
193  zero_out_padding();
194  } else {
195  this->init(other->sr, other->order, other->lens,
196  new_sym, other->wrld, 1, nname,
197  other->profile, other->is_sparse);
198  int idx[order];
199  for (int j=0; j<order; j++){
200  idx[j] = j;
201  }
202  summation ts(other, idx, sr->mulid(), this, idx, sr->addid());
203  ts.sum_tensors(true);
204  }
205  cdealloc(nname);
206  }
207 
208 
209 
210  void tensor::copy_tensor_data(tensor const * other){
211  //FIXME: do not unfold
212 // if (other->is_folded) other->unfold();
213  ASSERT(!other->is_folded);
214  ASSERT(other->is_mapped);
215 
216  if (other->is_mapped && !other->is_sparse){
217  #ifdef HOME_CONTRACT
218  if (other->has_home){
219 /* if (this->has_home &&
220  (!this->is_home && this->home_size != other->home_size)){
221  CTF_int::cdealloc(this->home_buffer);
222  }*/
223  this->home_size = other->home_size;
224  register_size(this->home_size*sr->el_size);
225  this->home_buffer = sr->alloc(other->home_size);
226  if (other->is_home){
227  this->is_home = 1;
228  this->data = this->home_buffer;
229  } else {
230  /*if (this->is_home || this->home_size != other->home_size){
231  }*/
232  this->is_home = 0;
233  sr->copy(this->home_buffer, other->home_buffer, other->home_size);
234  //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
235  this->data = sr->alloc(other->size);
236  }
237  this->has_home = 1;
238  } else {
239  //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
240  this->data = sr->alloc(other->size);
241 /* if (this->has_home && !this->is_home){
242  CTF_int::cdealloc(this->home_buffer);
243  }*/
244  this->has_home = 0;
245  this->is_home = 0;
246  }
247  #else
248  //CTF_int::alloc_ptr(other->size*sr->el_size, (void**)&this->data);
249  this->data = sr->alloc(other->size);
250  #endif
251  sr->copy(this->data, other->data, other->size);
252  } else {
253  ASSERT(this->is_sparse);
254  has_home = other->has_home;
255  is_home = other->is_home;
256  this->home_buffer = other->home_buffer;
257  if (data!=NULL) sr->dealloc(this->data);
258  if (nnz_blk!=NULL) CTF_int::cdealloc(this->nnz_blk);
259  //CTF_int::alloc_ptr(other->nnz_loc*(sizeof(int64_t)+sr->el_size),
260  // (void**)&this->data);
261  this->data = sr->pair_alloc(other->nnz_loc);
262  CTF_int::alloc_ptr(other->calc_nvirt()*sizeof(int64_t), (void**)&this->nnz_blk);
263  memcpy(this->nnz_blk, other->nnz_blk, other->calc_nvirt()*sizeof(int64_t));
264  this->set_new_nnz_glb(other->nnz_blk);
265  sr->copy_pairs(this->data, other->data, other->nnz_loc);
266  }
267  if (this->is_folded){
268  delete this->rec_tsr;
269  }
270  this->is_folded = other->is_folded;
271  if (other->is_folded){
272  tensor * itsr = other->rec_tsr;
273  tensor * rtsr = new tensor(itsr->sr, itsr->order, itsr->lens, itsr->sym, itsr->wrld, 0);
274  CTF_int::alloc_ptr(sizeof(int)*other->order,
275  (void**)&this->inner_ordering);
276  for (int i=0; i<other->order; i++){
277  this->inner_ordering[i] = other->inner_ordering[i];
278  }
279  this->rec_tsr = rtsr;
280  }
281 
282  this->order = other->order;
283  memcpy(this->pad_edge_len, other->pad_edge_len, sizeof(int)*other->order);
284  memcpy(this->padding, other->padding, sizeof(int)*other->order);
285  this->is_mapped = other->is_mapped;
286  this->is_cyclic = other->is_cyclic;
287  this->topo = other->topo;
288  if (other->is_mapped)
289  copy_mapping(other->order, other->edge_map, this->edge_map);
290  this->size = other->size;
291  this->nnz_loc = other->nnz_loc;
292  this->nnz_tot = other->nnz_tot;
293  //this->nnz_loc_max = other->nnz_loc_max;
294 #if DEBUG>= 1
295  if (wrld->rank == 0){
296  if (is_sparse){
297  printf("New sparse tensor %s copied from %s of size %ld nonzeros (%ld bytes) locally, %ld nonzeros total:\n",name, other->name, this->nnz_loc,this->nnz_loc*sr->el_size,nnz_tot);
298  } else {
299  printf("New tensor %s copied from %s of size %ld elms (%ld bytes):\n",name, other->name, this->size,this->size*sr->el_size);
300  }
301  }
302 #endif
303 
304  }
305 
306  void tensor::init(algstrct const * sr_,
307  int order_,
308  int const * edge_len,
309  int const * sym_,
310  World * wrld_,
311  bool alloc_data,
312  char const * name_,
313  bool profile_,
314  bool is_sparse_){
315  TAU_FSTART(init_tensor);
316  this->sr = sr_->clone();
317  this->order = order_;
318  this->wrld = wrld_;
319  this->is_scp_padded = 0;
320  this->is_mapped = 0;
321  this->topo = NULL;
322  this->is_cyclic = 1;
323  this->size = 0;
324  this->is_folded = 0;
325  this->is_data_aliased = 0;
326  this->has_zero_edge_len = 0;
327  this->is_home = 0;
328  this->has_home = 0;
329  this->profile = profile_;
330  this->is_sparse = is_sparse_;
331  this->data = NULL;
332  this->nnz_loc = 0;
333  this->nnz_tot = 0;
334  this->nnz_blk = NULL;
335  this->is_csr = false;
336  this->nrow_idx = -1;
337  this->left_home_transp = 0;
338 // this->nnz_loc_max = 0;
339  this->registered_alloc_size = 0;
340  if (name_ != NULL){
341  this->name = (char*)alloc(strlen(name_)+1);
342  strcpy(this->name, name_);
343  } else {
344  this->name = (char*)alloc(7*sizeof(char));
345  for (int i=0; i<4; i++){
346  this->name[i] = 'A'+(wrld->glob_wrld_rng()%26);
347  }
348  this->name[4] = '0'+(order_/10);
349  this->name[5] = '0'+(order_%10);
350  this->name[6] = '\0';
351  }
352  this->home_buffer = NULL;
353  if (wrld->rank == 0)
354  DPRINTF(3,"Created order %d tensor %s, is_sparse = %d, allocated = %d\n",order,name,is_sparse,alloc_data);
355 
356  CTF_int::alloc_ptr(order*sizeof(int), (void**)&this->padding);
357  memset(this->padding, 0, order*sizeof(int));
358 
359  this->lens = (int*)CTF_int::alloc(order*sizeof(int));
360  memcpy(this->lens, edge_len, order*sizeof(int));
361  this->pad_edge_len = (int*)CTF_int::alloc(order*sizeof(int));
362  memcpy(this->pad_edge_len, lens, order*sizeof(int));
363  this->sym = (int*)CTF_int::alloc(order*sizeof(int));
364  sym_table = (int*)CTF_int::alloc(order*order*sizeof(int));
365  this->set_sym (sym_);
366  this->edge_map = new mapping[order];
367 
368  /* initialize map array and symmetry table */
369  for (int i=0; i<order; i++){
370  if (this->lens[i] <= 0) this->has_zero_edge_len = 1;
371  this->edge_map[i].type = NOT_MAPPED;
372  this->edge_map[i].has_child = 0;
373  this->edge_map[i].np = 1;
374  /*if (this->sym[i] != NS) {
 375  //FIXME: keep track of capabilities of algebraic structure and add more robust property checking
376  if (this->sym[i] == AS && !sr->is_ring){
377  if (wrld->rank == 0){
 378  printf("CTF ERROR: an antisymmetric tensor must be defined on a ring, yet no additive inverse was provided for this algstrct (see algstrct constructor), aborting.\n");
379  }
380  ABORT;
381  }
382  this->sym_table[(i+1)+i*order] = 1;
383  this->sym_table[(i+1)*order+i] = 1;
384  }*/
385  }
386  /* Set tensor data to zero. */
387  if (alloc_data){
388  int ret = set_zero();
389  ASSERT(ret == SUCCESS);
390  }
391  TAU_FSTOP(init_tensor);
392  }
393 
394  int * tensor::calc_phase() const {
395  mapping * map;
396  int * phase;
397  int i;
398  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phase);
399  for (i=0; i<this->order; i++){
400  map = this->edge_map + i;
401  phase[i] = map->calc_phase();
402  }
403  return phase;
404  }
405 
406  int tensor::calc_tot_phase() const {
407  int i, tot_phase;
408  int * phase = this->calc_phase();
409  tot_phase = 1;
410  for (i=0 ; i<this->order; i++){
411  tot_phase *= phase[i];
412  }
413  CTF_int::cdealloc(phase);
414  return tot_phase;
415  }
416 
417  int64_t tensor::calc_nvirt() const {
418  int j;
419  int64_t nvirt, tnvirt;
420  mapping * map;
421  nvirt = 1;
422  for (j=0; j<this->order; j++){
423  map = &this->edge_map[j];
424  while (map->has_child) map = map->child;
425  if (map->type == VIRTUAL_MAP){
426  tnvirt = nvirt*(int64_t)map->np;
427  if (tnvirt < nvirt) return UINT64_MAX;
428  else nvirt = tnvirt;
429  }
430  }
431  return nvirt;
432  }
433 
434 
435  int64_t tensor::calc_npe() const {
436  int j;
437  int64_t npe;
438  mapping * map;
439  npe = 1;
440  for (j=0; j<this->order; j++){
441  map = &this->edge_map[j];
442  if (map->type == PHYSICAL_MAP){
443  npe *= map->np;
444  }
445  while (map->has_child){
446  map = map->child;
447  if (map->type == PHYSICAL_MAP){
448  npe *= map->np;
449  }
450  }
451  }
452  return npe;
453  }
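  /* Worked example for the phase helpers above: if an edge is mapped to a physical
   * dimension of 4 processes with a virtual (cyclic) factor of 2, then for that edge
   * calc_phys_phase() = 4 and calc_phase() = 4*2 = 8. Across edges, calc_nvirt() is
   * the product of the virtual factors (here 2) and calc_npe() the product of the
   * physical ones (here 4).
   */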
454 
455 
456  void tensor::set_padding(){
457  int j, pad, i;
458  int * new_phase, * sub_edge_len;
459  mapping * map;
460  //if (!is_mapped) return;
461 
462  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&new_phase);
463  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&sub_edge_len);
464 
465 /*
466  for (i=0; i<this->order; i++){
467  this->edge_len[i] -= this->padding[i];
468  }*/
469 
470  for (j=0; j<this->order; j++){
471  map = this->edge_map + j;
472  new_phase[j] = map->calc_phase();
473  pad = this->lens[j]%new_phase[j];
474  if (pad != 0) {
475  pad = new_phase[j]-pad;
476  }
477  this->padding[j] = pad;
478  }
479  for (i=0; i<this->order; i++){
480  this->pad_edge_len[i] = this->lens[i] + this->padding[i];
481  sub_edge_len[i] = this->pad_edge_len[i]/new_phase[i];
482  }
483 
484  this->size = calc_nvirt()*sy_packed_size(this->order, sub_edge_len, this->sym);
485 
 486  //NEW: I think it's always true
487  //is_mapped = 1;
488 
489  CTF_int::cdealloc(sub_edge_len);
490  CTF_int::cdealloc(new_phase);
491  }
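  /* Worked example of the padding rule above: an edge with lens[j] = 10 and total
   * phase new_phase[j] = 4 leaves a remainder 10 % 4 = 2, so padding[j] = 4-2 = 2,
   * pad_edge_len[j] = 12 and sub_edge_len[j] = 12/4 = 3; the local size is then
   * calc_nvirt() * sy_packed_size(order, sub_edge_len, sym).
   */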
492 
493  int tensor::set(char const * val) {
494  sr->set(this->data, val, this->size);
495  return zero_out_padding();
496  }
497 
498 
499  int tensor::set_zero() {
500  TAU_FSTART(set_zero_tsr);
501  int * restricted;
502  int i, map_success, btopo;
503 // int64_t nvirt, bnvirt;
504  int64_t memuse, bmemuse;
505 
506  if (this->is_mapped){
507  if (is_sparse){
508  sr->pair_dealloc(this->data);
509  this->data = NULL;
510  this->home_buffer = NULL;
511 // this->size = 0;
512  memset(this->nnz_blk, 0, sizeof(int64_t)*calc_nvirt());
513  this->set_new_nnz_glb(this->nnz_blk);
514  } else {
515  sr->set(this->data, sr->addid(), this->size);
516  }
517  } else {
518  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&restricted);
519  // memset(restricted, 0, this->order*sizeof(int));
520 
521  /* Map the tensor if necessary */
522 // bnvirt = INT64_MAX;
523  btopo = -1;
524  bmemuse = INT64_MAX;
525  bool fully_distributed = false;
526  for (i=wrld->rank; i<(int64_t)wrld->topovec.size(); i+=wrld->np){
527  this->clear_mapping();
528  this->set_padding();
529  memset(restricted, 0, this->order*sizeof(int));
530  map_success = map_tensor(wrld->topovec[i]->order, this->order, this->pad_edge_len,
531  this->sym_table, restricted,
532  wrld->topovec[i]->dim_comm, NULL, 0,
533  this->edge_map);
534  if (map_success == ERROR) {
535  ASSERT(0);
536  TAU_FSTOP(set_zero_tsr);
537  return ERROR;
538  } else if (map_success == SUCCESS){
539  this->topo = wrld->topovec[i];
540  this->set_padding();
541  memuse = (int64_t)this->size;
542  if (!is_sparse && (int64_t)memuse*sr->el_size >= (int64_t)proc_bytes_available()){
543  DPRINTF(1,"Not enough memory %E to map tensor (size %E) on topo %d\n", (double)proc_bytes_available(),(double)memuse*sr->el_size,i);
544  continue;
545  }
546  int64_t sum_phases = 0;
547  int64_t prod_phys_phases = 1;
548  for (int j=0; j<this->order; j++){
549  int phase = this->edge_map[j].calc_phase();
550  prod_phys_phases *= this->edge_map[j].calc_phys_phase();
551  int max_lcm_phase = phase;
552  for (int k=0; k<this->order; k++){
553  max_lcm_phase = std::max(max_lcm_phase,lcm(phase,this->edge_map[k].calc_phase()));
554  }
555  sum_phases += max_lcm_phase + phase;
556  }
557  memuse = memuse*(1.+((double)sum_phases)/(4.*wrld->topovec[i]->glb_comm.np));
558 
559 
560 
561 // nvirt = (int64_t)this->calc_nvirt();
562  // ASSERT(nvirt != 0);
 563  //for consistency with old code compare nvirt, but might be better to discard
564  if (btopo == -1){ // || nvirt < bnvirt){
565  // bnvirt = nvirt;
566  btopo = i;
567  bmemuse = memuse;
568  fully_distributed = prod_phys_phases == wrld->np;
569  } else if ((memuse < bmemuse && !fully_distributed) || (memuse < bmemuse && prod_phys_phases == wrld->np)){
570  btopo = i;
571  bmemuse = memuse;
572  fully_distributed = prod_phys_phases == wrld->np;
573  }
574  } else
575  DPRINTF(1,"Unsuccessful in map_tensor() in set_zero()\n");
576  }
577  if (btopo == -1)
578  bmemuse = INT64_MAX;
579  /* pick lower dimensional mappings, if equivalent */
581  int btopo1 = get_best_topo((1-fully_distributed)*INT64_MAX+fully_distributed*bmemuse, btopo, wrld->cdt);
582  btopo = get_best_topo(bmemuse, btopo, wrld->cdt);
583  if (btopo != btopo1 && btopo1 != -1) btopo = btopo1;
584 
585  if (btopo == -1 || btopo == INT_MAX) {
586  if (wrld->rank==0)
587  printf("ERROR: FAILED TO MAP TENSOR\n");
588  MPI_Barrier(MPI_COMM_WORLD);
589  TAU_FSTOP(set_zero_tsr);
590  ASSERT(0);
591  return ERROR;
592  }
593 
594  memset(restricted, 0, this->order*sizeof(int));
595  this->clear_mapping();
596  this->set_padding();
597  map_success = map_tensor(wrld->topovec[btopo]->order, this->order,
598  this->pad_edge_len, this->sym_table, restricted,
599  wrld->topovec[btopo]->dim_comm, NULL, 0,
600  this->edge_map);
601  ASSERT(map_success == SUCCESS);
602 
603  this->topo = wrld->topovec[btopo];
604 
605  CTF_int::cdealloc(restricted);
606 
607  this->is_mapped = 1;
608  this->set_padding();
609 
610  if (!is_sparse && this->size > INT_MAX && wrld->rank == 0)
 611  printf("CTF WARNING: Tensor %s has local size %ld, which is greater than INT_MAX=%d, so MPI could run into problems\n", name, size, INT_MAX);
612 
613  if (is_sparse){
614  nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
615  std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
616  this->is_home = 1;
617  this->has_home = 1;
618  this->home_buffer = NULL;
619  } else {
620  #ifdef HOME_CONTRACT
621  if (this->order > 0){
622  this->home_size = this->size; //MAX(1024+this->size, 1.20*this->size);
623  this->is_home = 1;
624  this->has_home = 1;
625  //this->is_home = 0;
626  //this->has_home = 0;
627  /* if (wrld->rank == 0)
628  DPRINTF(3,"Initial size of tensor %d is " PRId64 ",",tensor_id,this->size);*/
629  this->home_buffer = sr->alloc(this->home_size);
630  if (wrld->rank == 0) DPRINTF(2,"Creating home of %s\n",name);
631  register_size(this->size*sr->el_size);
632  this->data = this->home_buffer;
633  } else {
634  this->data = sr->alloc(this->size);
635  }
636  #else
637  this->data = sr->alloc(this->size);
638  //CTF_int::alloc_ptr(this->size*sr->el_size, (void**)&this->data);
639  #endif
640  #if DEBUG >= 2
641  if (wrld->rank == 0)
642  printf("New tensor %s defined of size %ld elms (%ld bytes):\n",name, this->size,this->size*sr->el_size);
643  this->print_map(stdout);
644  #endif
645  sr->init(this->size, this->data);
646  }
647  }
648  TAU_FSTOP(set_zero_tsr);
649  return SUCCESS;
650  }
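  /* Mapping-selection sketch for set_zero(): each rank scores a strided subset of
   * the candidate topologies. A candidate is discarded if its padded size does not
   * fit in proc_bytes_available(); otherwise its memory estimate is inflated by a
   * load-imbalance penalty, memuse *= 1 + sum_phases/(4*np), and get_best_topo()
   * reduces over all ranks, preferring mappings that use every physical process.
   */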
651 
652  void tensor::print_map(FILE * stream, bool allcall) const {
653  if (!allcall || wrld->rank == 0){
654  if (is_sparse)
655  printf("printing mapping of sparse tensor %s\n",name);
656  else
657  printf("printing mapping of dense tensor %s\n",name);
658  if (topo != NULL){
659  printf("CTF: %s mapped to order %d topology with dims:",name,topo->order);
660  for (int dim=0; dim<topo->order; dim++){
661  printf(" %d ",topo->lens[dim]);
662  }
663  }
664  printf("\n");
665  char tname[200];
666  tname[0] = '\0';
667  sprintf(tname, "%s[", name);
668  for (int dim=0; dim<order; dim++){
669  if (dim>0)
670  sprintf(tname+strlen(tname), ",");
671  int tp = edge_map[dim].calc_phase();
672  int pp = edge_map[dim].calc_phys_phase();
673  int vp = tp/pp;
674  if (tp==1) sprintf(tname+strlen(tname),"1");
675  else {
676  if (pp > 1){
677  sprintf(tname+strlen(tname),"p%d(%d)",edge_map[dim].np,edge_map[dim].cdt);
678  if (edge_map[dim].has_child && edge_map[dim].child->type == PHYSICAL_MAP)
679  sprintf(tname+strlen(tname),"p%d(%d)",edge_map[dim].child->np,edge_map[dim].child->cdt);
680  }
681  if (vp > 1) sprintf(tname+strlen(tname),"v%d",vp);
682  }
683 // sprintf(tname+strlen(tname),"c%d",edge_map[dim].has_child);
684  }
685  sprintf(tname+strlen(tname), "]");
686  /*printf("CTF: Tensor mapping is %s\n",tname);
687  printf("\nCTF: sym len tphs pphs vphs\n");
688  for (int dim=0; dim<order; dim++){
689  int tp = edge_map[dim].calc_phase();
690  int pp = edge_map[dim].calc_phys_phase();
691  int vp = tp/pp;
692  printf("CTF: %5d %5d %5d %5d\n", lens[dim], tp, pp, vp);
693  }*/
694  }
695  }
696 
697  void tensor::set_name(char const * name_){
698  cdealloc(name);
699  this->name = (char*)alloc(strlen(name_)+1);
700  strcpy(this->name, name_);
701  }
702 
703  char const * tensor::get_name() const {
704  return name;
705  }
706 
707  void tensor::profile_on(){
708  profile = true;
709  }
710 
711  void tensor::profile_off(){
712  profile = false;
713  }
714 
715  void tensor::get_raw_data(char ** data_, int64_t * size_) const {
716  *size_ = size;
717  *data_ = data;
718  }
719 
 720  int tensor::permute(tensor * A,
 721  int * const * permutation_A,
722  char const * alpha,
723  int * const * permutation_B,
724  char const * beta){
725  int64_t sz_A, blk_sz_A, sz_B, blk_sz_B;
726  char * all_data_A;
727  char * all_data_B;
728  tensor * tsr_A, * tsr_B;
729  int ret;
730 
731  tsr_A = A;
732  tsr_B = this;
733 
734  if (permutation_B != NULL){
735  ASSERT(permutation_A == NULL);
736  ASSERT(tsr_B->wrld->np <= tsr_A->wrld->np);
737  if (tsr_B->order == 0 || tsr_B->has_zero_edge_len){
738  blk_sz_B = 0;
739  all_data_B = NULL;
740  } else {
 741  if (wrld->rank == 0 && tsr_B->is_sparse) printf("CTF ERROR: please use the other variant of the permute function when the output is sparse\n");
742  assert(!tsr_B->is_sparse);
743  tsr_B->read_local(&sz_B, &all_data_B, true);
744  //permute all_data_B
745  permute_keys(tsr_B->order, sz_B, tsr_B->lens, tsr_A->lens, permutation_B, all_data_B, &blk_sz_B, sr);
746  }
747  ret = tsr_A->write(blk_sz_B, sr->mulid(), sr->addid(), all_data_B, 'r');
748  if (blk_sz_B > 0)
749  depermute_keys(tsr_B->order, blk_sz_B, tsr_B->lens, tsr_A->lens, permutation_B, all_data_B, sr);
750  all_data_A = all_data_B;
751  blk_sz_A = blk_sz_B;
752  } else {
753  ASSERT(tsr_B->wrld->np >= tsr_A->wrld->np);
754  if (tsr_A->order == 0 || tsr_A->has_zero_edge_len){
755  blk_sz_A = 0;
756  all_data_A = NULL;
757  } else {
758  ASSERT(permutation_A != NULL);
759  ASSERT(permutation_B == NULL);
760  tsr_A->read_local(&sz_A, &all_data_A, true);
761  //permute all_data_A
762  permute_keys(tsr_A->order, sz_A, tsr_A->lens, tsr_B->lens, permutation_A, all_data_A, &blk_sz_A, sr);
763  }
764  }
765  /*printf("alpha: "); tsr_B->sr->print(alpha);
766  printf(" beta: "); tsr_B->sr->print(beta);
767  printf(", writing first value is "); tsr_B->sr->print(all_data_A+sizeof(int64_t));
768  printf("\n");*/
769  ret = tsr_B->write(blk_sz_A, alpha, beta, all_data_A, 'w');
770 
771  if (blk_sz_A > 0)
772  tsr_A->sr->pair_dealloc(all_data_A);
773 
774  return ret;
775  }
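  /* Usage sketch for permute() above: exactly one of permutation_A / permutation_B
   * may be non-NULL (asserted). Roughly, permutation_B gathers into this tensor,
   * B[i...] <- beta*B[i...] + alpha*A[permutation_B[0][i]...], while permutation_A
   * scatters A, B[permutation_A[0][i]...] <- alpha*A[i...]. E.g. to fill a 2x2
   * tensor B with rows {5,7} and columns {0,3} of a larger A (A, B are hypothetical
   * CTF_int::tensor pointers over the same algstrct and world):
   *
   *   int rows[2] = {5, 7}, cols[2] = {0, 3};
   *   int * sel[2] = {rows, cols};
   *   B->permute(A, NULL, B->sr->mulid(), sel, B->sr->addid());
   */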
776 
777  void tensor::orient_subworld(CTF::World * greater_world,
778  int & bw_mirror_rank,
779  int & fw_mirror_rank,
780  distribution *& odst,
781  char ** sub_buffer_){
782  int is_sub = 0;
783  //FIXME: assumes order 0 dummy, what if we run this on actual order 0 tensor?
784  if (order != -1) is_sub = 1;
785  int tot_sub;
786  greater_world->cdt.allred(&is_sub, &tot_sub, 1, MPI_INT, MPI_SUM);
787  //ensure the number of processes that have a subcomm defined is equal to the size of the subcomm
788  //this should in most sane cases ensure that a unique subcomm is involved
789  if (order != -1) ASSERT(tot_sub == wrld->np);
790  int aorder;
791  greater_world->cdt.allred(&order, &aorder, 1, MPI_INT, MPI_MAX);
792 
793  int sub_root_rank = 0;
794  int buf_sz = get_distribution_size(aorder);
795  char * buffer;
796  if (order >= 0 && wrld->rank == 0){
797  greater_world->cdt.allred(&greater_world->rank, &sub_root_rank, 1, MPI_INT, MPI_SUM);
798  ASSERT(sub_root_rank == greater_world->rank);
799  distribution dstrib = distribution(this);
800  int bsz;
801  dstrib.serialize(&buffer, &bsz);
802  ASSERT(bsz == buf_sz);
803  greater_world->cdt.bcast(buffer, buf_sz, MPI_CHAR, sub_root_rank);
804  } else {
805  buffer = (char*)CTF_int::alloc(buf_sz);
806  greater_world->cdt.allred(MPI_IN_PLACE, &sub_root_rank, 1, MPI_INT, MPI_SUM);
807  greater_world->cdt.bcast(buffer, buf_sz, MPI_CHAR, sub_root_rank);
808  }
809  odst = new distribution(buffer);
810  CTF_int::cdealloc(buffer);
811 
812  bw_mirror_rank = -1;
813  fw_mirror_rank = -1;
814  MPI_Request req;
815  if (order >= 0){
816  fw_mirror_rank = wrld->rank;
817  MPI_Isend(&(greater_world->rank), 1, MPI_INT, wrld->rank, 13, greater_world->comm, &req);
818  }
819  if (greater_world->rank < tot_sub){
820  MPI_Status stat;
821  MPI_Recv(&bw_mirror_rank, 1, MPI_INT, MPI_ANY_SOURCE, 13, greater_world->comm, &stat);
822  }
823  if (fw_mirror_rank >= 0){
824  MPI_Status stat;
825  MPI_Wait(&req, &stat);
826  }
827 
828  MPI_Request req1, req2;
829 
830  char * sub_buffer = sr->alloc(odst->size);
831 
832  char * rbuffer = NULL;
833  if (bw_mirror_rank >= 0){
834  rbuffer = (char*)CTF_int::alloc(buf_sz);
835  MPI_Irecv(rbuffer, buf_sz, MPI_CHAR, bw_mirror_rank, 0, greater_world->comm, &req1);
836  MPI_Irecv(sub_buffer, odst->size*sr->el_size, MPI_CHAR, bw_mirror_rank, 1, greater_world->comm, &req2);
837  }
838  if (fw_mirror_rank >= 0){
839  char * sbuffer;
840  distribution ndstr = distribution(this);
841  int bsz;
842  ndstr.serialize(&sbuffer, &bsz);
843  ASSERT(bsz == buf_sz);
844  MPI_Send(sbuffer, buf_sz, MPI_CHAR, fw_mirror_rank, 0, greater_world->comm);
845  MPI_Send(this->data, odst->size*sr->el_size, MPI_CHAR, fw_mirror_rank, 1, greater_world->comm);
846  CTF_int::cdealloc(sbuffer);
847  }
848  if (bw_mirror_rank >= 0){
849  MPI_Status stat;
850  MPI_Wait(&req1, &stat);
851  MPI_Wait(&req2, &stat);
852  delete odst;
853  odst = new distribution(rbuffer);
854  CTF_int::cdealloc(rbuffer);
855  } else
856  sr->set(sub_buffer, sr->addid(), odst->size);
857  *sub_buffer_ = sub_buffer;
858 
859  }
860 
861  void tensor::slice(int const * offsets_B,
862  int const * ends_B,
863  char const * beta,
864  tensor * A,
865  int const * offsets_A,
866  int const * ends_A,
867  char const * alpha){
868 
869  int64_t i, j, sz_A, blk_sz_A, sz_B, blk_sz_B;
870  char * all_data_A, * blk_data_A;
871  char * all_data_B, * blk_data_B;
872  tensor * tsr_A, * tsr_B;
873 
874  tsr_A = A;
875  tsr_B = this;
876 
877  int * padding_A = (int*)CTF_int::alloc(sizeof(int)*tsr_A->order);
878  int * toffset_A = (int*)CTF_int::alloc(sizeof(int)*tsr_A->order);
879  int * padding_B = (int*)CTF_int::alloc(sizeof(int)*tsr_B->order);
880  int * toffset_B = (int*)CTF_int::alloc(sizeof(int)*tsr_B->order);
881  for (i=0,j=0; i<this->order && j<A->order; i++, j++){
882  if (ends_A[j] - offsets_A[j] != ends_B[i] - offsets_B[i]){
883  if (ends_B[i] - offsets_B[i] == 1){ j--; continue; } // continue with i+1,j
884  if (ends_A[j] - offsets_A[j] == 1){ i--; continue; } // continue with i,j+1
885  printf("CTF ERROR: slice dimensions inconsistent 1\n");
886  ASSERT(0);
887  return;
888  }
889  }
890 
891  while (A->order != 0 && i < this->order){
892  if (ends_B[i] - offsets_B[i] == 1){ i++; continue; }
893  printf("CTF ERROR: slice dimensions inconsistent 2\n");
894  ASSERT(0);
895  return;
896  }
897  while (this->order != 0 && j < A->order){
898  if (ends_A[j] - offsets_A[j] == 1){ j++; continue; }
899  printf("CTF ERROR: slice dimensions inconsistent 3\n");
900  ASSERT(0);
901  return;
902  }
903  // bool tsr_A_has_sym = false;
904 
905  if (tsr_B->wrld->np <= tsr_A->wrld->np){
 906  //usually 'read' elements of B from A, since B may be smaller than A
907  if (tsr_B->order == 0 || tsr_B->has_zero_edge_len){
908  blk_sz_B = 0;
909  blk_data_B = NULL;
910  } else {
911  tsr_B->read_local(&sz_B, &all_data_B, false);
912 
913  blk_data_B = tsr_B->sr->pair_alloc(sz_B);
914 
915  for (i=0; i<tsr_B->order; i++){
916  padding_B[i] = tsr_B->lens[i] - ends_B[i];
917  }
918  depad_tsr(tsr_B->order, sz_B, ends_B, tsr_B->sym, padding_B, offsets_B,
919  all_data_B, blk_data_B, &blk_sz_B, sr);
920  if (sz_B > 0)
921  sr->pair_dealloc(all_data_B);
922 
923  for (i=0; i<tsr_B->order; i++){
924  toffset_B[i] = -offsets_B[i];
925  padding_B[i] = ends_B[i]-offsets_B[i]-tsr_B->lens[i];
926  }
927  PairIterator pblk_data_B = PairIterator(sr, blk_data_B);
928  pad_key(tsr_B->order, blk_sz_B, tsr_B->lens,
929  padding_B, pblk_data_B, sr, toffset_B);
930  for (i=0; i<tsr_A->order; i++){
931  toffset_A[i] = ends_A[i] - offsets_A[i];
932  padding_A[i] = tsr_A->lens[i] - toffset_A[i];
933  }
934  pad_key(tsr_A->order, blk_sz_B, toffset_A,
935  padding_A, pblk_data_B, sr, offsets_A);
936  }
937  tsr_A->write(blk_sz_B, sr->mulid(), sr->addid(), blk_data_B, 'r');
938  all_data_A = blk_data_B;
939  sz_A = blk_sz_B;
940  } else {
941  tsr_A->read_local(&sz_A, &all_data_A, true);
942  //printf("sz_A=%ld\n",sz_A);
943  }
944 
945  if (tsr_A->order == 0 || tsr_A->has_zero_edge_len){
946  blk_sz_A = 0;
947  blk_data_A = NULL;
948  } else {
949  blk_data_A = tsr_A->sr->pair_alloc(sz_A);
950 
951  for (i=0; i<tsr_A->order; i++){
952  padding_A[i] = tsr_A->lens[i] - ends_A[i];
953  }
954  int nosym[tsr_A->order];
955  std::fill(nosym, nosym+tsr_A->order, NS);
956  depad_tsr(tsr_A->order, sz_A, ends_A, nosym, padding_A, offsets_A,
957  all_data_A, blk_data_A, &blk_sz_A, sr);
958  //if (sz_A > 0)
959  tsr_A->sr->pair_dealloc(all_data_A);
960 
961 
962  for (i=0; i<tsr_A->order; i++){
963  toffset_A[i] = -offsets_A[i];
964  padding_A[i] = ends_A[i]-offsets_A[i]-tsr_A->lens[i];
965  }
966  PairIterator pblk_data_A = PairIterator(sr, blk_data_A);
967  pad_key(tsr_A->order, blk_sz_A, tsr_A->lens,
968  padding_A, pblk_data_A, sr, toffset_A);
969  for (i=0; i<tsr_B->order; i++){
970  toffset_B[i] = ends_B[i] - offsets_B[i];
971  padding_B[i] = tsr_B->lens[i] - toffset_B[i];
972  }
973  pad_key(tsr_B->order, blk_sz_A, toffset_B,
974  padding_B, pblk_data_A, sr, offsets_B);
975  }
976 /* printf("alpha is "); tsr_B->sr->print(alpha); printf("\n");
977  printf("beta is "); tsr_B->sr->print(beta); printf("\n");
978  printf("writing B blk_sz_A = %ld key =%ld\n",blk_sz_A,*(int64_t*)blk_data_A);
979  tsr_B->sr->print(blk_data_A+sizeof(int64_t));*/
980 
981  tsr_B->write(blk_sz_A, alpha, beta, blk_data_A, 'w');
982  if (tsr_A->order != 0 && !tsr_A->has_zero_edge_len)
983  tsr_A->sr->pair_dealloc(blk_data_A);
984  CTF_int::cdealloc(padding_A);
985  CTF_int::cdealloc(padding_B);
986  CTF_int::cdealloc(toffset_A);
987  CTF_int::cdealloc(toffset_B);
988  }
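  /* Usage sketch for slice() above (offsets inclusive, ends exclusive): copy the
   * 2x3 block A[2:4, 1:4] over the top-left corner of B, i.e. B_block = 0*B + 1*A_block
   * (A, B are hypothetical CTF_int::tensor pointers over the same algstrct):
   *
   *   int offs_B[2] = {0, 0}, ends_B[2] = {2, 3};
   *   int offs_A[2] = {2, 1}, ends_A[2] = {4, 4};
   *   B->slice(offs_B, ends_B, B->sr->addid(), A, offs_A, ends_A, A->sr->mulid());
   */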
989 
990 //#define USE_SLICE_FOR_SUBWORLD
991  void tensor::add_to_subworld(tensor * tsr_sub,
992  char const * alpha,
993  char const * beta){
994  #ifdef USE_SLICE_FOR_SUBWORLD
995  int offsets[this->order];
996  memset(offsets, 0, this->order*sizeof(int));
997  if (tsr_sub->order == -1){ // == NULL){
998 // CommData * cdt = new CommData(MPI_COMM_SELF);
999  // (CommData*)CTF_int::alloc(sizeof(CommData));
1000  // SET_COMM(MPI_COMM_SELF, 0, 1, cdt);
1001  World dt_self = World(MPI_COMM_SELF);
1002  tensor stsr = tensor(sr, 0, NULL, NULL, &dt_self, 0);
1003  stsr.slice(NULL, NULL, beta, this, offsets, offsets, alpha);
1004  } else {
1005  tsr_sub->slice(offsets, lens, beta, this, offsets, lens, alpha);
1006  }
1007  #else
1008  int fw_mirror_rank, bw_mirror_rank;
1009  distribution * odst;
1010  char * sub_buffer;
1011  tsr_sub->orient_subworld(wrld, bw_mirror_rank, fw_mirror_rank, odst, &sub_buffer);
1012 
1013  distribution idst = distribution(this);
1014 
1015 /* redistribute(sym, wrld->comm, idst, this->data, alpha,
1016  odst, sub_buffer, beta);*/
1017  cyclic_reshuffle(sym, idst, NULL, NULL, *odst, NULL, NULL, &this->data, &sub_buffer, sr, wrld->cdt, 0, alpha, beta);
1018 
1019  MPI_Request req;
1020  if (fw_mirror_rank >= 0){
1021  ASSERT(tsr_sub != NULL);
1022  MPI_Irecv(tsr_sub->data, odst->size, tsr_sub->sr->mdtype(), fw_mirror_rank, 0, wrld->cdt.cm, &req);
1023  }
1024 
1025  if (bw_mirror_rank >= 0)
1026  MPI_Send(sub_buffer, odst->size, sr->mdtype(), bw_mirror_rank, 0, wrld->cdt.cm);
1027  if (fw_mirror_rank >= 0){
1028  MPI_Status stat;
1029  MPI_Wait(&req, &stat);
1030  }
1031  delete odst;
1032  sr->dealloc(sub_buffer);
1033  #endif
1034 
1035  }
1036 
1037  void tensor::add_from_subworld(tensor * tsr_sub,
1038  char const * alpha,
1039  char const * beta){
1040  #ifdef USE_SLICE_FOR_SUBWORLD
1041  int offsets[this->order];
1042  memset(offsets, 0, this->order*sizeof(int));
1043  if (tsr_sub->order == -1){ // == NULL){
1044  World dt_self = World(MPI_COMM_SELF);
1045  tensor stsr = tensor(sr, 0, NULL, NULL, &dt_self, 0);
1046  slice(offsets, offsets, beta, &stsr, NULL, NULL, alpha);
1047  } else {
1048  slice(offsets, lens, alpha, tsr_sub, offsets, lens, beta);
1049  }
1050  #else
1051  int fw_mirror_rank, bw_mirror_rank;
1052  distribution * odst;
1053  char * sub_buffer;
1054  tsr_sub->orient_subworld(wrld, bw_mirror_rank, fw_mirror_rank, odst, &sub_buffer);
1055 
1056  distribution idst = distribution(this);
1057 
1058 /* redistribute(sym, wrld->cdt, odst, sub_buffer, alpha,
1059  idst, this->data, beta);*/
1060  cyclic_reshuffle(sym, *odst, NULL, NULL, idst, NULL, NULL, &sub_buffer, &this->data, sr, wrld->cdt, 0, alpha, beta);
1061  delete odst;
1062  sr->dealloc(sub_buffer);
1063  #endif
1064 
1065  }
1066 
1067  void tensor::write(int64_t num_pair,
1068  char const * alpha,
1069  char const * beta,
1070  int64_t const *inds,
1071  char const * data){
1072  char * pairs = sr->pair_alloc(num_pair);
1073  PairIterator pr(sr, pairs);
1074  for (int64_t i=0; i<num_pair; i++){
1075  pr[i].write_key(inds[i]);
1076  pr[i].write_val(data+i*sr->el_size);
1077  }
1078  this->write(num_pair,alpha,beta,pairs,'w');
1079  sr->pair_dealloc(pairs);
1080  }
1081 
1082  int tensor::write(int64_t num_pair,
1083  char const * alpha,
1084  char const * beta,
1085  char * mapped_data,
1086  char const rw){
1087  int i, num_virt;
1088  int * phase, * phys_phase, * virt_phase, * bucket_lda;
1089  int * virt_phys_rank;
1090  mapping * map;
1091  tensor * tsr;
1092 
1093  #if DEBUG >= 1
1094  if (wrld->rank == 0){
1095  /* if (rw == 'w')
1096  printf("Writing data to tensor %d\n", tensor_id);
1097  else
1098  printf("Reading data from tensor %d\n", tensor_id);*/
1099  this->print_map(stdout);
1100  }
1101  #endif
1102  tsr = this;
1103 
1104  if (tsr->has_zero_edge_len) return SUCCESS;
1105  TAU_FSTART(write_pairs);
1106  ASSERT(!is_folded);
1107 
1108  if (tsr->is_mapped){
1109  tsr->set_padding();
1110  CTF_int::alloc_ptr(tsr->order*sizeof(int), (void**)&phase);
1111  CTF_int::alloc_ptr(tsr->order*sizeof(int), (void**)&phys_phase);
1112  CTF_int::alloc_ptr(tsr->order*sizeof(int), (void**)&virt_phys_rank);
1113  CTF_int::alloc_ptr(tsr->order*sizeof(int), (void**)&bucket_lda);
1114  CTF_int::alloc_ptr(tsr->order*sizeof(int), (void**)&virt_phase);
1115  num_virt = 1;
1116  /* Setup rank/phase arrays, given current mapping */
1117  for (i=0; i<tsr->order; i++){
1118  map = tsr->edge_map + i;
1119  phase[i] = map->calc_phase();
1120  phys_phase[i] = map->calc_phys_phase();
1121  virt_phase[i] = phase[i]/phys_phase[i];
1122  virt_phys_rank[i] = map->calc_phys_rank(tsr->topo);
1123  //*virt_phase[i];
1124  num_virt = num_virt*virt_phase[i];
1125  if (map->type == PHYSICAL_MAP)
1126  bucket_lda[i] = tsr->topo->lda[map->cdt];
1127  else
1128  bucket_lda[i] = 0;
1129  }
1130 
1131  // buffer write if not enough memory
1132  int npart = 1;
1133  int64_t max_memuse = proc_bytes_available();
1134  if (4*num_pair*sr->pair_size() >= max_memuse){
1135  npart = 1 + (6*num_pair*sr->pair_size())/max_memuse;
1136  }
1137  MPI_Allreduce(MPI_IN_PLACE, &npart, 1, MPI_INT, MPI_MAX, wrld->cdt.cm);
1138 
1139 /* int64_t max_np;
1140  MPI_Allreduce(&num_pair, &max_np, 1, MPI_INT64_T, MPI_MAX, wrld->cdt.cm);
1141  if (wrld->cdt.rank == 0) printf("Performing write of %ld (max %ld) elements (max mem %1.1E) in %d parts %1.5E memory available, %1.5E used\n", num_pair, max_np, (double)max_np*sr->pair_size(), npart, (double)max_memuse, (double)proc_bytes_used());*/
1142 
1143  int64_t part_size = num_pair/npart;
1144  for (int part = 0; part<npart; part++){
1145  int64_t nnz_loc_new;
1146  char * new_pairs;
1147  int64_t pnum_pair;
1148  if (part == npart-1) pnum_pair = num_pair - part_size*part;
1149  else pnum_pair = part_size;
1150  char * buf_ptr = mapped_data + sr->pair_size()*part_size*part;
1151  wr_pairs_layout(tsr->order,
1152  wrld->np,
1153  pnum_pair,
1154  alpha,
1155  beta,
1156  rw,
1157  num_virt,
1158  tsr->sym,
1159  tsr->pad_edge_len,
1160  tsr->padding,
1161  phase,
1162  phys_phase,
1163  virt_phase,
1164  virt_phys_rank,
1165  bucket_lda,
1166  buf_ptr,
1167  tsr->data,
1168  wrld->cdt,
1169  sr,
1170  is_sparse,
1171  nnz_loc,
1172  nnz_blk,
1173  new_pairs,
1174  nnz_loc_new);
1175  if (is_sparse && rw == 'w'){
1176  this->set_new_nnz_glb(nnz_blk);
1177  if (tsr->data != NULL) sr->pair_dealloc(tsr->data);
1178  tsr->data = new_pairs;
1179  /* for (int64_t i=0; i<nnz_loc; i++){
1180  printf("rank = %d, stores key %ld value %lf\n",wrld->rank,
1181  ((int64_t*)(new_pairs+i*sr->pair_size()))[0],
1182  ((double*)(new_pairs+i*sr->pair_size()+sizeof(int64_t)))[0]);
1183  }*/
1184  }
1185  }
1186 // if (wrld->cdt.rank == 0) printf("Completed write of %ld elements\n", num_pair);
1187 
1188  CTF_int::cdealloc(phase);
1189  CTF_int::cdealloc(phys_phase);
1190  CTF_int::cdealloc(virt_phys_rank);
1191  CTF_int::cdealloc(bucket_lda);
1192  CTF_int::cdealloc(virt_phase);
1193 
1194  } else {
1195  DEBUG_PRINTF("SHOULD NOT BE HERE, ALWAYS MAP ME\n");
1196  TAU_FSTOP(write_pairs);
1197  return ERROR;
1198  }
1199  TAU_FSTOP(write_pairs);
1200  return SUCCESS;
1201  }
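  /* Usage sketch for the bulk-write path above: every process may contribute an
   * arbitrary subset of (global index, value) pairs; the global index linearizes the
   * coordinates with the first mode varying fastest (for a matrix, i + j*nrow). With
   * the index/value variant defined earlier, alpha = sr->mulid() and beta = sr->addid()
   * overwrite existing entries (T is a hypothetical CTF_int::tensor* over doubles):
   *
   *   int64_t inds[2] = {0, 17};
   *   double  vals[2] = {1.0, 2.5};
   *   T->write(2, T->sr->mulid(), T->sr->addid(), inds, (char const*)vals);
   */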
1202 
1203  void tensor::read(int64_t num_pair,
1204  char const * alpha,
1205  char const * beta,
1206  int64_t const * inds,
1207  char * data){
1208  char * pairs = sr->pair_alloc(num_pair);
1209  PairIterator pr(sr, pairs);
1210  for (int64_t i=0; i<num_pair; i++){
1211  pr[i].write_key(inds[i]);
1212  pr[i].write_val(data+i*sr->el_size);
1213  }
1214  write(num_pair, alpha, beta, pairs, 'r');
1215  for (int64_t i=0; i<num_pair; i++){
1216  pr[i].read_val(data+i*sr->el_size);
1217  }
1218  sr->pair_dealloc(pairs);
1219  }
1220 
1221  int tensor::read(int64_t num_pair,
1222  char const * alpha,
1223  char const * beta,
1224  char * mapped_data){
1225  return write(num_pair, alpha, beta, mapped_data, 'r');
1226  }
1227 
1228  int tensor::read(int64_t num_pair,
1229  char * mapped_data){
1230  return write(num_pair, NULL, NULL, mapped_data, 'r');
1231  }
1232 
1233  void tensor::set_distribution(char const * idx,
1234  Idx_Partition const & prl_,
1235  Idx_Partition const & blk){
1236  Idx_Partition prl = prl_.reduce_order();
1237  topology * top = new topology(prl.part.order, prl.part.lens, wrld->cdt);
1238  int itopo = find_topology(top, wrld->topovec);
1239 /* if (wrld->rank == 0){
1240  for (int i=0; i<wrld->topovec.size(); i++){
1241  if (wrld->topovec[i]->order == 2){
1242  printf("topo %d lens %d %d\n", i, wrld->topovec[i]->lens[0], wrld->topovec[i]->lens[1]);
1243  }
1244  }
1245  printf("lens %d %d\n", top.lens[0], top.lens[1]);
1246  }*/
1247  if (itopo == -1){
1248  itopo = wrld->topovec.size();
1249  wrld->topovec.push_back(top);
1250  } else delete top;
1251  ASSERT(itopo != -1);
1252  assert(itopo != -1);
1253 
1254  this->clear_mapping();
1255 
1256  this->topo = wrld->topovec[itopo];
1257  for (int i=0; i<order; i++){
1258  mapping * map = this->edge_map+i;
1259  for (int j=0; j<prl.part.order; j++){
1260  if (idx[i] == prl.idx[j]){
1261  if (map->type != NOT_MAPPED){
1262  map->has_child = 1;
1263  map->child = new mapping();
1264  map = map->child;
1265  }
1266  map->type = PHYSICAL_MAP;
1267  map->np = this->topo->dim_comm[j].np;
1268  map->cdt = j;
1269  }
1270  }
1271  for (int j=0; j<blk.part.order; j++){
1272  mapping * map1 = map;
1273  if (idx[i] == blk.idx[j]){
1274  if (map1->type != NOT_MAPPED){
1275  assert(map1->type == PHYSICAL_MAP);
1276  map1->has_child = 1;
1277  map1->child = new mapping();
1278  map1 = map1->child;
1279  }
1280  map1->type = VIRTUAL_MAP;
1281  map1->np = blk.part.lens[j];
1282  }
1283  }
1284  }
1285  this->is_mapped = true;
1286  this->set_padding();
1287  int * idx_A;
1288  conv_idx(this->order, idx, &idx_A);
1289  if (!check_self_mapping(this, idx_A)){
1290  if (wrld->rank == 0)
1291  printf("CTF ERROR: invalid distribution in read() call, aborting.\n");
1292  ASSERT(0);
1293  assert(0);
1294  }
1295  cdealloc(idx_A);
1296  }
1297 
1298 
1299  char * tensor::read(char const * idx,
1300  Idx_Partition const & prl,
1301  Idx_Partition const & blk,
1302  bool unpack){
1303  if (unpack){
1304  for (int i=0; i<order; i++){
1305  if (sym[i] != NS){
1306  int new_sym[order];
1307  std::fill(new_sym, new_sym+order, NS);
1308  tensor tsr(sr, order, lens, new_sym, wrld, true);
1309  tsr[idx] += (*this)[idx];
1310  return tsr.read(idx, prl, blk, unpack);
1311  }
1312  }
1313  }
1314  distribution st_dist(this);
1315  tensor tsr_ali(this, 1, 1);
1316 #if DEBUG>=1
1317  if (wrld->rank == 0)
1318  printf("Redistributing via read() starting from distribution:\n");
1319  tsr_ali.print_map();
1320 #endif
1321  tsr_ali.clear_mapping();
1322  tsr_ali.set_distribution(idx, prl, blk);
1323  if (tsr_ali.has_home) deregister_size();
1324  tsr_ali.has_home = 0;
1325  tsr_ali.is_home = 0;
1326  tsr_ali.redistribute(st_dist);
1327  tsr_ali.is_data_aliased = 1;
1328  return tsr_ali.data;
1329 
1330  }
1331 
1332  int tensor::sparsify(char const * threshold,
1333  bool take_abs){
1334  if ((threshold == NULL && sr->addid() == NULL) ||
1335  (threshold != NULL && !sr->is_ordered())){
1336  return SUCCESS;
1337  }
1338  if (threshold == NULL)
1339  return sparsify([&](char const* c){ return !sr->isequal(c, sr->addid()); });
1340  else if (!take_abs)
1341  return sparsify([&](char const* c){
1342  char tmp[sr->el_size];
1343  sr->max(c,threshold,tmp);
1344  return !sr->isequal(tmp, threshold);
1345  });
1346  else
1347  return sparsify([&](char const* c){
1348  char tmp[sr->el_size];
1349  sr->abs(c,tmp);
1350  sr->max(tmp,threshold,tmp);
1351  return !sr->isequal(tmp, threshold);
1352  });
1353  }
1354 
1355  int tensor::sparsify(std::function<bool(char const*)> f){
1356  if (is_sparse){
1357  TAU_FSTART(sparsify);
1358  int64_t nnz_loc_new = 0;
1359  PairIterator pi(sr, data);
1360  int64_t nnz_blk_old[calc_nvirt()];
1361  memcpy(nnz_blk_old, nnz_blk, calc_nvirt()*sizeof(int64_t));
1362  memset(nnz_blk, 0, calc_nvirt()*sizeof(int64_t));
1363  int64_t i=0;
1364  bool * keep_vals;
1365  CTF_int::alloc_ptr(nnz_loc*sizeof(bool), (void**)&keep_vals);
1366  for (int v=0; v<calc_nvirt(); v++){
1367  for (int64_t j=0; j<nnz_blk_old[v]; j++,i++){
1368 // printf("Filtering %ldth/%ld elements %p %d %d\n",i,nnz_loc,pi.ptr,sr->el_size,sr->pair_size());
1369  ASSERT(i<nnz_loc);
1370  keep_vals[i] = f(pi[i].d());
1371  if (keep_vals[i]){
1372  nnz_loc_new++;
1373  nnz_blk[v]++;
1374  }
1375  }
1376  }
1377 
1378  // if we don't have any actual zeros don't do anything
1379  if (nnz_loc_new != nnz_loc){
1380  char * old_data = data;
1381  data = sr->pair_alloc(nnz_loc_new);
1382  PairIterator pi_new(sr, data);
1383  nnz_loc_new = 0;
1384  for (int64_t i=0; i<nnz_loc; i++){
1385  if (keep_vals[i]){
1386  pi_new[nnz_loc_new].write(pi[i].ptr);
1387  nnz_loc_new++;
1388  }
1389  }
1390  sr->dealloc(old_data);
1391  }
1392  CTF_int::cdealloc(keep_vals);
1393 
1394  this->set_new_nnz_glb(nnz_blk);
1395  //FIXME compute max nnz_loc?
1396  TAU_FSTOP(sparsify);
1397  } else {
1398  TAU_FSTART(sparsify_dense);
1399  ASSERT(!has_home || is_home);
1400  int nvirt = calc_nvirt();
1401  this->nnz_blk = (int64_t*)alloc(sizeof(int64_t)*nvirt);
1402 
1403 
1404  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
1405  int64_t * edge_lda;
1406  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&virt_phase);
1407  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phys_phase);
1408  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phase);
1409  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&virt_phys_rank);
1410  CTF_int::alloc_ptr(sizeof(int64_t)*this->order, (void**)&edge_lda);
1411  char * old_data = this->data;
1412 
1413  nvirt = 1;
1414  int idx_lyr = wrld->rank;
1415  for (int i=0; i<this->order; i++){
 1416  /* Calculate rank and phase arrays */
1417  if (i == 0) edge_lda[0] = 1;
1418  else edge_lda[i] = edge_lda[i-1]*lens[i-1];
1419  mapping const * map = this->edge_map + i;
1420  phase[i] = map->calc_phase();
1421  phys_phase[i] = map->calc_phys_phase();
1422  virt_phase[i] = phase[i]/phys_phase[i];
1423  virt_phys_rank[i] = map->calc_phys_rank(this->topo);//*virt_phase[i];
1424  nvirt = nvirt*virt_phase[i];
1425 
1426  if (map->type == PHYSICAL_MAP)
1427  idx_lyr -= this->topo->lda[map->cdt]
1428  *virt_phys_rank[i];
1429  }
1430  if (idx_lyr == 0){
1431  //invalid for nondeterministic f
1432  /*if (!f(this->sr->addid())){
1433  spsfy_tsr(this->order, this->size, nvirt,
1434  this->pad_edge_len, this->sym, phase,
1435  phys_phase, virt_phase, virt_phys_rank,
1436  this->data, this->data, this->nnz_blk, this->sr, edge_lda, f);
1437  } else {*/
1438  //printf("sparsifying with padding handling\n");
1439  // if zero passes filter, then padding may be included, so get rid of it
1440  int * depadding;
1441  CTF_int::alloc_ptr(sizeof(int)*order, (void**)&depadding);
1442  for (int i=0; i<this->order; i++){
1443  if (i == 0) edge_lda[0] = 1;
1444  else edge_lda[i] = edge_lda[i-1]*this->pad_edge_len[i-1];
1445  depadding[i] = -padding[i];
1446  }
1447  int * prepadding;
1448  CTF_int::alloc_ptr(sizeof(int)*order, (void**)&prepadding);
1449  memset(prepadding, 0, sizeof(int)*order);
1450  spsfy_tsr(this->order, this->size, nvirt,
1451  this->pad_edge_len, this->sym, phase,
1452  phys_phase, virt_phase, virt_phys_rank,
1453  this->data, this->data, this->nnz_blk, this->sr, edge_lda, f);
1454  char * new_pairs[nvirt];
1455  char const * data_ptr = this->data;
1456  int64_t new_nnz_tot = 0;
1457  for (int v=0; v<nvirt; v++){
1458  if (nnz_blk[v] > 0){
1459  int64_t old_nnz = nnz_blk[v];
1460  new_pairs[v] = (char*)sr->pair_alloc(nnz_blk[v]);
1461  depad_tsr(order, nnz_blk[v], this->lens, this->sym, this->padding, prepadding,
1462  data_ptr, new_pairs[v], nnz_blk+v, sr);
1463  pad_key(order, nnz_blk[v], this->pad_edge_len, depadding, PairIterator(sr,new_pairs[v]), sr);
1464  data_ptr += old_nnz*sr->pair_size();
1465  new_nnz_tot += nnz_blk[v];
1466  } else new_pairs[v] = NULL;
1467  }
1468  cdealloc(depadding);
1469  cdealloc(prepadding);
1470  sr->pair_dealloc(this->data);
1471  this->data = sr->pair_alloc(new_nnz_tot);
1472  char * new_data_ptr = this->data;
1473  for (int v=0; v<nvirt; v++){
1474  if (new_pairs[v] != NULL){
 1475  sr->copy_pairs(new_data_ptr, new_pairs[v], nnz_blk[v]);
 new_data_ptr += nnz_blk[v]*sr->pair_size(); // advance past the copied block so the virtual blocks pack contiguously
 1476  sr->pair_dealloc(new_pairs[v]);
1477  }
1478  }
1479  //}
1480  } else {
1481  memset(nnz_blk, 0, sizeof(int64_t)*nvirt);
1482  this->data = NULL;
1483  }
1484 
1485  sr->dealloc(old_data);
1486  //become sparse
1487  if (has_home) deregister_size();
1488  is_home = true;
1489  has_home = true;
1490  home_buffer = NULL;
1491  is_sparse = true;
1492  nnz_loc = 0;
1493  nnz_tot = 0;
1494  this->set_new_nnz_glb(this->nnz_blk);
1495  cdealloc(virt_phase);
1496  cdealloc(phys_phase);
1497  cdealloc(phase);
1498  cdealloc(virt_phys_rank);
1499  cdealloc(edge_lda);
1500  TAU_FSTOP(sparsify_dense);
1501  }
1502  return SUCCESS;
1503  }
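  /* Usage sketch for sparsify() (T a hypothetical CTF_int::tensor* over doubles):
   * drop entries whose magnitude is at most 1e-10, either through the threshold form
   * (threshold passed as raw element data, take_abs = true) or through an arbitrary
   * keep-predicate:
   *
   *   double thr = 1e-10;
   *   T->sparsify((char const*)&thr, true);
   *   T->sparsify([](char const * v){ return fabs(((double const*)v)[0]) > 1e-10; });
   */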
1504 
1505  void tensor::read_local(int64_t * num_pair,
1506  int64_t ** inds,
1507  char ** data,
1508  bool unpack_sym) const {
1509  char * pairs;
1510  read_local(num_pair, &pairs, unpack_sym);
1511  *inds = (int64_t*)CTF_int::alloc(sizeof(int64_t)*(*num_pair));
1512  *data = (char*)CTF_int::alloc(sr->el_size*(*num_pair));
1513  ConstPairIterator pr(sr, pairs);
1514  for (int64_t i=0; i<*num_pair; i++){
1515  (*inds)[i] = pr[i].k();
1516  pr[i].read_val(*data+i*sr->el_size);
1517  }
1518  sr->pair_dealloc(pairs);
1519  }
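  /* Usage sketch for read_local() above (T a hypothetical CTF_int::tensor*): fetch
   * the (global index, value) pairs stored on the calling process; the caller owns
   * both output buffers, which were obtained from CTF_int::alloc:
   *
   *   int64_t np; int64_t * inds; char * vals;
   *   T->read_local(&np, &inds, &vals, true);
   *   // ... inspect inds[i] and vals + i*T->sr->el_size ...
   *   CTF_int::cdealloc(inds); CTF_int::cdealloc(vals);
   */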
1520 
1521 
1522  void tensor::read_local_nnz(int64_t * num_pair,
1523  int64_t ** inds,
1524  char ** data,
1525  bool unpack_sym) const {
1526  char * pairs;
1527  read_local_nnz(num_pair, &pairs, unpack_sym);
1528  *inds = (int64_t*)CTF_int::alloc(sizeof(int64_t)*(*num_pair));
1529  *data = (char*)CTF_int::alloc(sr->el_size*(*num_pair));
1530  ConstPairIterator pr(sr, pairs);
1531  for (int64_t i=0; i<*num_pair; i++){
1532  (*inds)[i] = pr[i].k();
1533  pr[i].read_val(*data+i*sr->el_size);
1534  }
1535  sr->pair_dealloc(pairs);
1536  }
1537 
1538  int tensor::read_local_nnz(int64_t * num_pair,
1539  char ** mapped_data,
1540  bool unpack_sym) const {
1541  if (sr->isequal(sr->addid(), NULL) && !is_sparse)
 1542  return read_local(num_pair,mapped_data, unpack_sym);
1543  tensor tsr_cpy(this);
1544  if (!is_sparse)
1545  tsr_cpy.sparsify();
1546  *mapped_data = tsr_cpy.data;
1547  *num_pair = tsr_cpy.nnz_loc;
1548  tsr_cpy.is_data_aliased = true;
1549  return SUCCESS;
1550  }
1551 
1552 
1553  int tensor::read_local(int64_t * num_pair,
1554  char ** mapped_data,
1555  bool unpack_sym) const {
1556  int i, num_virt, idx_lyr;
1557  int64_t np;
1558  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
1559  tensor const * tsr;
1560  char * pairs;
1561  mapping * map;
1562 
1563 
1564  tsr = this;
1565  if (tsr->has_zero_edge_len){
1566  *num_pair = 0;
1567  *mapped_data = NULL;
1568  return SUCCESS;
1569  }
1570  ASSERT(!tsr->is_folded);
1571  ASSERT(tsr->is_mapped);
1572 
1573 // tsr->set_padding();
1574 
1575  bool has_sym = false;
1576  for (i=0; i<this->order; i++){
1577  if (this->sym[i] != NS) has_sym = true;
1578  }
1579 
1580  if (tsr->is_sparse){
1581  char * nnz_data;
1582  int64_t num_nnz;
1583  read_local_nnz(&num_nnz, &nnz_data, unpack_sym);
1584  tensor dense_tsr(sr, order, lens, sym, wrld);
1585  dense_tsr.write(num_nnz, sr->mulid(), sr->addid(), nnz_data);
1586  sr->pair_dealloc(nnz_data);
1587  dense_tsr.read_local(num_pair, mapped_data, unpack_sym);
1588  //*num_pair = num_pair;
1589  return SUCCESS;
1590  } else if (has_sym && unpack_sym) {
1591  int nosym[this->order];
1592  std::fill(nosym, nosym+this->order, NS);
1593  tensor nosym_tsr(sr, order, lens, nosym, wrld);
1594  int idx[this->order];
1595  for (i=0; i<this->order; i++){
1596  idx[i] = i;
1597  }
1598  summation s((tensor*)this, idx, sr->mulid(), &nosym_tsr, idx, sr->mulid());
1599  s.execute();
1600  return nosym_tsr.read_local(num_pair, mapped_data);
1601  } else {
1602  TAU_FSTART(read_local_pairs);
1603  np = tsr->size;
1604 
1605  CTF_int::alloc_ptr(sizeof(int)*tsr->order, (void**)&virt_phase);
1606  CTF_int::alloc_ptr(sizeof(int)*tsr->order, (void**)&phys_phase);
1607  CTF_int::alloc_ptr(sizeof(int)*tsr->order, (void**)&phase);
1608  CTF_int::alloc_ptr(sizeof(int)*tsr->order, (void**)&virt_phys_rank);
1609 
1610 
1611  num_virt = 1;
1612  idx_lyr = wrld->rank;
1613  for (i=0; i<tsr->order; i++){
 1614  /* Calculate rank and phase arrays */
1615  map = tsr->edge_map + i;
1616  phase[i] = map->calc_phase();
1617  phys_phase[i] = map->calc_phys_phase();
1618  virt_phase[i] = phase[i]/phys_phase[i];
1619  virt_phys_rank[i] = map->calc_phys_rank(tsr->topo);//*virt_phase[i];
1620  num_virt = num_virt*virt_phase[i];
1621 
1622  if (map->type == PHYSICAL_MAP)
1623  idx_lyr -= tsr->topo->lda[map->cdt]
1624  *virt_phys_rank[i];
1625  }
1626  if (idx_lyr == 0){
1627  read_loc_pairs(tsr->order, np, num_virt,
1628  tsr->sym, tsr->pad_edge_len, tsr->padding,
1629  phase, phys_phase, virt_phase, virt_phys_rank, num_pair,
1630  tsr->data, &pairs, sr);
1631  *mapped_data = pairs;
1632  } else {
1633  *mapped_data = NULL;
1634  *num_pair = 0;
1635  }
1636 
1637 
1638  CTF_int::cdealloc((void*)virt_phase);
1639  CTF_int::cdealloc((void*)phys_phase);
1640  CTF_int::cdealloc((void*)phase);
1641  CTF_int::cdealloc((void*)virt_phys_rank);
1642 
1643  TAU_FSTOP(read_local_pairs);
1644  return SUCCESS;
1645  }
1646  }
1647 
1648  PairIterator tensor::read_all_pairs(int64_t * num_pair, bool unpack){
1649  int numPes;
1650  int * nXs;
1651  int nval, n, i;
1652  int * pXs;
1653  char * my_pairs, * all_pairs;
1654 
1655  numPes = wrld->np;
1656  if (has_zero_edge_len){
1657  *num_pair = 0;
1658  return PairIterator(sr, NULL);
1659  }
1660  //unpack symmetry
1661  /*if (unpack){
1662  bool is_nonsym=true;
1663  for (int i=0; i<order; i++){
1664  if (sym[i] != NS){
1665  is_nonsym = false;
1666  }
1667  }
1668  if (!is_nonsym){
1669  int sym_A[order];
1670  std::fill(sym_A, sym_A+order, NS);
1671  int idx_A[order];
1672  for (int i=0; i<order; i++){
1673  idx_A[i] = i;
1674  }
1675  tensor tA(sr, order, lens, sym_A, wrld, 1);
1676  //tA.leave_home_with_buffer();
1677  summation st(this, idx_A, sr->mulid(), &tA, idx_A, sr->mulid());
1678  st.execute();
1679  return PairIterator(sr,tA.read_all_pairs(num_pair, false).ptr);
1680  }
1681  }*/
1682  alloc_ptr(numPes*sizeof(int), (void**)&nXs);
1683  alloc_ptr(numPes*sizeof(int), (void**)&pXs);
1684  pXs[0] = 0;
1685 
1686  int64_t ntt = 0;
1687  my_pairs = NULL;
1688  read_local(&ntt, &my_pairs, unpack);
1689  n = (int)ntt;
1690  n*=sr->pair_size();
1691  MPI_Allgather(&n, 1, MPI_INT, nXs, 1, MPI_INT, wrld->comm);
1692  for (i=1; i<numPes; i++){
1693  pXs[i] = pXs[i-1]+nXs[i-1];
1694  }
1695  nval = pXs[numPes-1] + nXs[numPes-1];
1696  nval = nval/sr->pair_size();
1697  all_pairs = sr->pair_alloc(nval);
1698  MPI_Allgatherv(my_pairs, n, MPI_CHAR,
1699  all_pairs, nXs, pXs, MPI_CHAR, wrld->comm);
1700  cdealloc(nXs);
1701  cdealloc(pXs);
1702 
1703  PairIterator ipr(sr, all_pairs);
1704  ipr.sort(nval);
1705  if (n>0){
1706  sr->pair_dealloc(my_pairs);
1707  }
1708  *num_pair = nval;
1709  return ipr;
1710  }
1711 
1712  int64_t tensor::get_tot_size(bool packed=false){
1713  if (!packed){
1714  int64_t tsize = 1;
1715  for (int i=0; i<order; i++){
1716  tsize *= lens[i];
1717  }
1718  return tsize;
1719  } else {
1720  return packed_size(order, lens, sym);
1721  }
1722  }
1723 
1724  int tensor::allread(int64_t * num_pair,
1725  char ** all_data,
1726  bool unpack){
1727  PairIterator ipr = read_all_pairs(num_pair, unpack);
1728  char * ball_data = sr->alloc((*num_pair));
1729  for (int64_t i=0; i<*num_pair; i++){
1730  ipr[i].read_val(ball_data+i*sr->el_size);
1731  }
1732  if (ipr.ptr != NULL)
1733  sr->pair_dealloc(ipr.ptr);
1734  *all_data = ball_data;
1735  return SUCCESS;
1736  }
1737 
1738  int tensor::allread(int64_t * num_pair,
1739  char * all_data,
1740  bool unpack){
1741  PairIterator ipr = read_all_pairs(num_pair, unpack);
1742  for (int64_t i=0; i<*num_pair; i++){
1743  ipr[i].read_val(all_data+i*sr->el_size);
1744  }
1745  return SUCCESS;
1746  }
1747 
1748 
1749  int tensor::align(tensor const * B){
1750  if (B==this) return SUCCESS;
1751  ASSERT(!is_folded && !B->is_folded);
1752  ASSERT(B->wrld == wrld);
1753  ASSERT(B->order == order);
1754  distribution old_dist = distribution(this);
1755  bool is_changed = false;
1756  if (topo != B->topo) is_changed = true;
1757  topo = B->topo;
1758  for (int i=0; i<order; i++){
1759  if (!comp_dim_map(edge_map+i, B->edge_map+i)){
1760  edge_map[i].clear();
1761  copy_mapping(1, B->edge_map+i, edge_map+i);
1762  is_changed = true;
1763  }
1764  }
1765  if (is_changed){
1766  set_padding();
1767  return redistribute(old_dist);
1768  } else return SUCCESS;
1769  }
1770 
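  // reduce_sum / reduce_sumabs / reduce_sumsq: reduce the whole tensor to a
  // scalar by summing it (via the abs function for sumabs, or via a
  // self-contraction for the sum of squares) into an order-0 tensor, then
  // broadcasting the resulting value to all ranks.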
1771  int tensor::reduce_sum(char * result) {
1772  return reduce_sum(result, sr);
1773  }
1774 
1775  int tensor::reduce_sum(char * result, algstrct const * sr_other) {
1776  ASSERT(is_mapped && !is_folded);
1777  tensor sc = tensor(sr_other, 0, NULL, NULL, wrld, 1);
1778  int idx_A[order];
1779  for (int i=0; i<order; i++){
1780  idx_A[i] = i;
1781  }
1782  summation sm = summation(this, idx_A, sr_other->mulid(), &sc, NULL, sr_other->mulid());
1783  sm.execute();
1784  sr->copy(result, sc.data);
1785  wrld->cdt.bcast(result, sr_other->el_size, MPI_CHAR, 0);
1786  return SUCCESS;
1787  }
1788 
1789  int tensor::reduce_sumabs(char * result) {
1790  return reduce_sumabs(result, sr);
1791  }
1792 
1793  int tensor::reduce_sumabs(char * result, algstrct const * sr_other){
1794  ASSERT(is_mapped && !is_folded);
1795  univar_function func = univar_function(sr_other->abs);
1796  tensor sc = tensor(sr_other, 0, NULL, NULL, wrld, 1);
1797  int idx_A[order];
1798  for (int i=0; i<order; i++){
1799  idx_A[i] = i;
1800  }
1801  summation sm = summation(this, idx_A, sr->mulid(), &sc, NULL, sr_other->mulid(), &func);
1802  sm.execute();
1803  sr->copy(result, sc.data);
1804  wrld->cdt.bcast(result, sr->el_size, MPI_CHAR, 0);
1805  return SUCCESS;
1806  }
1807 
1808  int tensor::reduce_sumsq(char * result) {
1809  ASSERT(is_mapped && !is_folded);
1810  tensor sc = tensor(sr, 0, NULL, NULL, wrld, 1);
1811  int idx_A[order];
1812  for (int i=0; i<order; i++){
1813  idx_A[i] = i;
1814  }
1815  contraction ctr = contraction(this, idx_A, this, idx_A, sr->mulid(), &sc, NULL, sr->addid());
1816  ctr.execute();
1817  sr->copy(result, sc.data);
1818  wrld->cdt.bcast(result, sr->el_size, MPI_CHAR, 0);
1819  return SUCCESS;
1820  }
1821 
1822  void tensor::prnt() const {
1823  this->print();
1824  }
1825  void tensor::print(FILE * fp, char const * cutoff) const {
1826  int my_sz;
1827  int64_t imy_sz, tot_sz =0;
1828  int * recvcnts, * displs, * idx_arr;
1829  char * pmy_data, * pall_data;
1830  int64_t k;
1831 
1832  if (wrld->rank == 0)
1833  printf("Printing tensor %s\n",name);
1834 #ifdef DEBUG
1835  print_map(fp);
1836 #endif
1837 
1838  /*for (int i=0; i<this->size; i++){
1839  printf("this->data[%d] = ",i);
1840  sr->print(data+i*sr->el_size);
1841  printf("\n");
1842  }*/
1843 
1844 
1845  imy_sz = 0;
1846  if (cutoff != NULL){
1847  tensor tsr_cpy(this);
1848  tsr_cpy.sparsify(cutoff);
1849  tsr_cpy.read_local_nnz(&imy_sz, &pmy_data, true);
1850  } else
1851  read_local_nnz(&imy_sz, &pmy_data, true);
1852  /*PairIterator my_data = PairIterator(sr,pmy_data);
1853  for (int i=0; i<imy_sz; i++){
1854  printf("my_data[%d].k() = %ld my_data[%d].d() = ",i,my_data[i].k(),i);
1855  sr->print(my_data[i].d());
1856  printf("\n");
1857  }*/
1858 
1859  my_sz = imy_sz;
1860 
1861  if (wrld->rank == 0){
1862  alloc_ptr(wrld->np*sizeof(int), (void**)&recvcnts);
1863  alloc_ptr(wrld->np*sizeof(int), (void**)&displs);
1864  alloc_ptr(order*sizeof(int), (void**)&idx_arr);
1865  } else
1866  recvcnts = NULL;
1867 
1868  MPI_Gather(&my_sz, 1, MPI_INT, recvcnts, 1, MPI_INT, 0, wrld->cdt.cm);
1869 
1870  if (wrld->rank == 0){
1871  for (int i=0; i<wrld->np; i++){
1872  recvcnts[i] *= sr->pair_size();
1873  }
1874  displs[0] = 0;
1875  for (int i=1; i<wrld->np; i++){
1876  displs[i] = displs[i-1] + recvcnts[i-1];
1877  }
1878  tot_sz = (displs[wrld->np-1] + recvcnts[wrld->np-1])/sr->pair_size();
1879  //tot_sz = displs[wrld->np-1] + recvcnts[wrld->np-1];
1880  pall_data = sr->pair_alloc(tot_sz);
1881  } else {
1882  pall_data = NULL;
1883  displs = NULL;
1884  }
1885 
1886  //if (my_sz == 0) pmy_data = NULL;
1887  if (wrld->cdt.np == 1)
1888  pall_data = pmy_data;
1889  else {
1890  MPI_Gatherv(pmy_data, my_sz*sr->pair_size(), MPI_CHAR,
1891  pall_data, recvcnts, displs, MPI_CHAR, 0, wrld->cdt.cm);
1892  }
1893  PairIterator all_data = PairIterator(sr,pall_data);
1894  /*for (int i=0; i<tot_sz; i++){
1895  printf("all_data[%d].k() = %ld all_data[%d].d() = ",i,all_data[i].k(),i);
1896  sr->print(all_data[i].d());
1897  printf("\n");
1898  }*/
1899  if (wrld->rank == 0){
1900  all_data.sort(tot_sz);
1901  for (int64_t i=0; i<tot_sz; i++){
1902  /*if (cutoff != NULL){
1903  char absval[sr->el_size];
1904  sr->abs(all_data[i].d(),absval);
1905  sr->max(absval, cutoff, absval);
1906  if(sr->isequal(absval, cutoff)) continue;
1907  }*/
1908  k = all_data[i].k();
1909  for (int j=0; j<order; j++){
1910  //idx_arr[order-j-1] = k%lens[j];
1911  idx_arr[j] = k%lens[j];
1912  k = k/lens[j];
1913  }
1914  for (int j=0; j<order; j++){
1915  fprintf(fp,"[%d]",idx_arr[j]);
1916  }
1917  fprintf(fp,"(%ld, <",all_data[i].k());
1918  sr->print(all_data[i].d(), fp);
1919  fprintf(fp,">)\n");
1920  }
1921  cdealloc(recvcnts);
1922  cdealloc(displs);
1923  cdealloc(idx_arr);
1924  if (pall_data != pmy_data) sr->pair_dealloc(pall_data);
1925  }
1926  if (pmy_data != NULL) sr->pair_dealloc(pmy_data);
1927 
1928  }
1929 
1930  void tensor::compare(const tensor * A, FILE * fp, char const * cutoff){
1931  int i, j;
1932  int my_sz;
1933  int64_t imy_sz, tot_sz =0, my_sz_B;
1934  int * recvcnts, * displs, * idx_arr;
1935  char * my_data_A;
1936  char * my_data_B;
1937  char * all_data_A;
1938  char * all_data_B;
1939  int64_t k;
1940 
1941  tensor * B = this;
1942 
1943  B->align(A);
1944 
1945  A->print_map(stdout, 1);
1946  B->print_map(stdout, 1);
1947 
1948  imy_sz = 0;
1949  A->read_local(&imy_sz, &my_data_A);
1950  my_sz = imy_sz;
1951  my_sz_B = 0;
1952  B->read_local(&my_sz_B, &my_data_B);
1953  assert(my_sz == my_sz_B);
1954 
1955  CommData const & global_comm = A->wrld->cdt;
1956 
1957  if (global_comm.rank == 0){
1958  alloc_ptr(global_comm.np*sizeof(int), (void**)&recvcnts);
1959  alloc_ptr(global_comm.np*sizeof(int), (void**)&displs);
1960  alloc_ptr(A->order*sizeof(int), (void**)&idx_arr);
1961  } else
1962  recvcnts = NULL;
1963 
1964 
1965  MPI_Gather(&my_sz, 1, MPI_INT, recvcnts, 1, MPI_INT, 0, global_comm.cm);
1966 
1967  if (global_comm.rank == 0){
1968  for (i=0; i<global_comm.np; i++){
1969  recvcnts[i] *= A->sr->pair_size();
1970  }
1971  displs[0] = 0;
1972  for (i=1; i<global_comm.np; i++){
1973  displs[i] = displs[i-1] + recvcnts[i-1];
1974  }
1975  tot_sz = (displs[global_comm.np-1]
1976  + recvcnts[global_comm.np-1])/A->sr->pair_size();
1977  all_data_A = A->sr->pair_alloc(tot_sz);
1978  all_data_B = B->sr->pair_alloc(tot_sz);
1979  } else {
1980  all_data_A = NULL;
1981  all_data_B = NULL;
1982  }
1983 
1984  if (my_sz == 0) my_data_A = my_data_B = NULL;
1985  MPI_Gatherv(my_data_A, my_sz*A->sr->pair_size(), MPI_CHAR,
1986  all_data_A, recvcnts, displs, MPI_CHAR, 0, global_comm.cm);
1987  MPI_Gatherv(my_data_B, my_sz*A->sr->pair_size(), MPI_CHAR,
1988  all_data_B, recvcnts, displs, MPI_CHAR, 0, global_comm.cm);
1989 
1990  PairIterator pall_data_A(A->sr, all_data_A);
1991  PairIterator pall_data_B(B->sr, all_data_B);
1992 
1993  if (global_comm.rank == 0){
1994  pall_data_A.sort(tot_sz);
1995  pall_data_B.sort(tot_sz);
1996  for (i=0; i<tot_sz; i++){
1997  char aA[A->sr->el_size];
1998  char aB[B->sr->el_size];
1999  A->sr->abs(pall_data_A[i].d(), aA);
2000  A->sr->min(aA, cutoff, aA);
2001  B->sr->abs(pall_data_B[i].d(), aB);
2002  B->sr->min(aB, cutoff, aB);
2003 
2004  if (A->sr->isequal(aA, cutoff) || B->sr->isequal(aB,cutoff)){
2005  k = pall_data_A[i].k();
2006  for (j=0; j<A->order; j++){
2007  idx_arr[j] = k%A->lens[j];
2008  k = k/A->lens[j];
2009  }
2010  for (j=0; j<A->order; j++){
2011  fprintf(fp,"[%d]",idx_arr[j]);
2012  }
2013  fprintf(fp," <");
2014  A->sr->print(pall_data_A[i].d(),fp);
2015  fprintf(fp,">,<");
2016  A->sr->print(pall_data_B[i].d(),fp);
2017  fprintf(fp,">\n");
2018  }
2019  }
2020  cdealloc(recvcnts);
2021  cdealloc(displs);
2022  cdealloc(idx_arr);
2023  sr->pair_dealloc(all_data_A);
2024  sr->pair_dealloc(all_data_B);
2025  }
2026 
2027  }
2028 
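  // unfold: undo a previous fold. For dense tensors the data is transposed back
  // via nosym_transpose; for sparse tensors the CSR/COO blocks are de-matricized
  // (if they were modified) and the folded representation is freed.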
2029  void tensor::unfold(bool was_mod){
2030  int i, j, allfold_dim;
2031  int * all_edge_len, * sub_edge_len;
2032  if (this->is_folded){
2033  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&all_edge_len);
2034  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&sub_edge_len);
2035  calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
2036  NULL, sub_edge_len, NULL);
2037  allfold_dim = 0;
2038  for (i=0; i<this->order; i++){
2039  if (this->sym[i] == NS){
2040  j=1;
2041  while (i-j >= 0 && this->sym[i-j] != NS) j++;
2042  all_edge_len[allfold_dim] = sy_packed_size(j, sub_edge_len+i-j+1,
2043  this->sym+i-j+1);
2044  allfold_dim++;
2045  }
2046  }
2047  if (!is_sparse){
2048  nosym_transpose(this, allfold_dim, all_edge_len, this->inner_ordering, 0);
2049  assert(!left_home_transp);
2050  } else {
2051  ASSERT(this->nrow_idx != -1);
2052  if (was_mod)
2053  despmatricize(this->nrow_idx, this->is_csr);
2054  cdealloc(this->rec_tsr->data);
2055  }
2056  CTF_int::cdealloc(all_edge_len);
2057  CTF_int::cdealloc(sub_edge_len);
2058  this->rec_tsr->is_data_aliased=1;
2059  delete this->rec_tsr;
2060  CTF_int::cdealloc(this->inner_ordering);
2061  }
2062  this->is_folded = 0;
2063  //maybe not necessary
2064  set_padding();
2065  }
2066 
2067  void tensor::remove_fold(){
2068  delete this->rec_tsr;
2069  CTF_int::cdealloc(this->inner_ordering);
2070  this->is_folded = 0;
2071  //maybe not necessary
2072  set_padding();
2073  }
2074 
2075  double tensor::est_time_unfold(){
2076  int i, j, allfold_dim;
2077  int * all_edge_len, * sub_edge_len;
2078  if (!this->is_folded) return 0.0;
2079  double est_time;
2080  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&all_edge_len);
2081  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&sub_edge_len);
2082  calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
2083  NULL, sub_edge_len, NULL);
2084  allfold_dim = 0;
2085  for (i=0; i<this->order; i++){
2086  if (this->sym[i] == NS){
2087  j=1;
2088  while (i-j >= 0 && this->sym[i-j] != NS) j++;
2089  all_edge_len[allfold_dim] = sy_packed_size(j, sub_edge_len+i-j+1,
2090  this->sym+i-j+1);
2091  allfold_dim++;
2092  }
2093  }
2094  est_time = this->calc_nvirt()*est_time_transp(allfold_dim, this->inner_ordering, all_edge_len, 0, sr);
2095  CTF_int::cdealloc(all_edge_len);
2096  CTF_int::cdealloc(sub_edge_len);
2097  return est_time;
2098  }
2099 
2100 
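  // fold: group the non-symmetric dimensions selected by fold_idx into an inner
  // (folded) tensor rec_tsr, recording the dimension ordering in inner_ordering
  // and returning the folded dimension count and edge lengths to the caller.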
2101  void tensor::fold(int nfold,
2102  int const * fold_idx,
2103  int const * idx_map,
2104  int * all_fdim,
2105  int ** all_flen){
2106  int i, j, k, fdim, allfold_dim, is_fold, fold_dim;
2107  int * sub_edge_len, * fold_edge_len, * all_edge_len, * dim_order;
2108  int * fold_sym;
2109  tensor * fold_tsr;
2110 
2111  if (this->is_folded != 0) this->unfold();
2112 
2113  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&sub_edge_len);
2114 
2115  allfold_dim = 0, fold_dim = 0;
2116  for (j=0; j<this->order; j++){
2117  if (this->sym[j] == NS){
2118  allfold_dim++;
2119  for (i=0; i<nfold; i++){
2120  if (fold_idx[i] == idx_map[j])
2121  fold_dim++;
2122  }
2123  }
2124  }
2125  CTF_int::alloc_ptr(allfold_dim*sizeof(int), (void**)&all_edge_len);
2126  CTF_int::alloc_ptr(allfold_dim*sizeof(int), (void**)&dim_order);
2127  CTF_int::alloc_ptr(fold_dim*sizeof(int), (void**)&fold_edge_len);
2128  CTF_int::alloc_ptr(fold_dim*sizeof(int), (void**)&fold_sym);
2129 
2130  calc_dim(this->order, this->size, this->pad_edge_len, this->edge_map,
2131  NULL, sub_edge_len, NULL);
2132 
2133  allfold_dim = 0, fdim = 0;
2134  for (j=0; j<this->order; j++){
2135  if (this->sym[j] == NS){
2136  k=1;
2137  while (j-k >= 0 && this->sym[j-k] != NS) k++;
2138  all_edge_len[allfold_dim] = sy_packed_size(k, sub_edge_len+j-k+1,
2139  this->sym+j-k+1);
2140  is_fold = 0;
2141  for (i=0; i<nfold; i++){
2142  if (fold_idx[i] == idx_map[j]){
2143  k=1;
2144  while (j-k >= 0 && this->sym[j-k] != NS) k++;
2145  fold_edge_len[fdim] = sy_packed_size(k, sub_edge_len+j-k+1,
2146  this->sym+j-k+1);
2147  is_fold = 1;
2148  }
2149  }
2150  if (is_fold) {
2151  dim_order[fdim] = allfold_dim;
2152  fdim++;
2153  } else
2154  dim_order[fold_dim+allfold_dim-fdim] = allfold_dim;
2155  allfold_dim++;
2156  }
2157  }
2158  std::fill(fold_sym, fold_sym+fold_dim, NS);
2159  fold_tsr = new tensor(sr, fold_dim, fold_edge_len, fold_sym, wrld, 0);
2160 
2161  this->is_folded = 1;
2162  this->rec_tsr = fold_tsr;
2163  this->inner_ordering = dim_order;
2164 // for (int h=0; h<allfold_dim; h++)
2165 
2166  *all_fdim = allfold_dim;
2167  *all_flen = all_edge_len;
2168 
2169  CTF_int::cdealloc(fold_edge_len);
2170  CTF_int::cdealloc(fold_sym);
2171 
2172  CTF_int::cdealloc(sub_edge_len);
2173 
2174  }
2175 
2176  void tensor::pull_alias(tensor const * other){
2177  if (other->is_data_aliased){
2178  this->topo = other->topo;
2179  copy_mapping(other->order, other->edge_map,
2180  this->edge_map);
2181  this->data = other->data;
2182  this->is_home = other->is_home;
2183  ASSERT(this->has_home == other->has_home);
2184  this->home_buffer = other->home_buffer;
2185  this->set_padding();
2186  }
2187  }
2188 
2189  void tensor::clear_mapping(){
2190  int j;
2191  mapping * map;
2192  for (j=0; j<this->order; j++){
2193  map = this->edge_map + j;
2194  map->clear();
2195  }
2196  this->topo = NULL;
2197  this->is_mapped = 0;
2198  this->is_folded = 0;
2199  }
2200 
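  // redistribute: move local data from an old distribution to the current
  // mapping, using block_reshuffle when the phases allow it, a sparse
  // read/write reshuffle for sparse tensors, and dgtog_reshuffle otherwise.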
2201  int tensor::redistribute(distribution const & old_dist,
2202  int const * old_offsets,
2203  int * const * old_permutation,
2204  int const * new_offsets,
2205  int * const * new_permutation){
2206 
2207  int can_block_shuffle;
2208  char * shuffled_data;
2209  #if VERIFY_REMAP
2210  char * shuffled_data_corr;
2211  #endif
2212 
2213  distribution new_dist = distribution(this);
2214  if (is_sparse) can_block_shuffle = 0;
2215  else {
2216  #ifdef USE_BLOCK_RESHUFFLE
2217  can_block_shuffle = can_block_reshuffle(this->order, old_dist.phase, this->edge_map);
2218  #else
2219  can_block_shuffle = 0;
2220  #endif
2221  if (old_offsets != NULL || old_permutation != NULL ||
2222  new_offsets != NULL || new_permutation != NULL){
2223  can_block_shuffle = 0;
2224  }
2225  }
2226 
2227  if (size > INT_MAX && !is_sparse && wrld->cdt.rank == 0)
2228  printf("CTF WARNING: Tensor %s is being redistributed to a mapping where its size is %ld, which is greater than INT_MAX=%d, so MPI could run into problems\n", name, size, INT_MAX);
2229 
2230  #ifdef HOME_CONTRACT
2231  if (this->is_home){
2232  if (wrld->cdt.rank == 0)
2233  DPRINTF(2,"Tensor %s leaving home %d\n", name, is_sparse);
2234  if (is_sparse){
2235  if (this->has_home){
2236  this->home_buffer = sr->pair_alloc(nnz_loc);
2237  sr->copy_pairs(this->home_buffer, this->data, nnz_loc);
2238  }
2239  this->is_home = 0;
2240  } else {
2241  this->data = sr->alloc(old_dist.size);
2242  sr->copy(this->data, this->home_buffer, old_dist.size);
2243  this->is_home = 0;
2244  }
2245  }
2246  #endif
2247  #ifdef PROF_REDIST
2248  if (this->profile) {
2249  char spf[80];
2250  strcpy(spf,"redistribute_");
2251  strcat(spf,this->name);
2252  if (wrld->cdt.rank == 0){
2253  Timer t_pf(spf);
2254  t_pf.start();
2255  }
2256  }
2257  #endif
2258  #if VERBOSE >=1
2259  if (wrld->cdt.rank == 0){
2260  if (can_block_shuffle) VPRINTF(1,"Remapping tensor %s via block_reshuffle to mapping\n",this->name);
2261  else if (is_sparse) VPRINTF(1,"Remapping tensor %s via sparse reshuffle to mapping\n",this->name);
2262  else VPRINTF(1,"Remapping tensor %s via cyclic_reshuffle to mapping\n",this->name);
2263  }
2264  this->print_map(stdout);
2265  #endif
2266 
2267 #if VERIFY_REMAP
2268  if (!is_sparse)
2269  padded_reshuffle(sym, old_dist, new_dist, this->data, &shuffled_data_corr, sr, wrld->cdt);
2270 #endif
2271 
2272  if (can_block_shuffle){
2273  block_reshuffle(old_dist, new_dist, this->data, shuffled_data, sr, wrld->cdt);
2274  sr->dealloc(this->data);
2275  } else {
2276  if (is_sparse){
2277  //padded_reshuffle(sym, old_dist, new_dist, this->data, &shuffled_data, sr, wrld->cdt);
2278 #ifdef TUNE
2279  // change-of-observe
2280  double nnz_frac_ = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np);
2281  double tps_[] = {0.0, 1.0, (double)log2(wrld->cdt.np), (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac_};
2282  if (!spredist_mdl.should_observe(tps_)) return SUCCESS;
2283 
2284  double st_time = MPI_Wtime();
2285 #endif
2286  char * old_data = this->data;
2287 
2288  this->data = NULL;
2289  int64_t old_nnz = nnz_loc;
2290  nnz_loc = 0;
2291  cdealloc(nnz_blk);
2292  nnz_blk = (int64_t*)alloc(sizeof(int64_t)*calc_nvirt());
2293  std::fill(nnz_blk, nnz_blk+calc_nvirt(), 0);
2294  this->write(old_nnz, sr->mulid(), sr->addid(), old_data);
2295  //this->set_new_nnz_glb(nnz_blk);
2296  shuffled_data = this->data;
2297  if (old_data != NULL){
2298  sr->pair_dealloc(old_data);
2299  }
2300 #ifdef TUNE
2301  double exe_time = MPI_Wtime()-st_time;
2302  double nnz_frac = ((double)nnz_tot)/(old_dist.size*wrld->cdt.np);
2303  double tps[] = {exe_time, 1.0, (double)log2(wrld->cdt.np), (double)std::max(old_dist.size, new_dist.size)*log2(wrld->cdt.np)*sr->el_size*nnz_frac};
2304  spredist_mdl.observe(tps);
2305 #endif
2306  } else
2307  dgtog_reshuffle(sym, lens, old_dist, new_dist, &this->data, &shuffled_data, sr, wrld->cdt);
2308  //glb_cyclic_reshuffle(sym, old_dist, old_offsets, old_permutation, new_dist, new_offsets, new_permutation, &this->data, &shuffled_data, sr, wrld->cdt, 1, sr->mulid(), sr->addid());
2309  //cyclic_reshuffle(sym, old_dist, old_offsets, old_permutation, new_dist, new_offsets, new_permutation, &this->data, &shuffled_data, sr, wrld->cdt, 1, sr->mulid(), sr->addid());
2310  //CTF_int::cdealloc((void*)this->data);
2311 // padded_reshuffle(sym, old_dist, new_dist, this->data, &shuffled_data, sr, wrld->cdt);
2312  // CTF_int::alloc_ptr(sizeof(dtype)*this->size, (void**)&shuffled_data);
2313  }
2314 
2315  this->data = shuffled_data;
2316 // zero_out_padding();
2317  #if VERIFY_REMAP
2318  if (!is_sparse && sr->addid() != NULL){
2319  bool abortt = false;
2320  for (int64_t j=0; j<this->size; j++){
2321  if (!sr->isequal(this->data+j*sr->el_size, shuffled_data_corr+j*sr->el_size)){
2322  printf("data element %ld/%ld not received correctly on process %d\n",
2323  j, this->size, wrld->cdt.rank);
2324  printf("element received was ");
2325  sr->print(this->data+j*sr->el_size);
2326  printf(", correct ");
2327  sr->print(shuffled_data_corr+j*sr->el_size);
2328  printf("\n");
2329  abortt = true;
2330  }
2331  }
2332  if (abortt) ABORT;
2333  sr->dealloc(shuffled_data_corr);
2334  }
2335 
2336  #endif
2337  #ifdef PROF_REDIST
2338  if (this->profile) {
2339  char spf[80];
2340  strcpy(spf,"redistribute_");
2341  strcat(spf,this->name);
2342  if (wrld->cdt.rank == 0){
2343  Timer t_pf(spf);
2344  t_pf.stop();
2345  }
2346  }
2347  #endif
2348 
2349  return SUCCESS;
2350 
2351  }
2352 
2353 
2354  double tensor::est_redist_time(distribution const & old_dist, double nnz_frac){
2355  int nvirt = (int64_t)calc_nvirt();
2356  bool can_blres;
2357  if (is_sparse) can_blres = 0;
2358  else {
2359  #ifdef USE_BLOCK_RESHUFFLE
2360  can_blres = can_block_reshuffle(this->order, old_dist.phase, this->edge_map);
2361  #else
2362  can_blres = 0;
2363  #endif
2364  }
2365  int old_nvirt = 1;
2366  for (int i=0; i<order; i++){
2367  old_nvirt *= old_dist.virt_phase[i];
2368  }
2369 
2370  double est_time = 0.0;
2371 
2372  if (can_blres){
2373  est_time += blres_est_time(std::max(old_dist.size,this->size)*this->sr->el_size*nnz_frac, nvirt, old_nvirt);
2374  } else {
2375  if (this->is_sparse)
2376  //est_time += 25.*COST_MEMBW*this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac+wrld->cdt.estimate_alltoall_time(1);
2377  est_time += spredist_est_time(this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac, wrld->cdt.np);
2378  else
2379  est_time += dgtog_est_time(this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac, wrld->cdt.np);
2380  }
2381 
2382  return est_time;
2383  }
2384 
2385  int64_t tensor::get_redist_mem(distribution const & old_dist, double nnz_frac){
2386  bool can_blres;
2387  if (is_sparse) can_blres = 0;
2388  else {
2389  #ifdef USE_BLOCK_RESHUFFLE
2390  can_blres = can_block_reshuffle(this->order, old_dist.phase, this->edge_map);
2391  #else
2392  can_blres = 0;
2393  #endif
2394  }
2395  if (can_blres)
2396  return (int64_t)this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac;
2397  else {
2398  if (is_sparse)
2399  return (int64_t)this->sr->pair_size()*std::max(this->size,old_dist.size)*nnz_frac*3;
2400  else
2401  return (int64_t)this->sr->el_size*std::max(this->size,old_dist.size)*nnz_frac*2.5;
2402  }
2403  }
2404 
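  // map_tensor_rem: map any still-unmapped tensor dimensions onto the physical
  // processor grid dimensions that are not yet used by this tensor's mapping.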
2405  int tensor::map_tensor_rem(int num_phys_dims,
2406  CommData * phys_comm,
2407  int fill){
2408  int i, num_sub_phys_dims, stat;
2409  int * restricted, * phys_mapped, * comm_idx;
2410  CommData * sub_phys_comm;
2411  mapping * map;
2412 
2413  CTF_int::alloc_ptr(this->order*sizeof(int), (void**)&restricted);
2414  CTF_int::alloc_ptr(num_phys_dims*sizeof(int), (void**)&phys_mapped);
2415 
2416  memset(phys_mapped, 0, num_phys_dims*sizeof(int));
2417 
2418  for (i=0; i<this->order; i++){
2419  restricted[i] = (this->edge_map[i].type != NOT_MAPPED);
2420  map = &this->edge_map[i];
2421  while (map->type == PHYSICAL_MAP){
2422  phys_mapped[map->cdt] = 1;
2423  if (map->has_child) map = map->child;
2424  else break;
2425  }
2426  }
2427 
2428  num_sub_phys_dims = 0;
2429  for (i=0; i<num_phys_dims; i++){
2430  if (phys_mapped[i] == 0){
2431  num_sub_phys_dims++;
2432  }
2433  }
2434  CTF_int::alloc_ptr(num_sub_phys_dims*sizeof(CommData), (void**)&sub_phys_comm);
2435  CTF_int::alloc_ptr(num_sub_phys_dims*sizeof(int), (void**)&comm_idx);
2436  num_sub_phys_dims = 0;
2437  for (i=0; i<num_phys_dims; i++){
2438  if (phys_mapped[i] == 0){
2439  sub_phys_comm[num_sub_phys_dims] = phys_comm[i];
2440  comm_idx[num_sub_phys_dims] = i;
2441  num_sub_phys_dims++;
2442  }
2443  }
2444  stat = map_tensor(num_sub_phys_dims, this->order,
2445  this->pad_edge_len, this->sym_table,
2446  restricted, sub_phys_comm,
2447  comm_idx, fill,
2448  this->edge_map);
2449  CTF_int::cdealloc(restricted);
2450  CTF_int::cdealloc(phys_mapped);
2451  CTF_int::cdealloc(sub_phys_comm);
2452  CTF_int::cdealloc(comm_idx);
2453  return stat;
2454  }
2455 
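  // extract_diag: when idx_map repeats an index (e.g. A["ii"]), read or write
  // the corresponding diagonal as a tensor of one lower order, via key
  // filtering for sparse data or a summation for dense data.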
2456  int tensor::extract_diag(int const * idx_map,
2457  int rw,
2458  tensor *& new_tsr,
2459  int ** idx_map_new){
2460  int i, j, k, * edge_len, * nsym, * ex_idx_map, * diag_idx_map;
2461  for (i=0; i<this->order; i++){
2462  for (j=i+1; j<this->order; j++){
2463  if (idx_map[i] == idx_map[j]){
2464  CTF_int::alloc_ptr(sizeof(int)*this->order-1, (void**)&edge_len);
2465  CTF_int::alloc_ptr(sizeof(int)*this->order-1, (void**)&nsym);
2466  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)idx_map_new);
2467  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&ex_idx_map);
2468  CTF_int::alloc_ptr(sizeof(int)*this->order-1, (void**)&diag_idx_map);
2469  for (k=0; k<this->order; k++){
2470  if (k<j){
2471  ex_idx_map[k] = k;
2472  diag_idx_map[k] = k;
2473  edge_len[k] = this->pad_edge_len[k]-this->padding[k];
2474  (*idx_map_new)[k] = idx_map[k];
2475  if (k==j-1){
2476  nsym[k] = NS;
2477  if (this->sym[k] == this->sym[j]) nsym[k] = this->sym[k];
2478  } else
2479  nsym[k] = this->sym[k];
2480  } else if (k>j) {
2481  ex_idx_map[k] = k-1;
2482  diag_idx_map[k-1] = k-1;
2483  edge_len[k-1] = this->pad_edge_len[k]-this->padding[k];
2484  nsym[k-1] = this->sym[k];
2485  (*idx_map_new)[k-1] = idx_map[k];
2486  } else {
2487  ex_idx_map[k] = i;
2488  }
2489  }
2490  if (is_sparse){
2491  int64_t lda_i=1, lda_j=1;
2492  for (int ii=0; ii<i; ii++){
2493  lda_i *= lens[ii];
2494  }
2495  for (int jj=0; jj<j; jj++){
2496  lda_j *= lens[jj];
2497  }
2498  if (rw){
2499  PairIterator pi(sr, data);
2500  new_tsr = new tensor(sr, this->order-1, edge_len, nsym, wrld, 1, name, 1, is_sparse);
2501  int64_t nw = 0;
2502  for (int p=0; p<nnz_loc; p++){
2503  int64_t k = pi[p].k();
2504  if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]) nw++;
2505  }
2506  char * pwdata = sr->pair_alloc(nw);
2507  PairIterator wdata(sr, pwdata);
2508  nw=0;
2509 #ifdef USE_OMP
2510 // #pragma omp parallel for
2511 #endif
2512  for (int p=0; p<nnz_loc; p++){
2513  int64_t k = pi[p].k();
2514  if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]){
2515  int64_t k_new = (k%lda_j)+(k/(lda_j*lens[j])*lda_j);
2516  //printf("p = %d k = %ld lda_j = %ld lens[j] = %d k_new = %ld\n",p, k, lda_j, lens[j], k_new);
2517  ((int64_t*)(wdata[nw].ptr))[0] = k_new;
2518  wdata[nw].write_val(pi[p].d());
2519  nw++;
2520  }
2521  }
2522  new_tsr->write(nw, sr->mulid(), sr->addid(), pwdata);
2523  sr->pair_dealloc(pwdata);
2524  } else {
2525  char * pwdata;
2526  int64_t nw;
2527  new_tsr->read_local_nnz(&nw, &pwdata);
2528  PairIterator wdata(sr, pwdata);
2529 #ifdef USE_OMP
2530  #pragma omp parallel for
2531 #endif
2532  for (int p=0; p<nw; p++){
2533  int64_t k = wdata[p].k();
2534  int64_t kpart = (k/lda_i)%lens[i];
2535  int64_t k_new = (k%lda_j)+((k/lda_j)*lens[j]+kpart)*lda_j;
2536  ((int64_t*)(wdata[p].ptr))[0] = k_new;
2537 // printf("k = %ld, k_new = %ld lda_i = %ld lda_j = %ld lens[0] = %d lens[1] = %d\n", k,k_new,lda_i,lda_j,lens[0],lens[1]);
2538  }
2539  PairIterator pi(sr, this->data);
2540  for (int p=0; p<nnz_loc; p++){
2541  int64_t k = pi[p].k();
2542  if ((k/lda_i)%lens[i] == (k/lda_j)%lens[j]){
2543  pi[p].write_val(sr->addid());
2544  }
2545  }
2546 
2547  this->write(nw, NULL, NULL, pwdata);
2548  sr->pair_dealloc(pwdata);
2549  }
2550  } else {
2551  if (rw){
2552 
2553  new_tsr = new tensor(sr, this->order-1, edge_len, nsym, wrld, 1, name, 1, is_sparse);
2554  summation sum = summation(this, ex_idx_map, sr->mulid(), new_tsr, diag_idx_map, sr->addid());
2555  sum.execute(1);
2556  } else {
2557  summation sum = summation(new_tsr, diag_idx_map, sr->mulid(), this, ex_idx_map, sr->addid());
2558  sum.execute(1);
2559  }
2560  }
2561  CTF_int::cdealloc(edge_len), CTF_int::cdealloc(nsym), CTF_int::cdealloc(ex_idx_map), CTF_int::cdealloc(diag_idx_map);
2562  return SUCCESS;
2563  }
2564  }
2565  }
2566  return NEGATIVE;
2567  }
2568 
2569 
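  // zero_out_padding: set every element that lies in the padded region of the
  // local (dense, mapped) data to the additive identity, using the per-dimension
  // phases and physical ranks to locate the padding.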
2570  int tensor::zero_out_padding(){
2571  int i, num_virt, idx_lyr;
2572  int64_t np;
2573  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
2574  mapping * map;
2575 
2576  TAU_FSTART(zero_out_padding);
2577 
2578  if (this->has_zero_edge_len || is_sparse){
2579  return SUCCESS;
2580  }
2581  this->unfold();
2582  this->set_padding();
2583 
2584  if (!this->is_mapped || sr->addid() == NULL){
2585  TAU_FSTOP(zero_out_padding);
2586  return SUCCESS;
2587  } else {
2588  np = this->size;
2589 
2590  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&virt_phase);
2591  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phys_phase);
2592  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phase);
2593  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&virt_phys_rank);
2594 
2595 
2596  num_virt = 1;
2597  idx_lyr = wrld->rank;
2598  for (i=0; i<this->order; i++){
2599  /* Calculate rank and phase arrays */
2600  map = this->edge_map + i;
2601  phase[i] = map->calc_phase();
2602  phys_phase[i] = map->calc_phys_phase();
2603  virt_phase[i] = phase[i]/phys_phase[i];
2604  virt_phys_rank[i] = map->calc_phys_rank(topo);
2605  num_virt = num_virt*virt_phase[i];
2606 
2607  if (map->type == PHYSICAL_MAP)
2608  idx_lyr -= topo->lda[map->cdt]
2609  *virt_phys_rank[i];
2610  }
2611  if (idx_lyr == 0){
2612  zero_padding(this->order, np, num_virt,
2613  this->pad_edge_len, this->sym, this->padding,
2614  phase, phys_phase, virt_phase, virt_phys_rank, this->data, sr);
2615  } else {
2616  std::fill(this->data, this->data+np, 0.0);
2617  }
2618  CTF_int::cdealloc(virt_phase);
2619  CTF_int::cdealloc(phys_phase);
2620  CTF_int::cdealloc(phase);
2621  CTF_int::cdealloc(virt_phys_rank);
2622  }
2623  TAU_FSTOP(zero_out_padding);
2624 
2625  return SUCCESS;
2626 
2627  }
2628 
2629  void tensor::scale_diagonals(int const * sym_mask){
2630  int i, num_virt, idx_lyr;
2631  int64_t np;
2632  int * virt_phase, * virt_phys_rank, * phys_phase, * phase;
2633  mapping * map;
2634 
2635  TAU_FSTART(scale_diagonals);
2636 
2637  this->unfold();
2638  this->set_padding();
2639 
2640 
2641  if (!this->is_mapped){
2642  ASSERT(0);
2643  } else {
2644  np = this->size;
2645 
2646  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&virt_phase);
2647  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phys_phase);
2648  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&phase);
2649  CTF_int::alloc_ptr(sizeof(int)*this->order, (void**)&virt_phys_rank);
2650 
2651 
2652  num_virt = 1;
2653  idx_lyr = wrld->rank;
2654  for (i=0; i<this->order; i++){
2655  /* Calculate rank and phase arrays */
2656  map = this->edge_map + i;
2657  phase[i] = map->calc_phase();
2658  phys_phase[i] = map->calc_phys_phase();
2659  virt_phase[i] = phase[i]/phys_phase[i];
2660  virt_phys_rank[i] = map->calc_phys_rank(topo);
2661  num_virt = num_virt*virt_phase[i];
2662 
2663  if (map->type == PHYSICAL_MAP)
2664  idx_lyr -= topo->lda[map->cdt]
2665  *virt_phys_rank[i];
2666  }
2667  if (idx_lyr == 0){
2668  scal_diag(this->order, np, num_virt,
2669  this->pad_edge_len, this->sym, this->padding,
2670  phase, phys_phase, virt_phase, virt_phys_rank, this->data, sr, sym_mask);
2671  } /*else {
2672  std::fill(this->data, this->data+np, 0.0);
2673  }*/
2674  CTF_int::cdealloc(virt_phase);
2675  CTF_int::cdealloc(phys_phase);
2676  CTF_int::cdealloc(phase);
2677  CTF_int::cdealloc(virt_phys_rank);
2678  }
2679  TAU_FSTOP(scale_diagonals);
2680  }
2681 
2682  void tensor::addinv(){
2683  if (is_sparse){
2684  PairIterator pi(sr,data);
2685 #ifdef USE_OMP
2686  #pragma omp parallel for
2687 #endif
2688  for (int64_t i=0; i<nnz_loc; i++){
2689  sr->addinv(pi[i].d(), pi[i].d());
2690  }
2691  } else {
2692 #ifdef USE_OMP
2693  #pragma omp parallel for
2694 #endif
2695  for (int64_t i=0; i<size; i++){
2696  sr->addinv(data+i*sr->el_size,data+i*sr->el_size);
2697  }
2698  }
2699  }
2700 
2701  void tensor::set_sym(int const * sym_){
2702  if (sym_ == NULL)
2703  std::fill(this->sym, this->sym+order, NS);
2704  else
2705  memcpy(this->sym, sym_, order*sizeof(int));
2706 
2707  memset(sym_table, 0, order*order*sizeof(int));
2708  for (int i=0; i<order; i++){
2709  if (this->sym[i] != NS) {
2710  sym_table[(i+1)+i*order] = 1;
2711  sym_table[(i+1)*order+i] = 1;
2712  }
2713  }
2714  }
2715 
2716  void tensor::set_new_nnz_glb(int64_t const * nnz_blk_){
2717  if (is_sparse){
2718  nnz_loc = 0;
2719  for (int i=0; i<calc_nvirt(); i++){
2720  nnz_blk[i] = nnz_blk_[i];
2721  nnz_loc += nnz_blk[i];
2722  }
2723  wrld->cdt.allred(&nnz_loc, &nnz_tot, 1, MPI_INT64_T, MPI_SUM);
2724  // printf("New nnz loc = %ld tot = %ld\n", nnz_loc, nnz_tot);
2725  }
2726  }
2727 
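  // spmatricize: convert each locally owned virtual block of sparse pairs into
  // a COO or CSR matrix of shape m x n, stored contiguously in rec_tsr->data.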
2728  void tensor::spmatricize(int m, int n, int nrow_idx, bool csr){
2729  ASSERT(is_sparse);
2730 
2731 #ifdef PROFILE
2732  MPI_Barrier(this->wrld->comm);
2733  TAU_FSTART(sparse_transpose);
2734 // double t_st = MPI_Wtime();
2735 #endif
2736  int64_t new_sz_A = 0;
2737  this->rec_tsr->is_sparse = 1;
2738  int nvirt_A = calc_nvirt();
2739  this->rec_tsr->nnz_blk = (int64_t*)alloc(nvirt_A*sizeof(int64_t));
2740  for (int i=0; i<nvirt_A; i++){
2741  if (csr)
2742  this->rec_tsr->nnz_blk[i] = get_csr_size(this->nnz_blk[i], m, this->sr->el_size);
2743  else
2744  this->rec_tsr->nnz_blk[i] = get_coo_size(this->nnz_blk[i], this->sr->el_size);
2745  new_sz_A += this->rec_tsr->nnz_blk[i];
2746  }
2747  CTF_int::alloc_ptr(new_sz_A, (void**)&this->rec_tsr->data);
2748  this->rec_tsr->is_data_aliased = false;
2749  int phase[this->order];
2750  for (int i=0; i<this->order; i++){
2751  phase[i] = this->edge_map[i].calc_phase();
2752  }
2753  char * data_ptr_out = this->rec_tsr->data;
2754  char const * data_ptr_in = this->data;
2755  for (int i=0; i<nvirt_A; i++){
2756  if (csr){
2757  COO_Matrix cm(this->nnz_blk[i], this->sr);
2758  cm.set_data(this->nnz_blk[i], this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_in, this->sr, phase);
2759  CSR_Matrix cs(cm, m, n, this->sr, data_ptr_out);
2760  cdealloc(cm.all_data);
2761  } else {
2762  COO_Matrix cm(data_ptr_out);
2763  cm.set_data(this->nnz_blk[i], this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_in, this->sr, phase);
2764  }
2765  data_ptr_in += this->nnz_blk[i]*this->sr->pair_size();
2766  data_ptr_out += this->rec_tsr->nnz_blk[i];
2767  }
2768  this->is_csr = csr;
2769  this->nrow_idx = nrow_idx;
2770 #ifdef PROFILE
2771 // double t_end = MPI_Wtime();
2772  MPI_Barrier(this->wrld->comm);
2773  TAU_FSTOP(sparse_transpose);
2774  /*int64_t max_nnz, avg_nnz;
2775  double max_time, avg_time;
2776  max_nnz = this->nnz_loc;
2777  MPI_Allreduce(MPI_IN_PLACE, &max_nnz, 1, MPI_INT64_T, MPI_MAX, this->wrld->comm);
2778  avg_nnz = (this->nnz_loc+this->wrld->np/2)/this->wrld->np;
2779  MPI_Allreduce(MPI_IN_PLACE, &avg_nnz, 1, MPI_INT64_T, MPI_SUM, this->wrld->comm);
2780  max_time = t_end-t_st;
2781  MPI_Allreduce(MPI_IN_PLACE, &max_time, 1, MPI_DOUBLE, MPI_MAX, this->wrld->comm);
2782  avg_time = (t_end-t_st)/this->wrld->np;
2783  MPI_Allreduce(MPI_IN_PLACE, &avg_time, 1, MPI_DOUBLE, MPI_SUM, this->wrld->comm);
2784  if (this->wrld->rank == 0){
2785  printf("avg_nnz = %ld max_nnz = %ld, avg_time = %lf max_time = %lf\n", avg_nnz, max_nnz, avg_time, max_time);
2786  }*/
2787 #endif
2788  }
2789 
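  // despmatricize: inverse of spmatricize; rebuild (key,value) pairs from the
  // COO/CSR blocks in rec_tsr->data and update the per-block nonzero counts.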
2790  void tensor::despmatricize(int nrow_idx, bool csr){
2791  ASSERT(is_sparse);
2792 
2793 #ifdef PROFILE
2794  MPI_Barrier(this->wrld->comm);
2795  TAU_FSTART(sparse_transpose);
2796 // double t_st = MPI_Wtime();
2797 #endif
2798  int64_t offset = 0;
2799  int64_t new_sz = 0;
2800  this->rec_tsr->is_sparse = 1;
2801  int nvirt = calc_nvirt();
2802  for (int i=0; i<nvirt; i++){
2803  if (this->rec_tsr->nnz_blk[i]>0){
2804  if (csr){
2805  CSR_Matrix cA(this->rec_tsr->data+offset);
2806  new_sz += cA.nnz();
2807  } else {
2808  COO_Matrix cA(this->rec_tsr->data+offset);
2809  new_sz += cA.nnz();
2810  }
2811  }
2812  offset += this->rec_tsr->nnz_blk[i];
2813  }
2814  this->data = sr->pair_alloc(new_sz);
2815  int phase[this->order];
2816  int phys_phase[this->order];
2817  int phase_rank[this->order];
2818  for (int i=0; i<this->order; i++){
2819  phase[i] = this->edge_map[i].calc_phase();
2820  phys_phase[i] = this->edge_map[i].calc_phys_phase();
2821  phase_rank[i] = this->edge_map[i].calc_phys_rank(this->topo);
2822  }
2823  char * data_ptr_out = this->data;
2824  char const * data_ptr_in = this->rec_tsr->data;
2825  for (int i=0; i<nvirt; i++){
2826  if (this->rec_tsr->nnz_blk[i]>0){
2827  if (csr){
2828  CSR_Matrix cs((char*)data_ptr_in);
2829  COO_Matrix cm(cs, this->sr);
2830  cm.get_data(cs.nnz(), this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_out, this->sr, phase, phase_rank);
2831  this->nnz_blk[i] = cm.nnz();
2832  cdealloc(cm.all_data);
2833  } else {
2834  COO_Matrix cm((char*)data_ptr_in);
2835  cm.get_data(cm.nnz(), this->order, this->lens, this->inner_ordering, nrow_idx, data_ptr_out, this->sr, phase, phase_rank);
2836  this->nnz_blk[i] = cm.nnz();
2837  }
2838  } else this->nnz_blk[i] = 0;
2839  data_ptr_out += this->nnz_blk[i]*this->sr->pair_size();
2840  data_ptr_in += this->rec_tsr->nnz_blk[i];
2841  if (i<nvirt-1){
2842  int j=0;
2843  bool cont = true;
2844  while (cont){
2845  phase_rank[j]+=phys_phase[j];
2846  if (phase_rank[j]>=phase[j])
2847  phase_rank[j]=phase_rank[j]%phase[j];
2848  else cont = false;
2849  j++;
2850  }
2851  }
2852  }
2853  set_new_nnz_glb(this->nnz_blk);
2854  this->rec_tsr->is_csr = csr;
2855 
2856 #ifdef PROFILE
2857 // double t_end = MPI_Wtime();
2858  MPI_Barrier(this->wrld->comm);
2859  TAU_FSTOP(sparse_transpose);
2860 #endif
2861  }
2862 
2863  void tensor::leave_home_with_buffer(){
2864 #ifdef HOME_CONTRACT
2865  if (this->has_home){
2866  if (!this->is_home){
2867  if (is_sparse){
2868  sr->pair_dealloc(this->home_buffer);
2869  this->home_buffer = NULL;
2870  } else {
2871  sr->dealloc(this->home_buffer);
2872  this->home_buffer = this->data;
2873  }
2874  }
2875  if (wrld->rank == 0) DPRINTF(2,"Deleting home (leave) of %s\n",name);
2876  deregister_size();
2877  }
2878  this->is_home = 0;
2879  this->has_home = 0;
2880 #endif
2881  }
2882 
2883  void tensor::register_size(int64_t sz){
2884  deregister_size();
2885  registered_alloc_size = sz;
2886  inc_tot_mem_used(registered_alloc_size);
2887  }
2888 
2889  void tensor::deregister_size(){
2890  inc_tot_mem_used(-registered_alloc_size);
2891  registered_alloc_size = 0;
2892  }
2893 
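  // write_dense_to_file: write the tensor to a binary file as a dense array in
  // global element order. Symmetric or sparse tensors are first copied into a
  // nonsymmetric dense temporary; each rank then reads its contiguous chunk of
  // elements and writes it at the matching file offset with MPI_File_write_at.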
2894  void tensor::write_dense_to_file(MPI_File & file, int64_t offset){
2895  bool need_unpack = is_sparse;
2896  for (int i=0; i<order; i++){
2897  if (sym[i] != NS) need_unpack = true;
2898  }
2899  if (need_unpack){
2900  int nsym[order];
2901  std::fill(nsym, nsym+order, NS);
2902  tensor t_dns(sr, order, lens, nsym, wrld);
2903  t_dns["ij"] = (*this)["ij"];
2904  t_dns.write_dense_to_file(file);
2905  } else {
2906  int64_t tot_els = packed_size(order, lens, sym);
2907  int64_t chnk_sz = tot_els/wrld->np;
2908  int64_t my_chnk_sz = chnk_sz;
2909  if (wrld->rank < tot_els%wrld->np) my_chnk_sz++;
2910  int64_t my_chnk_st = chnk_sz*wrld->rank + std::min((int64_t)wrld->rank, tot_els%wrld->np);
2911 
2912  char * my_pairs = (char*)alloc(sr->pair_size()*my_chnk_sz);
2913  PairIterator pi(sr, my_pairs);
2914 
2915  for (int64_t i=0; i<my_chnk_sz; i++){
2916  pi[i].write_key(my_chnk_st+i);
2917  }
2918 
2919  this->read(my_chnk_sz, my_pairs);
2920  for (int64_t i=0; i<my_chnk_sz; i++){
2921  char val[sr->el_size];
2922  pi[i].read_val(val);
2923  memcpy(my_pairs+i*sr->el_size, val, sr->el_size);
2924  }
2925 
2926  MPI_Status stat;
2927  MPI_Offset off = my_chnk_st*sr->el_size+offset;
2928  MPI_File_write_at(file, off, my_pairs, my_chnk_sz, sr->mdtype(), &stat);
2929  cdealloc(my_pairs);
2930  }
2931  }
2932 
2933  void tensor::write_dense_to_file(char const * filename){
2934 
2935  MPI_File file;
2936  MPI_File_open(this->wrld->comm, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &file);
2937  this->write_dense_to_file(file, 0);
2938  MPI_File_close(&file);
2939  }
2940 
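  // read_dense_from_file: inverse of write_dense_to_file; each rank reads its
  // contiguous chunk of dense values, forms (key,value) pairs for it, and
  // writes them into the (possibly symmetric or sparse) tensor.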
2941  void tensor::read_dense_from_file(MPI_File & file, int64_t offset){
2942  bool need_unpack = is_sparse;
2943  for (int i=0; i<order; i++){
2944  if (sym[i] != NS) need_unpack = true;
2945  }
2946  if (need_unpack){
2947  int nsym[order];
2948  std::fill(nsym, nsym+order, NS);
2949  tensor t_dns(sr, order, lens, nsym, wrld);
2950  t_dns.read_dense_from_file(file);
2951  summation ts(&t_dns, "ij", sr->mulid(), this, "ij", sr->addid());
2952  ts.sum_tensors(true); //does not symmetrize
2953 // this->["ij"] = t_dns["ij"];
2954  if (is_sparse) this->sparsify();
2955  } else {
2956  int64_t tot_els = packed_size(order, lens, sym);
2957  int64_t chnk_sz = tot_els/wrld->np;
2958  int64_t my_chnk_sz = chnk_sz;
2959  if (wrld->rank < tot_els%wrld->np) my_chnk_sz++;
2960  int64_t my_chnk_st = chnk_sz*wrld->rank + std::min((int64_t)wrld->rank, tot_els%wrld->np);
2961 
2962  char * my_pairs = (char*)alloc(sr->pair_size()*my_chnk_sz);
2963  //use latter part of buffer for the pure dense data, so that we do not need another buffer when forming pairs
2964  char * my_pairs_tail = my_pairs + sizeof(int64_t)*my_chnk_sz;
2965  MPI_Status stat;
2966  MPI_Offset off = my_chnk_st*sr->el_size+offset;
2967  MPI_File_read_at(file, off, my_pairs_tail, my_chnk_sz, sr->mdtype(), &stat);
2968 
2969  PairIterator pi(sr, my_pairs);
2970  for (int64_t i=0; i<my_chnk_sz; i++){
2971  char val[sr->el_size];
2972  memcpy(val, my_pairs_tail+i*sr->el_size, sr->el_size);
2973  pi[i].write_key(my_chnk_st+i);
2974  pi[i].write_val(val);
2975  }
2976 
2977  this->write(my_chnk_sz, sr->mulid(), sr->addid(), my_pairs);
2978  cdealloc(my_pairs);
2979  }
2980  }
2981 
2982  void tensor::read_dense_from_file(char const * filename){
2983  MPI_File file;
2984  MPI_File_open(this->wrld->comm, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, &file);
2985  this->read_dense_from_file(file, 0);
2986  MPI_File_close(&file);
2987 
2988  }
2989 
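  // self_reduce: if this operand carries an index that appears nowhere else in
  // the contraction/summation, sum that index away into a new tensor of one
  // lower order (renumbering indices as needed) and return it; otherwise return
  // this tensor unchanged.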
2990  tensor * tensor::self_reduce(int const * idx_A,
2991  int ** new_idx_A,
2992  int order_B,
2993  int const * idx_B,
2994  int ** new_idx_B,
2995  int order_C,
2996  int const * idx_C,
2997  int ** new_idx_C){
2998  //check first that we are not already effectively doing a summation for a self_reduce, to ensure that there is no infinite recursion
2999  if (order_C == 0 && this->order == order_B + 1){
3000  bool all_match_except_one = true;
3001  bool one_skip = false;
3002  int iiA=0,iiB=0;
3003  while (iiA < this->order){
3004  if (iiB >= order_B || idx_A[iiA] != idx_B[iiB]){
3005  if (one_skip) all_match_except_one = false;
3006  else one_skip = true;
3007  iiA++;
3008  } else {
3009  iiA++;
3010  iiB++;
3011  }
3012  }
3013  if (all_match_except_one && one_skip) return this;
3014  }
3015 
3016  //look for unmatched indices
3017  for (int i=0; i<this->order; i++){
3018  int iA = idx_A[i];
3019  bool has_match = false;
3020  for (int j=0; j<this->order; j++){
3021  if (j != i && idx_A[j] == iA) has_match = true;
3022  }
3023  for (int j=0; j<order_B; j++){
3024  if (idx_B[j] == iA) has_match = true;
3025  }
3026  for (int j=0; j<order_C; j++){
3027  if (idx_C[j] == iA) has_match = true;
3028  }
3029  //reduce/contract any unmatched index
3030  if (!has_match){
3031  int new_len[this->order-1];
3032  int new_sym[this->order-1];
3033  int sum_A_idx[this->order];
3034  int sum_B_idx[this->order-1];
3035  *new_idx_A = (int*)CTF_int::alloc(sizeof(int)*(this->order-1));
3036  *new_idx_B = (int*)CTF_int::alloc(sizeof(int)*(order_B));
3037  if (order_C > 0)
3038  *new_idx_C = (int*)CTF_int::alloc(sizeof(int)*(order_C));
3039  int max_idx = 0;
3040  //determine new symmetry and edge lengths
3041  for (int j=0; j<this->order; j++){
3042  max_idx = std::max(max_idx, idx_A[j]);
3043  sum_A_idx[j] = j;
3044  if (j==i) continue;
3045  if (j<i){
3046  new_len[j] = this->lens[j];
3047  (*new_idx_A)[j] = idx_A[j];
3048  new_sym[j] = this->sym[j];
3049  if (j == i-1){
3050  if (this->sym[i] == NS) new_sym[j] = NS;
3051  }
3052  sum_A_idx[j] = j;
3053  sum_B_idx[j] = j;
3054  } else {
3055  new_len[j-1] = this->lens[j];
3056  new_sym[j-1] = this->sym[j];
3057  (*new_idx_A)[j-1] = idx_A[j];
3058  sum_A_idx[j] = j;
3059  sum_B_idx[j-1] = j;
3060  }
3061  }
3062  //determine maximum index
3063  for (int j=0; j<this->order; j++){
3064  max_idx = std::max(max_idx, idx_A[j]);
3065  }
3066  for (int j=0; j<order_B; j++){
3067  (*new_idx_B)[j] = idx_B[j];
3068  max_idx = std::max(max_idx, idx_B[j]);
3069  }
3070  for (int j=0; j<order_C; j++){
3071  (*new_idx_C)[j] = idx_C[j];
3072  max_idx = std::max(max_idx, idx_C[j]);
3073  }
3074  //adjust indices by rotating maximum index with removed index, so that indices range from 0 to num_indices-1
3075  if (iA != max_idx){
3076  for (int j=0; j<this->order-1; j++){
3077  if ((*new_idx_A)[j] == max_idx)
3078  (*new_idx_A)[j] = iA;
3079  }
3080  for (int j=0; j<order_B; j++){
3081  if ((*new_idx_B)[j] == max_idx)
3082  (*new_idx_B)[j] = iA;
3083  }
3084  for (int j=0; j<order_C; j++){
3085  if ((*new_idx_C)[j] == max_idx)
3086  (*new_idx_C)[j] = iA;
3087  }
3088  }
3089  //run summation to reduce index
3090  tensor * new_tsr = new tensor(this->sr, this->order-1, new_len, new_sym, this->wrld, 1, this->name, 1, this->is_sparse);
3091  summation s(this, sum_A_idx, this->sr->mulid(), new_tsr, sum_B_idx, this->sr->mulid());
3092  s.execute();
3093  return new_tsr;
3094  }
3095  }
3096  return this;
3097  }
3098 }
void permute(int order, int const *perm, int *arr)
permute an array
Definition: util.cxx:205
void read(int64_t num_pair, char const *alpha, char const *beta, int64_t const *inds, char *data)
read tensor data with <key, value> pairs where key is the global index for the value, which gets filled in with beta times the old values plus alpha times the values read from the tensor.
void write(char const *buf, int64_t n=1)
sets internal pairs to provided data
Definition: algstrct.cxx:805
int calc_phys_rank(topology const *topo) const
compute the physical rank of a mapping
Definition: mapping.cxx:74
void inc_tot_mem_used(int64_t a)
Definition: memcontrol.cxx:80
int map_tensor(int num_phys_dims, int tsr_order, int const *tsr_edge_len, int const *tsr_sym_table, int *restricted, CommData *phys_comm, int const *comm_idx, int fill, mapping *tsr_edge_map)
map a tensor
Definition: mapping.cxx:244
double spredist_est_time(int64_t size, int np)
void zero_padding(int order, int64_t size, int nvirt, int const *edge_len, int const *sym, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int const *cphase_rank, char *vdata, algstrct const *sr)
sets to zero all values in padded region of tensor
Definition: pad.cxx:374
void write_key(int64_t key)
sets key of head pair to key
Definition: algstrct.cxx:821
char * home_buffer
buffer associated with home mapping of tensor, to which it is returned
CTF_int::CommData cdt
communicator data for MPI comm defining this world
Definition: world.h:32
map_type type
Definition: mapping.h:22
def sum(tensor, init_A, axis=None, dtype=None, out=None, keepdims=None)
Definition: core.pyx:4261
bool is_home
whether the latest tensor data is in the home buffer
int64_t * nnz_blk
nonzero elements in each block owned locally
int * sym
symmetries among tensor dimensions
void execute()
run contraction
Definition: contraction.cxx:99
virtual algstrct * clone() const =0
''copy constructor''
void slice(int const *offsets_B, int const *ends_B, char const *beta, tensor *A, int const *offsets_A, int const *ends_A, char const *alpha)
accumulates out a slice (block) of this tensor = B B[offsets,ends)=beta*B[offsets,ends) + alpha*A[offsets_A,ends_A)
#define DPRINTF(...)
Definition: util.h:235
virtual int pair_size() const
gets pair size el_size plus the key size
Definition: algstrct.h:46
int calc_phase() const
compute the phase of a mapping
Definition: mapping.cxx:39
int64_t get_coo_size(int64_t nnz, int val_size)
Definition: coo.cxx:7
virtual char * pair_alloc(int64_t n) const
allocate space for n (int64_t,dtype) pairs, necessary for object types
Definition: algstrct.cxx:681
double dgtog_est_time(int64_t tot_sz, int np)
estimates execution time, given this processor sends and receives tot_sz across np procs ...
void calc_dim(int order, int64_t size, int const *edge_len, mapping const *edge_map, int64_t *vrt_sz, int *vrt_edge_len, int *blk_edge_len)
calculate the block-sizes of a tensor
void execute(bool run_diag=false)
run summation
Definition: summation.cxx:119
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
Definition: algstrct.cxx:340
int * pad_edge_len
padded tensor edge lengths
void serialize(char **buffer, int *size)
serialize object into contiguous data buffer
int * inner_ordering
ordering of the dimensions according to which the tensor is folded
int calc_phys_phase() const
compute the physical phase of a mapping
Definition: mapping.cxx:57
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm
Definition: common.cxx:360
void read_loc_pairs(int order, int64_t nval, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *phase_rank, int64_t *nread, char const *data, char **pairs, algstrct const *sr)
read tensor pairs local to processor
Definition: sparse_rw.cxx:1235
int sparsify(char const *threshold=NULL, bool take_abs=true)
reduce tensor to sparse format, storing only nonzero data, or data above a specified threshold...
void stop()
Definition: int_timer.cxx:151
bool has_home
whether the tensor has a home mapping/buffer
local process walltime measurement
Definition: timer.h:50
int64_t size
current size of local tensor data chunk (mapping-dependent)
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
Definition: common.h:37
int find_topology(topology const *topo, std::vector< topology * > &topovec)
searches for an equivalent topology in avector of topologies
Definition: topology.cxx:571
int get_best_topo(int64_t nvirt, int topo, CommData global_comm, int64_t bcomm_vol, int64_t bmemuse)
get the best topologoes (least nvirt) over all procs
Definition: topology.cxx:591
void set_distribution(char const *idx, CTF::Idx_Partition const &prl, CTF::Idx_Partition const &blk)
set edge mappings as specified
bool is_folded
whether the data is folded/transposed into a (lower-order) tensor
int64_t home_size
size of home buffer
untyped internal class for doubly-typed univariate function
Definition: sum_tsr.h:14
virtual char const * addid() const
identity element for addition i.e. 0
Definition: algstrct.cxx:89
#define DEBUG_PRINTF(...)
Definition: util.h:238
void sort(int64_t n)
sorts set of pairs using std::sort
Definition: algstrct.cxx:825
void copy_mapping(int order, mapping const *mapping_A, mapping *mapping_B)
copies mapping A to B
Definition: mapping.cxx:190
an instance of the CTF library (world) on a MPI communicator
Definition: world.h:19
void write_dense_to_file(MPI_File &file, int64_t offset=0)
write all tensor data to binary file in element order, unpacking from sparse or symmetric formats ...
void read_dense_from_file(MPI_File &file, int64_t offset=0)
read all tensor data from binary file in element order, which should be stored as nonsymmetric and de...
bool is_sparse
whether only the non-zero elements of the tensor are stored
int order
number of tensor dimensions
void read_val(char *buf) const
sets external value to the value pointed by the iterator
Definition: algstrct.cxx:801
virtual char * alloc(int64_t n) const
allocate space for n items, necessary for object types
Definition: algstrct.cxx:685
char * all_data
serialized buffer containing info and data
Definition: coo.h:17
int align(tensor const *B)
align mapping of this tensor to that of B
void cyclic_reshuffle(int const *sym, distribution const &old_dist, int const *old_offsets, int *const *old_permutation, distribution const &new_dist, int const *new_offsets, int *const *new_permutation, char **ptr_tsr_data, char **ptr_tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm, bool reuse_buffers, char const *alpha, char const *beta)
Goes from any set of phases to any new set of phases.
void set_padding()
sets padding and local size of a tensor given a mapping
void padded_reshuffle(int const *sym, distribution const &old_dist, distribution const &new_dist, char *tsr_data, char **tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm)
Reshuffle elements using key-value pair read/write.
Definition: redist.cxx:8
Partition part
Definition: partition.h:30
CTF::World * wrld
distributed processor context on which tensor is defined
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
#define VPRINTF(...)
Definition: util.h:207
int rank
rank of local processor
Definition: world.h:24
bool is_cyclic
whether the tensor data is cyclically distributed (blocked if false)
virtual void min(char const *a, char const *b, char *c) const
c = min(a,b)
Definition: algstrct.cxx:132
int64_t nnz() const
retrieves number of nonzeros out of all_data
Definition: coo.cxx:45
int write(int64_t num_pair, char const *alpha, char const *beta, char *mapped_data, char const rw='w')
Add tensor data new=alpha*new+beta*old with <key, value> pairs where key is the global index for the ...
class for execution distributed contraction of tensors
Definition: contraction.h:16
CTF::World World
Definition: back_comp.h:7
int * padding
padding along each edge length (less than distribution phase)
int * lens
unpadded tensor edge lengths
LinModel< 3 > spredist_mdl(spredist_mdl_init,"spredist_mdl")
void get_data(int64_t nz, int order, int const *lens, int const *rev_ordering, int nrow_idx, char *tsr_data, algstrct const *sr, int const *phase, int const *phase_rank)
unfolds tensor data from COO format based on prespecification of row and column modes ...
Definition: coo.cxx:146
int64_t nnz() const
retrieves number of nonzeros out of all_data
Definition: csr.cxx:80
int64_t k() const
returns key of pair at head of ptr
Definition: algstrct.cxx:764
serialized matrix in coordinate format, meaning three arrays of dimension nnz are stored...
Definition: coo.h:14
int can_block_reshuffle(int order, int const *old_phase, mapping const *map)
determines if tensor can be permuted by block
Definition: redist.cxx:618
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
void print_map(FILE *stream=stdout, bool allcall=1) const
displays mapping information
int comp_dim_map(mapping const *map_A, mapping const *map_B)
compares two mappings
Definition: mapping.cxx:143
abstraction for a serialized sparse matrix stored in compressed sparse row (CSR) layout ...
Definition: csr.h:22
bool is_data_aliased
whether the tensor data is an alias of another tensor object&#39;s data
void(* abs)(char const *a, char *b)
b = max(a,addinv(a))
Definition: algstrct.h:42
int64_t k() const
returns key of pair at head of ptr
Definition: algstrct.cxx:789
int64_t nnz_tot
total number of nonzero elements summed over all procs
double est_time_transp(int order, int const *new_order, int const *edge_len, int dir, algstrct const *sr)
estimates time needed to transposes a non-symmetric (folded) tensor based on performance models ...
algstrct * sr
algstrct on which tensor elements and operations are defined
virtual void pair_dealloc(char *ptr) const
deallocate given pointer containing contiguous array of pairs
Definition: algstrct.cxx:693
mapping * edge_map
mappings of each tensor dimension onto topology dimensions
tensor * rec_tsr
representation of folded tensor (shares data pointer)
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
bool is_mapped
whether a mapping has been selected
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
Definition: common.cxx:336
void read_val(char *buf) const
sets value to the value pointed by the iterator
Definition: algstrct.cxx:776
mapping * child
Definition: mapping.h:26
void orient_subworld(CTF::World *greater_world, int &bw_mirror_rank, int &fw_mirror_rank, distribution *&odst, char **sub_buffer_)
maps data from this world (subcomm) to the correct order of processors with respect to a parent (grea...
void dgtog_reshuffle(int const *sym, int const *edge_len, distribution const &old_dist, distribution const &new_dist, char **ptr_tsr_data, char **ptr_tsr_new_data, algstrct const *sr, CommData ord_glb_comm)
void set_data(int64_t nz, int order, int const *lens, int const *ordering, int nrow_idx, char const *tsr_data, algstrct const *sr, int const *phase)
folds tensor data into COO format based on prespecification of row and column modes ...
Definition: coo.cxx:75
MPI_Comm cm
Definition: common.h:129
def copy(tensor, A)
Definition: core.pyx:3583
virtual void print(char const *a, FILE *fp=stdout) const
prints the value
Definition: algstrct.cxx:170
void depermute_keys(int order, int num_pair, int const *edge_len, int const *new_edge_len, int *const *permutation, char *pairs_buf, algstrct const *sr)
depermutes keys (apply P^T)
Definition: sparse_rw.cxx:99
void block_reshuffle(distribution const &old_dist, distribution const &new_dist, char *tsr_data, char *&tsr_cyclic_data, algstrct const *sr, CommData glb_comm)
Reshuffle elements by block given the global phases stay the same.
Definition: redist.cxx:454
int64_t calc_nvirt() const
calculate virtualization factor of tensor; returns the virtualization factor
int get_distribution_size(int order)
Definition: distribution.h:13
void permute_keys(int order, int num_pair, int const *edge_len, int const *new_edge_len, int *const *permutation, char *pairs_buf, int64_t *new_num_pair, algstrct const *sr)
permutes keys
Definition: sparse_rw.cxx:10
int check_self_mapping(tensor const *tsr, int const *idx_map)
checks mapping in preparation for tensors scale, summ or contract
Definition: mapping.cxx:332
int64_t nnz_loc
number of local nonzero elements
int univar_function(int n, World &dw)
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
void start()
Definition: int_timer.cxx:141
void pad_key(int order, int64_t num_pair, int const *edge_len, int const *padding, PairIterator pairs, algstrct const *sr, int const *offsets)
applies padding to keys
Definition: pad.cxx:6
bool profile
whether profiling should be done for contractions/sums involving this tensor
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
int64_t proc_bytes_available()
gives total memory available on this MPI process
Definition: memcontrol.cxx:655
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
Definition: algstrct.h:34
Definition: apsp.cxx:17
char * data
tensor data, either the data or the key-value pairs should exist at any given time ...
internal distributed tensor class
int lcm(int a, int b)
Definition: util.h:340
void scal_diag(int order, int64_t size, int nvirt, int const *edge_len, int const *sym, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int const *cphase_rank, char *vdata, algstrct const *sr, int const *sym_mask)
scales each element by 1/(number of entries equivalent to it after permutation of indices for which s...
Definition: pad.cxx:567
double blres_est_time(int64_t tot_sz, int nv0, int nv1)
estimates execution time, given that this processor sends and receives tot_sz across np procs ...
Definition: redist.cxx:449
void nosym_transpose(tensor *A, int all_fdim_A, int const *all_flen_A, int const *new_order, int dir)
void write_val(char const *buf)
sets value of head pair to what is in buf
Definition: algstrct.cxx:817
int64_t get_csr_size(int64_t nnz, int nrow_, int val_size)
computes the size of a serialized CSR matrix
Definition: csr.cxx:8
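A hedged sketch of the arithmetic such a routine performs; the header fields and index widths below are assumptions for illustration, not the library's exact serialization format.
#include <cstdint>
int64_t csr_size_sketch(int64_t nnz, int64_t nrow, int val_size){
  int64_t header  = 4 * (int64_t)sizeof(int64_t);          // e.g. nnz, nrow, val_size, total bytes
  int64_t row_ptr = (nrow + 1) * (int64_t)sizeof(int64_t); // row offsets (IA)
  int64_t col_idx = nnz * (int64_t)sizeof(int);            // column indices (JA)
  int64_t vals    = nnz * (int64_t)val_size;               // element payloads
  return header + row_ptr + col_idx + vals;
}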
void wr_pairs_layout(int order, int np, int64_t inwrite, char const *alpha, char const *beta, char rw, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *virt_phys_rank, int const *bucket_lda, char *wr_pairs_buf, char *rw_data, CommData glb_comm, algstrct const *sr, bool is_sparse, int64_t nnz_loc, int64_t *nnz_blk, char *&pprs_new, int64_t &nnz_loc_new)
read or write pairs from / to tensor
Definition: sparse_rw.cxx:872
topology * topo
topology to which the tensor is mapped
int redistribute(distribution const &old_dist, int const *old_offsets=NULL, int *const *old_permutation=NULL, int const *new_offsets=NULL, int *const *new_permutation=NULL)
permutes the data of a tensor to its new layout
int64_t packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in packed symmetric (SY, SH, or AS) layout
Definition: util.cxx:38
Idx_Partition reduce_order() const
extracts non-trivial part of partition by omitting unit dimensions
Definition: partition.cxx:55
class for executing distributed summation of tensors
Definition: summation.h:15
int read_local_nnz(int64_t *num_pair, char **mapped_data, bool unpack_sym=false) const
read tensor data pairs local to processor that have nonzero values
void spsfy_tsr(int order, int64_t size, int nvirt, int const *edge_len, int const *sym, int const *phase, int const *phys_phase, int const *virt_dim, int *phase_rank, char const *vdata, char *&vpairs, int64_t *nnz_blk, algstrct const *sr, int64_t const *edge_lda, std::function< bool(char const *)> f)
extracts all tensor values (in pair format) that pass a sparsifier function (including padded zeros i...
Definition: sparse_rw.cxx:276
void depad_tsr(int order, int64_t num_pair, int const *edge_len, int const *sym, int const *padding, int const *prepadding, char const *pairsb, char *new_pairsb, int64_t *new_num_pair, algstrct const *sr)
retrieves the unpadded pairs
Definition: pad.cxx:51
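Padding here means rounding each edge length up to a multiple of the distribution phase, so the "unpadded" pairs are those whose keys fall inside the original edge lengths. A small illustrative helper (not a CTF routine) for the amount of padding per edge:
int padding_sketch(int edge_len, int phase){
  return (phase - edge_len % phase) % phase;  // extra elements appended along this edge
}
// e.g. edge_len = 10, phase = 4 -> padding 2, padded edge length 12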
bool has_zero_edge_len
if true, the tensor has a zero edge length and is therefore identically zero, which short-circuits most operations
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
a tensor with an index map associated with it (necessary for overloaded operators) ...
Definition: idx_tensor.h:15
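At user level, these index maps are what let overloaded operators express contractions such as C["ij"] += A["ik"]*B["kj"]. A self-contained sketch of that usage, assuming the user-facing header ctf.hpp and the Matrix/fill_random interfaces behave as in recent CTF releases:
#include <mpi.h>
#include <ctf.hpp>
int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  {
    CTF::World dw(MPI_COMM_WORLD);
    int n = 4;
    CTF::Matrix<double> A(n, n, NS, dw), B(n, n, NS, dw), C(n, n, NS, dw);
    A.fill_random(0.0, 1.0);
    B.fill_random(0.0, 1.0);
    C["ij"] += A["ik"] * B["kj"];  // each operator[] yields an Idx_Tensor
  }
  MPI_Finalize();
  return 0;
}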
#define ABORT
Definition: util.h:162
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
Definition: util.cxx:10
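For a single fully symmetric (SY, diagonal kept) group with edge length n, the packed entry count is the binomial coefficient C(n + order - 1, order); the helper below illustrates that count and is not the library routine itself.
#include <cstdint>
int64_t sy_group_size(int64_t n, int order){
  int64_t sz = 1;
  for (int i = 0; i < order; i++)
    sz = sz * (n + i) / (i + 1);  // remains exact: each step equals C(n+i, i+1)
  return sz;
}
// e.g. order = 2, n = 4 gives C(5,2) = 10 packed entries versus 16 dense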
char * name
name given to tensor
int np
number of processors
Definition: world.h:26
double spredist_mdl_init[]
Definition: init_models.cxx:37
int sum_tensors(bool run_diag)
PDAXPY: a*idx_map_A(A) + b*idx_map_B(B) -> idx_map_B(B). Treats symmetric as lower triangular...
Definition: summation.cxx:1384
MPI_Comm comm
set of processors making up this world
Definition: world.h:22
int read_local(int64_t *num_pair, char **mapped_data, bool unpack_sym=false) const
read tensor data pairs local to processor, including those with zero values. WARNING: for sparse tensor...
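A hedged usage sketch of this interface; treating pair_dealloc as the matching deallocator for the returned buffer is an assumption made for illustration, as is the inclusion of the internal tensor header.
#include <cstdint>
#include <cstddef>
#include "untyped_tensor.h"  // assumed: declares CTF_int::tensor
void dump_local_pairs(CTF_int::tensor const * tsr){
  int64_t npair = 0;
  char *  pairs = NULL;
  tsr->read_local(&npair, &pairs);  // allocates and fills the local pair buffer
  // ... decode npair key/value pairs via tsr->sr (value size tsr->sr->el_size) ...
  tsr->sr->pair_dealloc(pairs);     // assumed matching deallocator
}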
void clear()
resets mapping to NOT_MAPPED
Definition: mapping.cxx:94
int conv_idx(int order, type const *cidx, int **iidx)
Definition: common.cxx:50
virtual MPI_Datatype mdtype() const
MPI datatype.
Definition: algstrct.cxx:80
def np(self)
Definition: core.pyx:315
void clear_mapping()
zeros out mapping