Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
spsum_tsr.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2015, Edgar Solomonik, all rights reserved.*/
2 
3 #include "../shared/util.h"
4 #include "spsum_tsr.h"
5 #include "spr_seq_sum.h"
6 #include "../interface/fun_term.h"
7 #include "../interface/idx_tensor.h"
8 
9 
10 namespace CTF_int {
11  tspsum::tspsum(tspsum * other) : tsum(other) {
12  is_sparse_A = other->is_sparse_A;
13  nnz_A = other->nnz_A;
14  nvirt_A = other->nvirt_A;
15  is_sparse_B = other->is_sparse_B;
16  nnz_B = other->nnz_B;
17  nvirt_B = other->nvirt_B;
18  new_nnz_B = other->new_nnz_B;
19  new_B = other->new_B;
20 
21  //nnz_blk_B should be copied by pointer, they are the same pointer as in tensor object
22  nnz_blk_B = other->nnz_blk_B;
23  //nnz_blk_A should be copied by value, since it needs to be potentially set in replicate and deallocated later
24  if (is_sparse_A){
25  nnz_blk_A = (int64_t*)alloc(sizeof(int64_t)*nvirt_A);
26  memcpy(nnz_blk_A, other->nnz_blk_A, sizeof(int64_t)*nvirt_A);
27  } else nnz_blk_A = NULL;
28  }
29 
31  if (buffer != NULL) cdealloc(buffer);
32  if (nnz_blk_A != NULL) cdealloc(nnz_blk_A);
33  }
34 
35  tspsum::tspsum(summation const * s) : tsum(s) {
36  is_sparse_A = s->A->is_sparse;
37  nnz_A = s->A->nnz_loc;
38  nvirt_A = s->A->calc_nvirt();
39 
40  is_sparse_B = s->B->is_sparse;
41  nnz_B = s->B->nnz_loc;
42  nvirt_B = s->B->calc_nvirt();
43 
44  if (is_sparse_A){
45  nnz_blk_A = (int64_t*)alloc(sizeof(int64_t)*nvirt_A);
46  memcpy(nnz_blk_A, s->A->nnz_blk, sizeof(int64_t)*nvirt_A);
47  } else nnz_blk_A = NULL;
48 
49  nnz_blk_B = s->B->nnz_blk;
50  new_nnz_B = nnz_B;
51  new_B = NULL;
52  }
53 
55  cdealloc(virt_dim);
56  delete rec_tsum;
57  }
58 
60  tspsum_virt * o = (tspsum_virt*)other;
61  rec_tsum = o->rec_tsum->clone();
62  num_dim = o->num_dim;
63  virt_dim = (int*)alloc(sizeof(int)*num_dim);
64  memcpy(virt_dim, o->virt_dim, sizeof(int)*num_dim);
65 
66  order_A = o->order_A;
67  blk_sz_A = o->blk_sz_A;
68  nnz_blk_A = o->nnz_blk_A;
69  idx_map_A = o->idx_map_A;
70 
71  order_B = o->order_B;
72  blk_sz_B = o->blk_sz_B;
73  nnz_blk_B = o->nnz_blk_B;
74  idx_map_B = o->idx_map_B;
75  }
76 
78  order_A = s->A->order;
79  idx_map_A = s->idx_A;
80  order_B = s->B->order;
81  idx_map_B = s->idx_B;
82  }
83 
85  return new tspsum_virt(this);
86  }
87 
89  int i;
90  printf("tspsum_virt:\n");
91  printf("blk_sz_A = %ld, blk_sz_B = %ld\n",
93  for (i=0; i<num_dim; i++){
94  printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
95  }
96  rec_tsum->print();
97  }
98 
100  return (order_A+order_B+3*num_dim)*sizeof(int);
101  }
102 
104  int * idx_arr, * lda_A, * lda_B, * beta_arr;
105  int * ilda_A, * ilda_B;
106  int64_t i, off_A, off_B;
107  int nb_A, nb_B, alloced, ret;
108  TAU_FSTART(spsum_virt);
109 
110  if (this->buffer != NULL){
111  alloced = 0;
112  idx_arr = (int*)this->buffer;
113  } else {
114  alloced = 1;
115  ret = alloc_ptr(mem_fp(), (void**)&idx_arr);
116  ASSERT(ret==0);
117  }
118 
119  lda_A = idx_arr + num_dim;
120  lda_B = lda_A + order_A;
121  ilda_A = lda_B + order_B;
122  ilda_B = ilda_A + num_dim;
123 
124 
125  #define SET_LDA_X(__X) \
126  do { \
127  nb_##__X = 1; \
128  for (i=0; i<order_##__X; i++){ \
129  lda_##__X[i] = nb_##__X; \
130  nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]]; \
131  } \
132  memset(ilda_##__X, 0, num_dim*sizeof(int)); \
133  for (i=0; i<order_##__X; i++){ \
134  ilda_##__X[idx_map_##__X[i]] += lda_##__X[i]; \
135  } \
136  } while (0)
137  SET_LDA_X(A);
138  SET_LDA_X(B);
139  #undef SET_LDA_X
140 
141  /* dynammically determined size */
142  beta_arr = (int*)alloc(sizeof(int)*nb_B);
143 
144  int64_t * sp_offsets_A = NULL;
145  if (is_sparse_A){
146  sp_offsets_A = (int64_t*)alloc(sizeof(int64_t)*nb_A);
147  sp_offsets_A[0] = 0;
148  for (int i=1; i<nb_A; i++){
149  sp_offsets_A[i] = sp_offsets_A[i-1]+nnz_blk_A[i-1];
150  }
151  }
152 
153  int64_t * sp_offsets_B = NULL;
154  int64_t * new_sp_szs_B = NULL;
155  char ** buckets_B = NULL;
156  if (is_sparse_B){
157  sp_offsets_B = (int64_t*)alloc(sizeof(int64_t)*nb_B);
158  new_sp_szs_B = nnz_blk_B; //(int64_t*)alloc(sizeof(int64_t)*nb_B);
159 // memcpy(new_sp_szs_B, blk_sz_B, sizeof(int64_t)*nb_B);
160  buckets_B = (char**)alloc(sizeof(char*)*nb_B);
161  for (int i=0; i<nb_B; i++){
162  if (i==0)
163  sp_offsets_B[0] = 0;
164  else
165  sp_offsets_B[i] = sp_offsets_B[i-1]+nnz_blk_B[i-1];
166  buckets_B[i] = this->B + sp_offsets_B[i]*this->sr_B->pair_size();
167  }
168  }
169 
170 
171  memset(idx_arr, 0, num_dim*sizeof(int));
172  memset(beta_arr, 0, nb_B*sizeof(int));
173  off_A = 0, off_B = 0;
174  rec_tsum->alpha = this->alpha;
175  rec_tsum->beta = this->beta;
176  for (;;){
177  if (is_sparse_A){
178  rec_tsum->nnz_A = nnz_blk_A[off_A];
179  rec_tsum->A = this->A + sp_offsets_A[off_A]*this->sr_A->pair_size();
180  } else
181  rec_tsum->A = this->A + off_A*blk_sz_A*this->sr_A->el_size;
182  if (is_sparse_B){
183  rec_tsum->nnz_B = new_sp_szs_B[off_B];
184  rec_tsum->B = buckets_B[off_B];
185  } else
186  rec_tsum->B = this->B + off_B*blk_sz_B*this->sr_B->el_size;
187 // sr_B->copy(rec_tsum->beta, sr_B->mulid());
188  if (beta_arr[off_B]>0)
189  rec_tsum->beta = sr_B->mulid();
190  else
191  rec_tsum->beta = this->beta;
192 
193  rec_tsum->run();
194  if (is_sparse_B){
195  new_sp_szs_B[off_B] = rec_tsum->new_nnz_B;
196  if (beta_arr[off_B] > 0) sr_B->pair_dealloc(buckets_B[off_B]);
197  buckets_B[off_B] = rec_tsum->new_B;
198  }
199  beta_arr[off_B] = 1;
200 
201  for (i=0; i<num_dim; i++){
202  off_A -= ilda_A[i]*idx_arr[i];
203  off_B -= ilda_B[i]*idx_arr[i];
204  idx_arr[i]++;
205  if (idx_arr[i] >= virt_dim[i])
206  idx_arr[i] = 0;
207  off_A += ilda_A[i]*idx_arr[i];
208  off_B += ilda_B[i]*idx_arr[i];
209  if (idx_arr[i] != 0) break;
210  }
211  if (i==num_dim) break;
212  }
213  if (is_sparse_B){
214  this->new_nnz_B = 0;
215  for (int i=0; i<nb_B; i++){
216  this->new_nnz_B += new_sp_szs_B[i];
217  }
218  new_B = sr_B->pair_alloc(this->new_nnz_B);
219  int64_t pfx = 0;
220  for (int i=0; i<nb_B; i++){
221  //memcpy(new_B+pfx, buckets_B[i], new_sp_szs_B[i]*this->sr_B->pair_size());
222  //printf("copying %ld pairs\n",new_sp_szs_B[i]);
223  sr_B->copy_pairs(new_B+pfx, buckets_B[i], new_sp_szs_B[i]);
224  pfx += new_sp_szs_B[i]*this->sr_B->pair_size();
225  if (beta_arr[i] > 0) sr_B->pair_dealloc(buckets_B[i]);
226  }
227  //FIXME: how to pass B back generally
228  //cdealloc(this->B);
229  cdealloc(buckets_B);
230  }
231  if (is_sparse_A) cdealloc(sp_offsets_A);
232  if (is_sparse_B) cdealloc(sp_offsets_B);
233  if (alloced){
234  cdealloc(idx_arr);
235  }
236  cdealloc(beta_arr);
237  TAU_FSTOP(spsum_virt);
238  }
239 
241  int i;
242  printf("tspsum_replicate: \n");
243  printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
244  cdt_A, size_A, ncdt_A);
245  for (i=0; i<ncdt_A; i++){
246  printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
247  }
248  printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
249  cdt_B, size_B, ncdt_B);
250  for (i=0; i<ncdt_B; i++){
251  printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
252  }
253 
254  rec_tsum->print();
255  }
256 
258  delete rec_tsum;
259 /* for (int i=0; i<ncdt_A; i++){
260  cdt_A[i]->deactivate();
261  }*/
262  if (ncdt_A > 0)
263  cdealloc(cdt_A);
264 /* for (int i=0; i<ncdt_B; i++){
265  cdt_B[i]->deactivate();
266  }*/
267  if (ncdt_B > 0)
268  cdealloc(cdt_B);
269  }
270 
272  tspsum_replicate * o = (tspsum_replicate*)other;
273  rec_tsum = o->rec_tsum->clone();
274  size_A = o->size_A;
275  size_B = o->size_B;
276  ncdt_A = o->ncdt_A;
277  ncdt_B = o->ncdt_B;
278  }
279 
280 
282  int const * phys_mapped,
283  int64_t blk_sz_A,
284  int64_t blk_sz_B)
285  : tspsum(s) {
286  //FIXME: might be smarter to use virtual inheritance and not replicate all the code from tsum_replicdate
287  int i;
288  int nphys_dim = s->A->topo->order;
289  this->ncdt_A = 0;
290  this->ncdt_B = 0;
291  this->size_A = blk_sz_A;
292  this->size_B = blk_sz_B;
293  this->cdt_A = NULL;
294  this->cdt_B = NULL;
295  for (i=0; i<nphys_dim; i++){
296  if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
297  this->ncdt_A++;
298  }
299  if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
300  this->ncdt_B++;
301  }
302  }
303  if (this->ncdt_A > 0)
304  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_A, (void**)&this->cdt_A);
305  if (this->ncdt_B > 0)
306  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_B, (void**)&this->cdt_B);
307  this->ncdt_A = 0;
308  this->ncdt_B = 0;
309  for (i=0; i<nphys_dim; i++){
310  if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
311  this->cdt_A[this->ncdt_A] = &s->A->topo->dim_comm[i];
312  this->ncdt_A++;
313  }
314  if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
315  this->cdt_B[this->ncdt_B] = &s->B->topo->dim_comm[i];
316  this->ncdt_B++;
317  }
318  }
319  ASSERT(this->ncdt_A == 0 || this->cdt_B == 0);
320  }
321 
323  return new tspsum_replicate(this);
324  }
325 
327  return 0;
328  }
329 
331  int brank, i;
332  char * buf = this->A;
333 // int64_t * save_nnz_blk_A = NULL;
334  if (is_sparse_A){
335  /*if (ncdt_A > 0){
336  save_nnz_blk_A = (int64_t*)alloc(sizeof(int64_t)*nvirt_A);
337  memcpy(save_nnz_blk_A,nnz_blk_A,sizeof(int64_t)*nvirt_A);
338  }*/
339  size_A = nnz_A;
340  for (i=0; i<ncdt_A; i++){
341  cdt_A[i]->bcast(&size_A, 1, MPI_INT64_T, 0);
342  cdt_A[i]->bcast(nnz_blk_A, nvirt_A, MPI_INT64_T, 0);
343  }
344  //get mpi dtype for pair object
345  MPI_Datatype md;
346  bool need_free = get_mpi_dt(size_A, sr_A->pair_size(), md);
347 
348  if (nnz_A != size_A)
349  buf = (char*)alloc(sr_A->pair_size()*size_A);
350  for (i=0; i<ncdt_A; i++){
351  cdt_A[i]->bcast(buf, size_A, md, 0);
352  }
353  if (need_free) MPI_Type_free(&md);
354  } else {
355  for (i=0; i<ncdt_A; i++){
356  cdt_A[i]->bcast(this->A, size_A, sr_A->mdtype(), 0);
357  }
358  }
359  if (is_sparse_B){
360  //FIXME: need to replicate nnz_blk_B for this
361  assert(ncdt_B == 0);
362  size_B = nnz_B;
363  for (i=0; i<ncdt_B; i++){
364  cdt_B[i]->bcast(&size_B, 1, MPI_INT64_T, 0);
365  cdt_B[i]->bcast(nnz_blk_B, nvirt_B, MPI_INT64_T, 0);
366  }
367  }
368 
369  /* for (i=0; i<ncdt_B; i++){
370  POST_BCAST(this->B, size_B*sizeof(dtype), COMM_CHAR_T, 0, cdt_B[i]-> 0);
371  }*/
372  ASSERT(ncdt_B == 0 || !is_sparse_B);
373  brank = 0;
374  for (i=0; i<ncdt_B; i++){
375  brank += cdt_B[i]->rank;
376  }
377  if (brank != 0) sr_B->set(this->B, sr_B->addid(), size_B);
378 
380  rec_tsum->A = buf;
381  rec_tsum->nnz_A = size_A;
382  rec_tsum->B = this->B;
383  rec_tsum->nnz_B = nnz_A;
384  rec_tsum->nnz_blk_B = this->nnz_blk_B;
385  rec_tsum->alpha = this->alpha;
386  if (brank != 0)
387  rec_tsum->beta = sr_B->mulid();
388  else
389  rec_tsum->beta = this->beta;
390 
391  rec_tsum->run();
392 
394  new_B = rec_tsum->new_B;
395  //printf("new_nnz_B = %ld\n",new_nnz_B);
396  if (buf != this->A) cdealloc(buf);
397 
398  for (i=0; i<ncdt_B; i++){
399  cdt_B[i]->allred(MPI_IN_PLACE, this->B, size_B, sr_B->mdtype(), sr_B->addmop());
400  }
401 
402 /* if (save_nnz_blk_A != NULL){
403  memcpy(nnz_blk_A,save_nnz_blk_A,sizeof(int64_t)*nvirt_A);
404  }*/
405 
406  }
407 
408 
410  seq_tsr_spsum * o = (seq_tsr_spsum*)other;
411 
412  order_A = o->order_A;
413  idx_map_A = o->idx_map_A;
414  sym_A = o->sym_A;
415  edge_len_A = (int*)alloc(sizeof(int)*order_A);
416  memcpy(edge_len_A, o->edge_len_A, sizeof(int)*order_A);
417 
418  order_B = o->order_B;
419  idx_map_B = o->idx_map_B;
420  sym_B = o->sym_B;
421  edge_len_B = (int*)alloc(sizeof(int)*order_B);
422  memcpy(edge_len_B, o->edge_len_B, sizeof(int)*order_B);
423 
424  is_inner = o->is_inner;
425  inr_stride = o->inr_stride;
426 
427  map_pfx = o->map_pfx;
428 
429  is_custom = o->is_custom;
430  func = o->func;
431  }
432 
434  order_A = s->A->order;
435  sym_A = s->A->sym;
436  idx_map_A = s->idx_A;
437  order_B = s->B->order;
438  sym_B = s->B->sym;
439  idx_map_B = s->idx_B;
440  is_custom = s->is_custom;
441 
442  map_pfx = 1;
443 
444  //printf("A is sparse = %d, B is sparse = %d\n", s->A->is_sparse, s->B->is_sparse);
445  if (s->A->is_sparse && s->B->is_sparse){
446  for (int i=0; i<s->B->order; i++){
447  bool mapped = true;
448  for (int j=0; j<s->A->order; j++){
449  if (s->idx_B[i] == s->idx_A[j]){
450  mapped = false;
451  }
452  }
453  if (mapped)
454  map_pfx *= s->B->pad_edge_len[i]/s->B->edge_map[i].calc_phase();
455  }
456  }
457  }
458 
460  int i;
461  printf("seq_tsr_spsum:\n");
462  for (i=0; i<order_A; i++){
463  printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
464  }
465  for (i=0; i<order_B; i++){
466  printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
467  }
468  printf("is inner = %d\n", is_inner);
469  if (is_inner) printf("inner stride = %d\n", inr_stride);
470  printf("map_pfx = %ld\n", map_pfx);
471  }
472 
474  return new seq_tsr_spsum(this);
475  }
476 
477  int64_t seq_tsr_spsum::mem_fp(){ return 0; }
478 
480  if (is_sparse_A && !is_sparse_B){
481  spA_dnB_seq_sum(this->alpha,
482  this->A,
483  this->nnz_A,
484  this->sr_A,
485  this->beta,
486  this->B,
487  this->sr_B,
488  order_B,
489  edge_len_B,
490  sym_B,
491  func);
492  } else if (!is_sparse_A && is_sparse_B){
493  dnA_spB_seq_sum(this->alpha,
494  this->A,
495  this->sr_A,
496  order_A,
497  edge_len_A,
498  sym_A,
499  this->beta,
500  this->B,
501  this->nnz_B,
502  this->new_B,
503  this->new_nnz_B,
504  this->sr_B,
505  func);
506  } else if (is_sparse_A && is_sparse_B){
507  spA_spB_seq_sum(this->alpha,
508  this->A,
509  this->nnz_A,
510  this->sr_A,
511  this->beta,
512  this->B,
513  this->nnz_B,
514  this->new_B,
515  this->new_nnz_B,
516  this->sr_B,
517  func,
518  this->map_pfx);
519  } else {
520  assert(0); //we should be doing dense summation then
521  }
522  }
523 
525  delete rec_tsum;
526  cdealloc(map_idx_len);
527  cdealloc(map_idx_lda);
528  }
529 
530 
532  tspsum_map * o = (tspsum_map*)other;
533  rec_tsum = o->rec_tsum->clone();
534  nmap_idx = o->nmap_idx;
535  map_idx_len = (int64_t*)alloc(sizeof(int64_t)*nmap_idx);
536  map_idx_lda = (int64_t*)alloc(sizeof(int64_t)*nmap_idx);
537  memcpy(map_idx_len, o->map_idx_len, sizeof(int64_t)*nmap_idx);
538  memcpy(map_idx_lda, o->map_idx_lda, sizeof(int64_t)*nmap_idx);
539  }
540 
542  nmap_idx = 0;
543  map_idx_len = (int64_t*)alloc(sizeof(int64_t)*s->B->order);
544  map_idx_lda = (int64_t*)alloc(sizeof(int64_t)*s->B->order);
545  int map_idx_rev[s->B->order];
546 
547  int64_t lda_B[s->B->order];
548  lda_B[0] = 1;
549  for (int o=1; o<s->B->order; o++){
550  if (s->B->is_sparse)
551  lda_B[o] = lda_B[o-1]*s->B->lens[o];
552  else
553  lda_B[o] = lda_B[o-1]*s->B->pad_edge_len[o]/s->B->edge_map[o].calc_phase();
554  }
555 
556  for (int oB=0; oB<s->B->order; oB++){
557  bool inA = false;
558  for (int oA=0; oA<s->A->order; oA++){
559  if (s->idx_A[oA] == s->idx_B[oB]){
560  inA = true;
561  }
562  }
563  if (!inA){
564  bool is_rep=false;
565  for (int ooB=0; ooB<oB; ooB++){
566  if (s->idx_B[ooB] == s->idx_B[oB]){
567  is_rep = true;
568  map_idx_lda[map_idx_rev[ooB]] += lda_B[oB];
569  break;
570  }
571  }
572  if (!is_rep){
573  map_idx_len[nmap_idx] = s->B->lens[oB]/s->B->edge_map[oB].calc_phase() + (s->B->lens[oB]/s->B->edge_map[oB].calc_phase() > s->B->edge_map[oB].calc_phase());
574  map_idx_lda[nmap_idx] = lda_B[oB];
575  map_idx_rev[nmap_idx] = oB;
576  nmap_idx++;
577  }
578  }
579  }
580  }
581 
583  return new tspsum_map(this);
584  }
585 
587  printf("tspsum_map:\n");
588  printf("namp_idx = %d\n",nmap_idx);
589  rec_tsum->print();
590  }
591 
593  int64_t mem = nnz_A*this->sr_A->pair_size();
594  if (nmap_idx > 0){
595  int64_t tot_rep=1;
596  for (int midx=0; midx<nmap_idx; midx++){
597  tot_rep *= map_idx_len[midx];
598  }
599  return tot_rep*mem;
600  } else return mem;
601  }
602 
605  int64_t tot_rep=1;
606  for (int midx=0; midx<nmap_idx; midx++){
607  tot_rep *= map_idx_len[midx];
608  }
609  PairIterator pi(this->sr_A, A);
610  char * buf;
611  alloc_ptr(this->sr_A->pair_size()*nnz_A*tot_rep, (void**)&buf);
612  //printf("pair size is %d, nnz is %ld\n",this->sr_A->pair_size(), nnz_A);
613  PairIterator pi_new(this->sr_A, buf);
614 #ifdef USE_OMP
615  #pragma omp parallel for
616 #endif
617  for (int64_t i=0; i<nnz_A; i++){
618  for (int64_t r=0; r<tot_rep; r++){
619  this->sr_A->copy(pi_new[i*tot_rep+r].ptr, pi[i].ptr);
620  }
621  }
622 #ifdef USE_OMP
623  #pragma omp parallel for
624 #endif
625  for (int64_t i=0; i<nnz_A; i++){
626  int64_t phase=1;
627  for (int midx=0; midx<nmap_idx; midx++){
628  int64_t stride=phase;
629  phase *= map_idx_len[midx];
630  for (int64_t r=0; r<tot_rep/phase; r++){
631  for (int64_t m=1; m<map_idx_len[midx]; m++){
632  for (int64_t s=0; s<stride; s++){
633  ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
634  }
635  }
636  }
637  }
638  }
639  nnz_A *= tot_rep;
640  rec_tsum->nnz_A = nnz_A;
641  rec_tsum->A = buf;
642  rec_tsum->nnz_B = nnz_B;
643  rec_tsum->B = B;
644  for (int v=0; v<nvirt_A; v++){
645  nnz_blk_A[v] *= tot_rep;
646  }
649  rec_tsum->run();
652  new_B = rec_tsum->new_B;
653  cdealloc(buf);
655  }
656 
658  delete rec_tsum;
659  cdealloc(p);
660  cdealloc(lens_new);
661  cdealloc(lens_old);
662  }
663 
665  tspsum_permute * o = (tspsum_permute*)other;
666  rec_tsum = o->rec_tsum->clone();
667  A_or_B = o->A_or_B;
668  order = o->order;
669  skip = o->skip;
670  p = (int*)alloc(sizeof(int)*order);
671  lens_old = (int*)alloc(sizeof(int)*order);
672  lens_new = (int*)alloc(sizeof(int)*order);
673  memcpy(p, o->p, sizeof(int)*order);
674  memcpy(lens_old, o->lens_old, sizeof(int)*order);
675  memcpy(lens_new, o->lens_new, sizeof(int)*order);
676  }
677 
678  tspsum_permute::tspsum_permute(summation const * s, bool A_or_B_, int const * lens) : tspsum(s) {
679  tensor * X, * Y;
680  int const * idx_X, * idx_Y;
681  A_or_B = A_or_B_;
682  if (A_or_B){
683  X = s->A;
684  Y = s->B;
685  idx_X = s->idx_A;
686  idx_Y = s->idx_B;
687  } else {
688  X = s->B;
689  Y = s->A;
690  idx_X = s->idx_B;
691  idx_Y = s->idx_A;
692  }
693  order = X->order;
694 
695  p = (int*)alloc(sizeof(int)*order);
696  lens_old = (int*)alloc(sizeof(int)*order);
697  lens_new = (int*)alloc(sizeof(int)*order);
698 
699  memcpy(lens_old, lens, sizeof(int)*this->order);
700  memcpy(lens_new, lens, sizeof(int)*this->order);
701 /* for (int i=0; i<this->order; i++){
702  memcpy(lens_new, lens, sizeof(int)*this->order);
703  }
704  if (Y->is_sparse){
705  } else {
706  lens_new[i] = X->pad_edge_len[i]/X->edge_map[i].calc_phase();
707  }*/
708  if (A_or_B){
709  // if A then ignore 'reduced' indices
710  for (int i=0; i<this->order; i++){
711  p[i] = -1;
712  for (int j=0; j<Y->order; j++){
713  if (idx_X[i] == idx_Y[j]){
714  ASSERT(p[i] == -1); // no repeating indices allowed here!
715  p[i] = j;
716  }
717  }
718  }
719  //adjust p in case there are mapped indices in B
720  int lesser[this->order];
721  for (int i=0; i<this->order; i++){
722  if (p[i] != -1){
723  lesser[i] = 0;
724  for (int j=0; j<this->order; j++){
725  if (i!=j && p[j] != -1 && p[j] < p[i]) lesser[i]++;
726  }
727  }
728  }
729  for (int i=0; i<this->order; i++){
730  if (p[i] != -1)
731  p[i] = lesser[i];
732  }
733  } else {
734  // if B then put 'map' indices first
735  int nmap_idx = 0;
736  for (int i=0; i<this->order; i++){
737  bool mapped = true;
738  for (int j=0; j<Y->order; j++){
739  if (idx_X[i] == idx_Y[j]){
740  mapped = false;
741  }
742  }
743  if (mapped) nmap_idx++;
744  }
745 
746  int nm = 0;
747  int nnm = 0;
748  for (int i=0; i<this->order; i++){
749  p[i] = nm;
750  for (int j=0; j<Y->order; j++){
751  if (idx_X[i] == idx_Y[j]){
752  ASSERT(p[i] == nm); // no repeating indices allowed here!
753  p[i] = nnm+nmap_idx;
754  nnm++;
755  }
756  }
757  if (p[i] == nm) nm++;
758  }
759  }
760  skip = true;
761  for (int i=0; i<this->order; i++){
762  if (p[i] != i) skip = false;
763 // printf("p[%d] = %d order = %d\n", i, p[i], this->order);
764  if (p[i] != -1) lens_new[p[i]] = lens[i];
765  }
766  }
767 
769  return new tspsum_permute(this);
770  }
771 
773  printf("tspsum_permute:\n");
774  if (A_or_B) printf("permuting A\n");
775  else printf("permuting B\n");
776  for (int i=0; i<order; i++){
777  printf("p[%d] = %d ",i,p[i]);
778  }
779  printf("\n");
780  rec_tsum->print();
781  }
782 
784  int64_t mem = 0;
785  if (A_or_B) mem+=nnz_A*sr_A->pair_size();
786  else mem+=nnz_B*sr_B->pair_size();
787  return mem;
788  }
789 
791  char * buf;
792 
793  if (skip){
794  rec_tsum->A = A;
795  rec_tsum->B = B;
796  rec_tsum->nnz_A = nnz_A;
797  rec_tsum->nnz_B = nnz_B;
798 
799  rec_tsum->run();
801  new_B = rec_tsum->new_B;
802  return;
803  }
804 
805  TAU_FSTART(spsum_permute);
806  if (A_or_B){
807  alloc_ptr(nnz_A*sr_A->pair_size(), (void**)&buf);
808  rec_tsum->A = buf;
809  rec_tsum->B = B;
810  sr_A->copy_pairs(buf, A, nnz_A);
811  int64_t new_lda_A[order];
812  memset(new_lda_A, 0, order*sizeof(int64_t));
813  int64_t lda=1;
814  for (int i=0; i<order; i++){
815  for (int j=0; j<order; j++){
816  if (p[j] == i){
817  new_lda_A[j] = lda;
818  lda *= lens_new[i];
819  }
820  }
821  }
822  ConstPairIterator rA(sr_A, A);
823  PairIterator wA(sr_A, buf);
824  rA.permute(nnz_A, order, lens_old, new_lda_A, wA);
825 
826  PairIterator mwA = wA;
827  for (int v=0; v<nvirt_A; v++){
828  mwA.sort(nnz_blk_A[v]);
829  mwA = mwA[nnz_blk_A[v]];
830  }
831  rec_tsum->A = buf;
832  } else {
833  alloc_ptr(nnz_B*sr_B->pair_size(), (void**)&buf);
834  rec_tsum->A = A;
835  rec_tsum->B = buf;
836  sr_B->copy(buf, B, nnz_B);
837  int64_t old_lda_B[order];
838  int64_t lda=1;
839  for (int i=0; i<order; i++){
840  old_lda_B[i] = lda;
841  lda *= lens_new[i];
842  }
843  int64_t new_lda_B[order];
844  std::fill(new_lda_B, new_lda_B+order, 0);
845  for (int i=0; i<order; i++){
846  new_lda_B[i] = old_lda_B[p[i]];
847  }
848  ConstPairIterator rB(sr_B, B);
849  PairIterator wB(sr_B, buf);
850  rB.permute(nnz_B, order, lens_old, new_lda_B, wB);
851 
852  PairIterator mwB = wB;
853  for (int v=0; v<nvirt_B; v++){
854  mwB.sort(nnz_blk_B[v]);
855  mwB = mwB[nnz_blk_B[v]];
856  }
857 
858  rec_tsum->B = buf;
859  }
860 
861  rec_tsum->nnz_A = nnz_A;
862  rec_tsum->nnz_B = nnz_B;
863  TAU_FSTOP(spsum_permute);
864  rec_tsum->run();
865  TAU_FSTART(spsum_permute);
866 
868  if (A_or_B){
869  new_B = rec_tsum->new_B;
870  cdealloc(buf);
871  } else {
872  if (nnz_B == new_nnz_B){
873  new_B = B;
874  } else {
875  new_B = (char*)alloc(new_nnz_B*sr_B->pair_size());
876  }
877  int inv_p[order];
878  for (int i=0; i<order; i++){
879  inv_p[p[i]] = i;
880  }
881  int64_t new_lda_B[order];
882  int64_t old_lda_B[order];
883  int64_t lda=1;
884  for (int i=0; i<order; i++){
885  old_lda_B[i] = lda;
886  lda *= lens_old[i];
887  }
888  for (int i=0; i<order; i++){
889  new_lda_B[i] = old_lda_B[inv_p[i]];
890  }
892  PairIterator wB(sr_B, new_B);
893 
894  rB.permute(new_nnz_B, order, lens_new, new_lda_B, wB);
895  PairIterator mwB = wB;
896  for (int v=0; v<nvirt_B; v++){
897  /*for (int i=0; i<nnz_blk_B[v]; i++){
898  printf("i=%d/%ld\n",i,nnz_blk_B[v]);
899  sr_B->print(mwB[i].d());
900  }*/
901  mwB.sort(nnz_blk_B[v]);
902  mwB = mwB[nnz_blk_B[v]];
903  }
904 
905  if (buf != rec_tsum->new_B && new_B != rec_tsum->new_B){
907  }
908  cdealloc(buf);
909  }
910  TAU_FSTOP(spsum_permute);
911  }
912 
913  void inv_idx(int order_A,
914  int const * idx_A,
915  int order_B,
916  int const * idx_B,
917  int * order_tot,
918  int ** idx_arr){
919  int i, dim_max;
920 
921  dim_max = -1;
922  for (i=0; i<order_A; i++){
923  if (idx_A[i] > dim_max) dim_max = idx_A[i];
924  }
925  for (i=0; i<order_B; i++){
926  if (idx_B[i] > dim_max) dim_max = idx_B[i];
927  }
928  dim_max++;
929  *order_tot = dim_max;
930  *idx_arr = (int*)alloc(sizeof(int)*2*dim_max);
931  std::fill((*idx_arr), (*idx_arr)+2*dim_max, -1);
932 
933  for (i=0; i<order_A; i++){
934  (*idx_arr)[2*idx_A[i]] = i;
935  }
936  for (i=0; i<order_B; i++){
937  (*idx_arr)[2*idx_B[i]+1] = i;
938  }
939  }
940 
942  delete rec_tsum;
943  cdealloc(divisor);
944  cdealloc(virt_dim);
945  cdealloc(phys_rank);
946  }
947 
949  tspsum_pin_keys * o = (tspsum_pin_keys*)other;
950 
951  rec_tsum = o->rec_tsum->clone();
952  A_or_B = o->A_or_B;
953  order = o->order;
954  lens = o->lens;
955  divisor = (int*)alloc(sizeof(int)*order);
956  phys_rank = (int*)alloc(sizeof(int)*order);
957  virt_dim = (int*)alloc(sizeof(int)*order);
958  memcpy(divisor, o->divisor, sizeof(int)*order);
959  memcpy(phys_rank, o->phys_rank, sizeof(int)*order);
960  memcpy(virt_dim, o->virt_dim, sizeof(int)*order);
961  }
962 
964  return new tspsum_pin_keys(this);
965  }
966 
968  printf("tspsum_pin_keys:\n");
969  if (A_or_B) printf("transforming global keys of A to local keys\n");
970  else printf("transforming global keys of B to local keys\n");
971  rec_tsum->print();
972  }
973 
975  return 3*order*sizeof(int);
976  }
977 
978  tspsum_pin_keys::tspsum_pin_keys(summation const * s, bool A_or_B_) : tspsum(s) {
979  tensor * X;
980  A_or_B = A_or_B_;
981  if (A_or_B){
982  X = s->A;
983  } else {
984  X = s->B;
985  }
986  order = X->order;
987  lens = X->lens;
988 
989  divisor = (int*)alloc(sizeof(int)*order);
990  phys_rank = (int*)alloc(sizeof(int)*order);
991  virt_dim = (int*)alloc(sizeof(int*)*order);
992 
993  for (int i=0; i<order; i++){
994  divisor[i] = X->edge_map[i].calc_phase();
995  phys_rank[i] = X->edge_map[i].calc_phys_rank(X->topo);
996  virt_dim[i] = divisor[i]/X->edge_map[i].calc_phys_phase();
997  }
998  }
999 
1001  TAU_FSTART(spsum_pin);
1002  char * X;
1003  algstrct const * sr;
1004  int64_t nnz;
1005  if (A_or_B){
1006  X = this->A;
1007  sr = this->sr_A;
1008  nnz = this->nnz_A;
1009  } else {
1010  X = this->B;
1011  sr = this->sr_B;
1012  nnz = this->nnz_B;
1013  }
1014 
1015 /* int * div_lens;
1016  alloc_ptr(order*sizeof(int), (void**)&div_lens);
1017  for (int j=0; j<order; j++){
1018  div_lens[j] = (lens[j]/divisor[j] + (lens[j]%divisor[j] > 0));
1019 // printf("lens[%d] = %d divisor[%d] = %d div_lens[%d] = %d\n",j,lens[j],j,divisor[j],j,div_lens[j]);
1020  }*/
1021 
1022  ConstPairIterator pi(sr, X);
1023  PairIterator pi_new(sr, X);
1024 
1025  pi.pin(nnz, order, lens, divisor, pi_new);
1026 
1027  if (A_or_B){
1028  rec_tsum->A = X;
1029  rec_tsum->B = B;
1030  } else {
1031  rec_tsum->A = A;
1032  rec_tsum->B = X;
1033  }
1034  rec_tsum->nnz_A = nnz_A;
1035  rec_tsum->nnz_B = nnz_B;
1036  TAU_FSTOP(spsum_pin);
1037  rec_tsum->run();
1038  TAU_FSTART(spsum_pin);
1039 
1041  new_B = rec_tsum->new_B;
1042  if (A_or_B){
1043  depin(sr_A, order, lens, divisor, nvirt_A, virt_dim, phys_rank, A, nnz_A, (int64_t*)nnz_blk_A, A, false);
1044  } else {
1045  char * old_B = new_B;
1047  if (old_B != new_B && old_B != B) sr->pair_dealloc(old_B);
1048  }
1049  TAU_FSTOP(spsum_pin);
1050  }
1051 }
void dnA_spB_seq_sum(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, char const *beta, char const *B, int64_t size_B, char *&new_B, int64_t &new_size_B, algstrct const *sr_B, univar_function const *func)
performs summation between two sparse tensors assumes B contain key value pairs sorted by key...
int calc_phys_rank(topology const *topo) const
compute the physical rank of a mapping
Definition: mapping.cxx:74
bool is_custom
whether there is a elementwise custom function
Definition: summation.h:32
algstrct const * sr_A
Definition: sum_tsr.h:70
void * buffer
Definition: sum_tsr.h:75
int64_t * nnz_blk
nonzero elements in each block owned locally
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype &dt)
gives a datatype for arbitrary datum_size, errors if exceeding 32-bits
Definition: common.cxx:587
int * sym
symmetries among tensor dimensions
int * idx_A
indices of left operand
Definition: summation.h:28
virtual int pair_size() const
gets pair size el_size plus the key size
Definition: algstrct.h:46
int calc_phase() const
compute the phase of a mapping
Definition: mapping.cxx:39
virtual char * pair_alloc(int64_t n) const
allocate space for n (int64_t,dtype) pairs, necessary for object types
Definition: algstrct.cxx:681
tspsum_replicate(tspsum *other)
Definition: spsum_tsr.cxx:271
tspsum(tspsum *other)
Definition: spsum_tsr.cxx:11
int64_t * nnz_blk_B
Definition: spsum_tsr.h:17
int * pad_edge_len
padded tensor edge lengths
tspsum_virt(tspsum *other)
iterates over the dense virtualization block grid and contracts
Definition: spsum_tsr.cxx:59
int calc_phys_phase() const
compute the physical phase of a mapping
Definition: mapping.cxx:57
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
invert index map
Definition: ctr_tsr.cxx:592
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm
Definition: common.cxx:360
void run()
wraps user sequential function signature
Definition: spsum_tsr.cxx:479
void spA_dnB_seq_sum(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, univar_function const *func)
performs summation between two sparse tensors assumes A contains key value pairs sorted by key...
Definition: spr_seq_sum.cxx:95
virtual void copy(char *a, char const *b) const
copies element b to element a
Definition: algstrct.cxx:538
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: spsum_tsr.cxx:783
void pin(int64_t n, int order, int const *lens, int const *divisor, PairIterator pi_new)
pins keys of n pairs
Definition: algstrct.cxx:849
bool is_sparse_A
Definition: spsum_tsr.h:10
int const * idx_map_B
Definition: spsum_tsr.h:101
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
void permute(int64_t n, int order, int const *old_lens, int64_t const *new_lda, PairIterator wA)
permutes keys of n pairs
Definition: algstrct.cxx:829
virtual void set_nnz_blk_A(int64_t const *nnbA)
Definition: spsum_tsr.h:25
bool is_sparse_B
Definition: spsum_tsr.h:14
virtual char const * addid() const
identity element for addition, i.e. 0.
Definition: algstrct.cxx:89
void sort(int64_t n)
sorts set of pairs using std::sort
Definition: algstrct.cxx:825
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: spsum_tsr.cxx:326
bool is_sparse
whether only the non-zero elements of the tensor are stored
int order
number of tensor dimensions
int64_t new_nnz_B
Definition: spsum_tsr.h:18
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: spsum_tsr.cxx:974
int * lens
unpadded tensor edge lengths
int64_t * map_idx_lda
Definition: spsum_tsr.h:140
#define SET_LDA_X(__X)
int const * idx_map_A
Definition: spsum_tsr.h:97
void depin(algstrct const *sr, int order, int const *lens, int const *divisor, int nvirt, int const *virt_dim, int const *phys_rank, char *X, int64_t &new_nnz_B, int64_t *nnz_blk, char *&new_B, bool check_padding)
depins keys of n pairs
Definition: algstrct.cxx:883
seq_tsr_spsum(tspsum *other)
copies sum object
Definition: spsum_tsr.cxx:409
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: spsum_tsr.cxx:477
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
virtual void run()
Definition: sum_tsr.h:77
virtual void pair_dealloc(char *ptr) const
deallocate given pointer containing contiguous array of pairs
Definition: algstrct.cxx:693
virtual MPI_Op addmop() const
MPI addition operation for reductions.
Definition: algstrct.cxx:73
mapping * edge_map
mappings of each tensor dimension onto topology dimensions
CommData * dim_comm
Definition: topology.h:20
int * idx_B
indices of output
Definition: summation.h:30
#define TAU_FSTOP(ARG)
Definition: util.h:281
int64_t nnz_B
Definition: spsum_tsr.h:15
#define TAU_FSTART(ARG)
Definition: util.h:280
char const * alpha
Definition: sum_tsr.h:71
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
Definition: common.cxx:336
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: spsum_tsr.cxx:99
univar_function const * func
Definition: spsum_tsr.h:111
char * B
Definition: sum_tsr.h:72
int64_t calc_nvirt() const
calculate virtualization factor of tensor return virtualization factor
tspsum_pin_keys(tspsum *other)
Definition: spsum_tsr.cxx:948
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: spsum_tsr.cxx:592
virtual tspsum * clone()
Definition: spsum_tsr.h:23
int64_t nnz_loc
number of local nonzero elements
int64_t nnz_A
Definition: spsum_tsr.h:11
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
virtual void copy_pairs(char *a, char const *b, int64_t n) const
copies n pair from array b to array a
Definition: algstrct.cxx:550
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
int const * idx_map_A
Definition: spsum_tsr.h:39
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
Definition: algstrct.h:34
int const * idx_map_B
Definition: spsum_tsr.h:42
tensor * B
output
Definition: summation.h:20
internal distributed tensor class
algstrct const * sr_B
Definition: sum_tsr.h:73
tspsum_map(tspsum *other)
Definition: spsum_tsr.cxx:531
tspsum_permute(tspsum *other)
Definition: spsum_tsr.cxx:664
int64_t * nnz_blk_A
Definition: spsum_tsr.h:13
int64_t * map_idx_len
Definition: spsum_tsr.h:139
topology * topo
topology to which the tensor is mapped
performs replication along a dimension, generates 2.5D algs
Definition: spsum_tsr.h:64
tensor * A
left operand
Definition: summation.h:18
class for execution distributed summation of tensors
Definition: summation.h:15
char * new_B
Definition: spsum_tsr.h:19
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
void spA_spB_seq_sum(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, char const *beta, char *B, int64_t size_B, char *&new_B, int64_t &new_size_B, algstrct const *sr_B, univar_function const *func, int64_t map_pfx)
performs summation between two sparse tensors assumes A and B contain key value pairs sorted by key...
char * A
Definition: sum_tsr.h:69
virtual void print()
Definition: sum_tsr.h:78
virtual MPI_Datatype mdtype() const
MPI datatype.
Definition: algstrct.cxx:80
def np(self)
Definition: core.pyx:315
char const * beta
Definition: sum_tsr.h:74