Cyclops Tensor Framework: parallel arithmetic on multidimensional arrays
spctr_tsr.cxx

/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/

#include "../shared/util.h"
#include "spctr_tsr.h"
#include "sp_seq_ctr.h"
#include "contraction.h"
#include "../sparse_formats/coo.h"
#include "../sparse_formats/csr.h"
#include "../tensor/untyped_tensor.h"

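/* Overview (editorial summary of what this file defines):
 *  - spctr:          base class recording which of A, B, and C are sparse
 *  - seq_tsr_spctr:  leaf kernel that dispatches to a sequential sparse
 *                    contraction loop or to a COO/CSR matrix multiply
 *  - spctr_virt:     iterates over a virtual block grid, invoking a
 *                    recursive kernel on each block combination
 *  - spctr_pin_keys: rewrites global pair keys as local keys around the
 *                    recursive call (and back for the output C)
 */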
namespace CTF_int {
  spctr::spctr(contraction const * c)
     : ctr(c) {
    is_sparse_A = c->A->is_sparse;
    is_sparse_B = c->B->is_sparse;
    is_sparse_C = c->C->is_sparse;
  }

  spctr::spctr(spctr * other)
     : ctr(other) {
    is_sparse_A = other->is_sparse_A;
    is_sparse_B = other->is_sparse_B;
    is_sparse_C = other->is_sparse_C;
  }

  spctr::~spctr(){ }

  seq_tsr_spctr::seq_tsr_spctr(contraction const * c,
                               int                 krnl_type_,
                               iparam const *      inner_params_,
                               int *               virt_blk_len_A,
                               int *               virt_blk_len_B,
                               int *               virt_blk_len_C,
                               int64_t             vrt_sz_C)
     : spctr(c) {

    int i, j, k;
    int * new_sym_A, * new_sym_B, * new_sym_C;
    CTF_int::alloc_ptr(sizeof(int)*c->A->order, (void**)&new_sym_A);
    memcpy(new_sym_A, c->A->sym, sizeof(int)*c->A->order);
    CTF_int::alloc_ptr(sizeof(int)*c->B->order, (void**)&new_sym_B);
    memcpy(new_sym_B, c->B->sym, sizeof(int)*c->B->order);
    CTF_int::alloc_ptr(sizeof(int)*c->C->order, (void**)&new_sym_C);
    memcpy(new_sym_C, c->C->sym, sizeof(int)*c->C->order);

    this->krnl_type    = krnl_type_;
    this->inner_params = *inner_params_;
    if (krnl_type > 0){
      if (c->A->wrld->cdt.rank == 0){
        DPRINTF(2,"Folded tensor n=%ld m=%ld k=%ld\n", inner_params_->n,
                inner_params_->m, inner_params_->k);
      }

      this->inner_params.sz_C = vrt_sz_C;
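      /* For matrix kernels (krnl_type > 0), the contraction indices of each
         operand have been folded into an inner matrix (rec_tsr). The loops
         below collapse every folded dimension of the outer virtual block:
         its block length becomes 1 and its symmetry is cleared, together
         with any dimensions coupled to it in a symmetric group. */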
      tensor * itsr;
      itsr = c->A->rec_tsr;
      for (i=0; i<itsr->order; i++){
        j = c->A->inner_ordering[i];
        for (k=0; k<c->A->order; k++){
          if (c->A->sym[k] == NS) j--;
          if (j<0) break;
        }
        j = k;
        while (k>0 && c->A->sym[k-1] != NS){
          k--;
        }
        for (; k<=j; k++){
/*          printf("inner_ordering[%d]=%d setting dim %d of A, to len %d from len %d\n",
                    i, c->A->inner_ordering[i], k, 1, virt_blk_len_A[k]);*/
          virt_blk_len_A[k] = 1;
          new_sym_A[k] = NS;
        }
      }
      itsr = c->B->rec_tsr;
      for (i=0; i<itsr->order; i++){
        j = c->B->inner_ordering[i];
        for (k=0; k<c->B->order; k++){
          if (c->B->sym[k] == NS) j--;
          if (j<0) break;
        }
        j = k;
        while (k>0 && c->B->sym[k-1] != NS){
          k--;
        }
        for (; k<=j; k++){
/*          printf("inner_ordering[%d]=%d setting dim %d of B, to len %d from len %d\n",
                    i, c->B->inner_ordering[i], k, 1, virt_blk_len_B[k]);*/
          virt_blk_len_B[k] = 1;
          new_sym_B[k] = NS;
        }
      }
      itsr = c->C->rec_tsr;
      for (i=0; i<itsr->order; i++){
        j = c->C->inner_ordering[i];
        for (k=0; k<c->C->order; k++){
          if (c->C->sym[k] == NS) j--;
          if (j<0) break;
        }
        j = k;
        while (k>0 && c->C->sym[k-1] != NS){
          k--;
        }
        for (; k<=j; k++){
/*          printf("inner_ordering[%d]=%d setting dim %d of C, to len %d from len %d\n",
                    i, c->C->inner_ordering[i], k, 1, virt_blk_len_C[k]);*/
          virt_blk_len_C[k] = 1;
          new_sym_C[k] = NS;
        }
      }

    }

    this->is_custom  = c->is_custom;
    this->alpha      = c->alpha;
    if (is_custom){
      this->func = c->func;
    } else {
      this->func = NULL;
    }
    this->order_A    = c->A->order;
    this->idx_map_A  = c->idx_A;
    this->edge_len_A = virt_blk_len_A;
    this->sym_A      = new_sym_A;
    this->order_B    = c->B->order;
    this->idx_map_B  = c->idx_B;
    this->edge_len_B = virt_blk_len_B;
    this->sym_B      = new_sym_B;
    this->order_C    = c->C->order;
    this->idx_map_C  = c->idx_C;
    this->edge_len_C = virt_blk_len_C;
    this->sym_C      = new_sym_C;

  }


  void seq_tsr_spctr::print(){
    int i;
    printf("seq_tsr_spctr:\n");
    for (i=0; i<order_A; i++){
      printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
    }
    for (i=0; i<order_B; i++){
      printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
    }
    for (i=0; i<order_C; i++){
      printf("edge_len_C[%d]=%d\n",i,edge_len_C[i]);
    }
    printf("kernel type is %d\n", krnl_type);
    if (krnl_type>0) printf("inner n = %ld m= %ld k = %ld sz_C=%ld\n",
                            inner_params.n, inner_params.m, inner_params.k, inner_params.sz_C);
  }

  seq_tsr_spctr::seq_tsr_spctr(spctr * other) : spctr(other) {
    seq_tsr_spctr * o = (seq_tsr_spctr*)other;
    alpha = o->alpha;

    order_A    = o->order_A;
    idx_map_A  = o->idx_map_A;
    sym_A      = (int*)CTF_int::alloc(sizeof(int)*order_A);
    memcpy(sym_A, o->sym_A, sizeof(int)*order_A);
    edge_len_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
    memcpy(edge_len_A, o->edge_len_A, sizeof(int)*order_A);

    order_B    = o->order_B;
    idx_map_B  = o->idx_map_B;
    sym_B      = (int*)CTF_int::alloc(sizeof(int)*order_B);
    memcpy(sym_B, o->sym_B, sizeof(int)*order_B);
    edge_len_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
    memcpy(edge_len_B, o->edge_len_B, sizeof(int)*order_B);

    order_C    = o->order_C;
    idx_map_C  = o->idx_map_C;
    sym_C      = (int*)CTF_int::alloc(sizeof(int)*order_C);
    memcpy(sym_C, o->sym_C, sizeof(int)*order_C);
    edge_len_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
    memcpy(edge_len_C, o->edge_len_C, sizeof(int)*order_C);

    krnl_type    = o->krnl_type;
    inner_params = o->inner_params;
    is_custom    = o->is_custom;
    func         = o->func;
  }

  spctr * seq_tsr_spctr::clone() {
    return new seq_tsr_spctr(this);
  }


  int64_t seq_tsr_spctr::spmem_fp(){ return 0; }

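  /* Flop estimate: 2*m*n*k for the folded matrix kernels, otherwise 2 times
     the product of the ranges of all contraction indices (recovered from the
     index maps via inv_idx). Sparse operands scale the count by their nonzero
     fraction; the additional factor of 3 looks like an empirical penalty for
     indexed (indirect) access. */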
  double seq_tsr_spctr::est_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
    int idx_max, * rev_idx_map;
    inv_idx(order_A, idx_map_A,
            order_B, idx_map_B,
            order_C, idx_map_C,
            &idx_max, &rev_idx_map);

    double flops = 2.0;
    if (krnl_type>0) {
      flops *= inner_params.m;
      flops *= inner_params.n;
      flops *= inner_params.k;
    } else {
      for (int i=0; i<idx_max; i++){
        if      (rev_idx_map[3*i+0] != -1) flops*=edge_len_A[rev_idx_map[3*i+0]];
        else if (rev_idx_map[3*i+1] != -1) flops*=edge_len_B[rev_idx_map[3*i+1]];
        else if (rev_idx_map[3*i+2] != -1) flops*=edge_len_C[rev_idx_map[3*i+2]];
      }
    }
    if (is_sparse_A) flops *= nnz_frac_A*3.;
    if (is_sparse_B) flops *= nnz_frac_B*3.;
    //if (is_sparse_C) flops *= nnz_frac_C*10;
    ASSERT(flops >= 0.0);
    CTF_int::cdealloc(rev_idx_map);
    return flops;
  }

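  /* Bandwidth estimate: bytes of A, B, and C touched per call, starting from
     the symmetric-packed block sizes. Folded kernels scale each operand by
     its inner matrix extent; sparse operands are scaled by nonzero fraction
     times 10, which presumably accounts for index and pointer traffic. */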
  uint64_t seq_tsr_spctr::est_membw(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
    uint64_t size_A = sy_packed_size(order_A, edge_len_A, sym_A)*sr_A->el_size;
    uint64_t size_B = sy_packed_size(order_B, edge_len_B, sym_B)*sr_B->el_size;
    uint64_t size_C = sy_packed_size(order_C, edge_len_C, sym_C)*sr_C->el_size;
    if (krnl_type>0) size_A *= ((int64_t)inner_params.m)*inner_params.k;
    if (krnl_type>0) size_B *= ((int64_t)inner_params.n)*inner_params.k;
    if (krnl_type>0) size_C *= ((int64_t)inner_params.m)*inner_params.n;
    if (is_sparse_A) size_A *= nnz_frac_A*10;
    if (is_sparse_B) size_B *= nnz_frac_B*10;
    if (is_sparse_C) size_C *= nnz_frac_C*10;

    return size_A+size_B+size_C;
  }

  LinModel<3> seq_tsr_spctr_cst_off_k0(seq_tsr_spctr_cst_off_k0_init,"seq_tsr_spctr_cst_off_k0");
  LinModel<3> seq_tsr_spctr_cst_off_k1(seq_tsr_spctr_cst_off_k1_init,"seq_tsr_spctr_cst_off_k1");
  LinModel<3> seq_tsr_spctr_cst_off_k2(seq_tsr_spctr_cst_off_k2_init,"seq_tsr_spctr_cst_off_k2");
  LinModel<3> seq_tsr_spctr_off_k0(seq_tsr_spctr_off_k0_init,"seq_tsr_spctr_off_k0");
  LinModel<3> seq_tsr_spctr_off_k1(seq_tsr_spctr_off_k1_init,"seq_tsr_spctr_off_k1");
  LinModel<3> seq_tsr_spctr_off_k2(seq_tsr_spctr_off_k2_init,"seq_tsr_spctr_off_k2");
  LinModel<3> seq_tsr_spctr_cst_k0(seq_tsr_spctr_cst_k0_init,"seq_tsr_spctr_cst_k0");
  LinModel<3> seq_tsr_spctr_cst_k1(seq_tsr_spctr_cst_k1_init,"seq_tsr_spctr_cst_k1");
  LinModel<3> seq_tsr_spctr_cst_k2(seq_tsr_spctr_cst_k2_init,"seq_tsr_spctr_cst_k2");
  LinModel<3> seq_tsr_spctr_cst_k3(seq_tsr_spctr_cst_k3_init,"seq_tsr_spctr_cst_k3");
  LinModel<3> seq_tsr_spctr_cst_k4(seq_tsr_spctr_cst_k4_init,"seq_tsr_spctr_cst_k4");
  LinModel<3> seq_tsr_spctr_k0(seq_tsr_spctr_k0_init,"seq_tsr_spctr_k0");
  LinModel<3> seq_tsr_spctr_k1(seq_tsr_spctr_k1_init,"seq_tsr_spctr_k1");
  LinModel<3> seq_tsr_spctr_k2(seq_tsr_spctr_k2_init,"seq_tsr_spctr_k2");
  LinModel<3> seq_tsr_spctr_k3(seq_tsr_spctr_k3_init,"seq_tsr_spctr_k3");
  LinModel<3> seq_tsr_spctr_k4(seq_tsr_spctr_k4_init,"seq_tsr_spctr_k4");

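  /* The LinModel<3> instances above (one per kernel type, with variants for
     custom elementwise functions and for offload) are linear cost models fit
     over the parameter vector {1, est_membw, est_fp}. est_time_fp selects the
     matching model, and run() feeds it timing observations when TUNE is set. */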
  double seq_tsr_spctr::est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
//    return COST_MEMBW*(size_A+size_B+size_C)+COST_FLOP*flops;
    double ps[] = {1.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)};
    switch (krnl_type){
      case 0:
        if (is_custom){
          if (inner_params.offload)
            return seq_tsr_spctr_cst_off_k0.est_time(ps);
          else
            return seq_tsr_spctr_cst_k0.est_time(ps);
        } else {
          if (inner_params.offload)
            return seq_tsr_spctr_off_k0.est_time(ps);
          else
            return seq_tsr_spctr_k0.est_time(ps);
        }
        break;
      case 1:
        if (is_custom){
          if (inner_params.offload)
            return seq_tsr_spctr_cst_off_k1.est_time(ps);
          else
            return seq_tsr_spctr_cst_k1.est_time(ps);
        } else {
          if (inner_params.offload)
            return seq_tsr_spctr_off_k1.est_time(ps);
          else
            return seq_tsr_spctr_k1.est_time(ps);
        }
        break;
      case 2:
        if (is_custom){
          if (inner_params.offload)
            return seq_tsr_spctr_cst_off_k2.est_time(ps);
          else
            return seq_tsr_spctr_cst_k2.est_time(ps);
        } else {
          if (inner_params.offload)
            return seq_tsr_spctr_off_k2.est_time(ps);
          else
            return seq_tsr_spctr_k2.est_time(ps);
        }
        break;
      case 3:
        if (is_custom){
          return seq_tsr_spctr_cst_k3.est_time(ps);
        } else {
          return seq_tsr_spctr_k3.est_time(ps);
        }
        break;
      case 4:
        if (is_custom){
          return seq_tsr_spctr_cst_k4.est_time(ps);
        } else {
          return seq_tsr_spctr_k4.est_time(ps);
        }
        break;
    }
    assert(0); // won't make it here
    return 0.0;
  }

  double seq_tsr_spctr::est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
    return est_time_fp(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
  }

  void seq_tsr_spctr::run(char * A, int nblk_A, int64_t const * size_blk_A,
                          char * B, int nblk_B, int64_t const * size_blk_B,
                          char * C, int nblk_C, int64_t * size_blk_C,
                          char *& new_C){

    // not-quite-sure

    ASSERT(idx_lyr == 0 && num_lyr == 1);
    ASSERT(nblk_A == 1);
    ASSERT(nblk_B == 1);
    ASSERT(nblk_C == 1);

#ifdef TUNE
    double st_time = MPI_Wtime();
#endif

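    /* For the matrix kernels, beta is folded into C up front: C is zeroed
       when beta is the additive identity and scaled by beta otherwise, so the
       kernels below can be invoked with an effective beta of 1 (mulid). */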
    if (krnl_type > 0){
      if (!sr_C->isequal(beta,sr_C->mulid())){
        if (sr_C->isequal(beta,sr_C->addid())){
          sr_C->set(C, beta, inner_params.sz_C);
        } else {
          sr_C->scal(inner_params.sz_C, beta, C, 1);
        }
      }
    }

    new_C = C;


#ifdef TUNE
    // Generate tps
    double nnz_frac_A = 1.0, nnz_frac_B = 1.0, nnz_frac_C = 1.0;
    if (is_sparse_A){
      nnz_frac_A = size_blk_A[0]/sr_A->pair_size();
      for (int i=0; i<order_A; i++){
        nnz_frac_A = nnz_frac_A / edge_len_A[i];
      }
    }
    if (is_sparse_B){
      nnz_frac_B = size_blk_B[0]/sr_B->pair_size();
      for (int i=0; i<order_B; i++){
        nnz_frac_B = nnz_frac_B / edge_len_B[i];
      }
    }
    if (krnl_type > 0){
      if (is_sparse_A) nnz_frac_A = nnz_frac_A / (inner_params.m*inner_params.k);
      if (is_sparse_B) nnz_frac_B = nnz_frac_B / (inner_params.k*inner_params.n);
      if (is_sparse_C) nnz_frac_C = std::min(1.0,nnz_frac_A*nnz_frac_B*inner_params.k / (inner_params.k*inner_params.n));

    }

    double tps_[] = {0.0, 1.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)};
    // Check if we need to execute this function for the sake of training
    bool bsr = true;
    switch (krnl_type){
      case 0:
        if (is_custom){
          bsr = seq_tsr_spctr_cst_k0.should_observe(tps_);
        } else {
          bsr = seq_tsr_spctr_k0.should_observe(tps_);
        }
        break;
      case 1:
        if (is_custom){
          if (inner_params.offload)
            bsr = seq_tsr_spctr_cst_off_k1.should_observe(tps_);
          else
            bsr = seq_tsr_spctr_cst_k1.should_observe(tps_);
        } else {
          if (inner_params.offload)
            bsr = seq_tsr_spctr_off_k1.should_observe(tps_);
          else
            bsr = seq_tsr_spctr_k1.should_observe(tps_);
        }
        break;
      case 2:
        if (is_custom){
          if (inner_params.offload)
            bsr = seq_tsr_spctr_cst_off_k2.should_observe(tps_);
          else
            bsr = seq_tsr_spctr_cst_k2.should_observe(tps_);
        } else {
          if (inner_params.offload)
            bsr = seq_tsr_spctr_off_k2.should_observe(tps_);
          else
            bsr = seq_tsr_spctr_k2.should_observe(tps_);
        }
        break;
      case 3:
        if (is_custom){
          bsr = seq_tsr_spctr_cst_k3.should_observe(tps_);
        } else {
          bsr = seq_tsr_spctr_k3.should_observe(tps_);
        }
        break;
      case 4:
        if (is_custom){
          // to-be-complete
          // should always observe
          //seq_tsr_spctr_cst_k4.observe(tps);
          bsr = true;
        } else {
          bsr = seq_tsr_spctr_k4.should_observe(tps_);
        }
        break;
    }

    if(!bsr){
      return;
    }
#endif
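    /* Kernel dispatch:
         0: sequential contraction loop, sparse A with dense B and C
         1: COO matrix multiply (coomm), sparse A
         2: CSR matrix multiply (csrmm), sparse A with dense B and C
         3: CSR * CSR into dense C (csrmultd)
         4: CSR * CSR into CSR C (csrmultcsr), which replaces the C buffer */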
    switch (krnl_type){
      case 0:
      {
        ASSERT(size_blk_A[0]%sr_A->pair_size() == 0);

        int64_t nnz_A = size_blk_A[0]/sr_A->pair_size();

        TAU_FSTART(spA_dnB_dnC_seq);
        spA_dnB_dnC_seq_ctr(this->alpha,
                            A,
                            nnz_A,
                            sr_A,
                            order_A,
                            edge_len_A,
                            sym_A,
                            idx_map_A,
                            B,
                            sr_B,
                            order_B,
                            edge_len_B,
                            sym_B,
                            idx_map_B,
                            this->beta,
                            C,
                            sr_C,
                            order_C,
                            edge_len_C,
                            sym_C,
                            idx_map_C,
                            func);
        TAU_FSTOP(spA_dnB_dnC_seq);
      }
      break;

      case 1:
      {
        // Do mm using coordinate format
        TAU_FSTART(COOMM);
        COO_Matrix::coomm(A, sr_A, inner_params.m, inner_params.n, inner_params.k,
                          alpha, B, sr_B, sr_C->mulid(), C, sr_C, func);
        TAU_FSTOP(COOMM);
      }
      break;

      case 2:
      {
        // Do mm using CSR format for A
        TAU_FSTART(CSRMM);
        CSR_Matrix::csrmm(A, sr_A, inner_params.m, inner_params.n, inner_params.k,
                          alpha, B, sr_B, sr_C->mulid(), C, sr_C, func, inner_params.offload);
        TAU_FSTOP(CSRMM);
      }
      break;

      case 3:
      {
        // Do mm using CSR format for A and B
        TAU_FSTART(CSRMULTD);
        CSR_Matrix::csrmultd(A, sr_A, inner_params.m, inner_params.n, inner_params.k,
                             alpha, B, sr_B, sr_C->mulid(), C, sr_C, func, inner_params.offload);
        TAU_FSTOP(CSRMULTD);
      }
      break;

      case 4:
      {
        // Do mm using CSR format for A and B and C
        TAU_FSTART(CSRMULTCSR);
        CSR_Matrix::csrmultcsr(A, sr_A, inner_params.m, inner_params.n, inner_params.k,
                               alpha, B, sr_B, sr_C->mulid(), new_C, sr_C, func, inner_params.offload);
        size_blk_C[0] = ((CSR_Matrix)new_C).size();
        //printf("new size = %ld nnz = %ld\n",size_blk_C[0],((CSR_Matrix)new_C).nnz());
        TAU_FSTOP(CSRMULTCSR);
      }
      break;
    }
#ifdef TUNE
    nnz_frac_A = 1.0;
    nnz_frac_B = 1.0;
    nnz_frac_C = 1.0;
    if (is_sparse_A){
      nnz_frac_A = size_blk_A[0]/sr_A->pair_size();
      for (int i=0; i<order_A; i++){
        nnz_frac_A = nnz_frac_A / edge_len_A[i];
      }
    }
    if (is_sparse_B){
      nnz_frac_B = size_blk_B[0]/sr_B->pair_size();
      for (int i=0; i<order_B; i++){
        nnz_frac_B = nnz_frac_B / edge_len_B[i];
      }
    }
    if (krnl_type > 0){
      if (is_sparse_A) nnz_frac_A = nnz_frac_A / (inner_params.m*inner_params.k);
      if (is_sparse_B) nnz_frac_B = nnz_frac_B / (inner_params.k*inner_params.n);
      if (is_sparse_C) nnz_frac_C = std::min(1.0,nnz_frac_A*nnz_frac_B*inner_params.k / (inner_params.k*inner_params.n));

    }

    double exe_time = MPI_Wtime() - st_time;
    double tps[] = {exe_time, 1.0, (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C), est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)};
    switch (krnl_type){
      case 0:
        if (is_custom){
          seq_tsr_spctr_cst_k0.observe(tps);
        } else {
          seq_tsr_spctr_k0.observe(tps);
        }
        break;
      case 1:
        if (is_custom){
          if (inner_params.offload)
            seq_tsr_spctr_cst_off_k1.observe(tps);
          else
            seq_tsr_spctr_cst_k1.observe(tps);
        } else {
          if (inner_params.offload)
            seq_tsr_spctr_off_k1.observe(tps);
          else
            seq_tsr_spctr_k1.observe(tps);
        }
        break;
      case 2:
        if (is_custom){
          if (inner_params.offload)
            seq_tsr_spctr_cst_off_k2.observe(tps);
          else
            seq_tsr_spctr_cst_k2.observe(tps);
        } else {
          if (inner_params.offload)
            seq_tsr_spctr_off_k2.observe(tps);
          else
            seq_tsr_spctr_k2.observe(tps);
        }
        break;
      case 3:
        if (is_custom){
          seq_tsr_spctr_cst_k3.observe(tps);
        } else {
          seq_tsr_spctr_k3.observe(tps);
        }
        break;
      case 4:
        if (is_custom){
          // to-be-complete
          // should always observe
          seq_tsr_spctr_cst_k4.observe(tps);
        } else {
          seq_tsr_spctr_k4.observe(tps);
        }
        break;
    }

#endif
  }


  spctr_virt::spctr_virt(contraction const * c,
                         int                 num_tot,
                         int *               virt_dim,
                         int64_t             vrt_sz_A,
                         int64_t             vrt_sz_B,
                         int64_t             vrt_sz_C)
     : spctr(c) {
    this->num_dim   = num_tot;
    this->virt_dim  = virt_dim;
    this->order_A   = c->A->order;
    this->blk_sz_A  = vrt_sz_A;
    this->idx_map_A = c->idx_A;
    this->order_B   = c->B->order;
    this->blk_sz_B  = vrt_sz_B;
    this->idx_map_B = c->idx_B;
    this->order_C   = c->C->order;
    this->blk_sz_C  = vrt_sz_C;
    this->idx_map_C = c->idx_C;
  }


  spctr_virt::~spctr_virt() {
    CTF_int::cdealloc(virt_dim);
    delete rec_ctr;
  }

  spctr_virt::spctr_virt(spctr * other) : spctr(other) {
    spctr_virt * o = (spctr_virt*)other;
    rec_ctr  = o->rec_ctr->clone();
    num_dim  = o->num_dim;
    virt_dim = (int*)CTF_int::alloc(sizeof(int)*num_dim);
    memcpy(virt_dim, o->virt_dim, sizeof(int)*num_dim);

    order_A   = o->order_A;
    blk_sz_A  = o->blk_sz_A;
    idx_map_A = o->idx_map_A;

    order_B   = o->order_B;
    blk_sz_B  = o->blk_sz_B;
    idx_map_B = o->idx_map_B;

    order_C   = o->order_C;
    blk_sz_C  = o->blk_sz_C;
    idx_map_C = o->idx_map_C;
  }

  spctr * spctr_virt::clone() {
    return new spctr_virt(this);
  }

  void spctr_virt::print() {
    int i;
    printf("spctr_virt:\n");
    printf("blk_sz_A = %ld, blk_sz_B = %ld, blk_sz_C = %ld\n",
           blk_sz_A, blk_sz_B, blk_sz_C);
    for (i=0; i<num_dim; i++){
      printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
    }
    rec_ctr->print();
  }


  double spctr_virt::est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
    /* FIXME: for now treat flops like comm, later make proper cost */
    int64_t nblk = 1;
    for (int dim=0; dim<num_dim; dim++){
      nblk *= virt_dim[dim];
    }
    return nblk*rec_ctr->est_time_rec(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
  }


  #define VIRT_NTD 1

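  /* Workspace accounting for the buffer carved up at the top of run():
     VIRT_NTD*num_dim per-thread block counters, followed by lda_A/B/C
     (order_A+order_B+order_C entries) and ilda_A/B/C (3*num_dim entries). */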
  int64_t spctr_virt::spmem_fp(){
    return (order_A+order_B+order_C+(3+VIRT_NTD)*num_dim)*sizeof(int);
  }

  int64_t spctr_virt::spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
    return rec_ctr->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C) + spmem_fp();
  }

  void spctr_virt::run(char * A, int nblk_A, int64_t const * size_blk_A,
                       char * B, int nblk_B, int64_t const * size_blk_B,
                       char * C, int nblk_C, int64_t * size_blk_C,
                       char *& new_C){
    TAU_FSTART(spctr_virt);
    int * idx_arr, * tidx_arr, * lda_A, * lda_B, * lda_C, * beta_arr;
    int * ilda_A, * ilda_B, * ilda_C;
    int64_t i, off_A, off_B, off_C;
    int nb_A, nb_B, nb_C, alloced, ret;

    /*if (this->buffer != NULL){
      alloced = 0;
      idx_arr = (int*)this->buffer;
    } else {*/
      alloced = 1;
      ret = CTF_int::alloc_ptr(spmem_fp(), (void**)&idx_arr);
      ASSERT(ret==0);
//    }


    lda_A  = idx_arr + VIRT_NTD*num_dim;
    lda_B  = lda_A + order_A;
    lda_C  = lda_B + order_B;
    ilda_A = lda_C + order_C;
    ilda_B = ilda_A + num_dim;
    ilda_C = ilda_B + num_dim;

  #define SET_LDA_X(__X)                                  \
    do {                                                  \
      nb_##__X = 1;                                       \
      for (i=0; i<order_##__X; i++){                      \
        lda_##__X[i] = nb_##__X;                          \
        nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]];   \
      }                                                   \
      memset(ilda_##__X, 0, num_dim*sizeof(int));         \
      for (i=0; i<order_##__X; i++){                      \
        ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];     \
      }                                                   \
    } while (0)
    SET_LDA_X(A);
    SET_LDA_X(B);
    SET_LDA_X(C);
  #undef SET_LDA_X
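  /* lda_X[i] is the stride of tensor dimension i of X in X's virtual block
     grid, and ilda_X[d] accumulates those strides per global index d, so one
     step in index d moves X's block offset by ilda_X[d]. Illustrative values
     (not from the source): order_A = 2, idx_map_A = {0,2}, virt_dim = {2,3,4}
     yields lda_A = {1,2}, nb_A = 8, ilda_A = {1,0,2}. */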

    /* dynamically determined size */
    beta_arr = (int*)CTF_int::alloc(sizeof(int)*nb_C);
    memset(beta_arr, 0, nb_C*sizeof(int));

    int64_t * sp_offsets_A = NULL;
    if (is_sparse_A){
      sp_offsets_A = (int64_t*)alloc(sizeof(int64_t)*nb_A);
      sp_offsets_A[0] = 0;
      for (int i=1; i<nb_A; i++){
        sp_offsets_A[i] = sp_offsets_A[i-1]+size_blk_A[i-1];
      }
    }
    int64_t * sp_offsets_B = NULL;
    if (is_sparse_B){
      sp_offsets_B = (int64_t*)alloc(sizeof(int64_t)*nb_B);
      sp_offsets_B[0] = 0;
      for (int i=1; i<nb_B; i++){
        sp_offsets_B[i] = sp_offsets_B[i-1]+size_blk_B[i-1];
      }
    }

    int64_t * sp_offsets_C = NULL;
    int64_t * new_sp_szs_C = NULL;
    char ** buckets_C = NULL;
    if (is_sparse_C){
      sp_offsets_C = (int64_t*)alloc(sizeof(int64_t)*nb_C);
      new_sp_szs_C = size_blk_C; //(int64_t*)alloc(sizeof(int64_t)*nb_C);
//      memcpy(new_sp_szs_C, blk_sz_C, sizeof(int64_t)*nb_C);
      buckets_C = (char**)alloc(sizeof(char*)*nb_C);
      for (int i=0; i<nb_C; i++){
        if (i==0)
          sp_offsets_C[0] = 0;
        else
          sp_offsets_C[i] = sp_offsets_C[i-1]+size_blk_C[i-1];
        buckets_C[i] = C + sp_offsets_C[i];
      }
    }

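    /* For sparse C, buckets_C[i] initially points at block i inside C; once a
       recursive call returns a freshly allocated block, the bucket pointer is
       swapped and the previous buffer freed. beta_arr[i] marks blocks already
       written, so repeat visits accumulate with beta = 1 rather than applying
       the caller's beta again. */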
  #if (VIRT_NTD>1)
//  #pragma omp parallel private(off_A,off_B,off_C,tidx_arr,i)
  #endif
    {
      int tid, ntd, start_off, end_off;
  #if (VIRT_NTD>1)
      tid = omp_get_thread_num();
      ntd = MIN(VIRT_NTD, omp_get_num_threads());
  #else
      tid = 0;
      ntd = 1;
  #endif
  #if (VIRT_NTD>1)
      DPRINTF(2,"%d/%d %d %d\n",tid,ntd,VIRT_NTD,omp_get_num_threads());
  #endif
      if (tid < ntd){
        tidx_arr = idx_arr + tid*num_dim;
        memset(tidx_arr, 0, num_dim*sizeof(int));

        start_off = (nb_C/ntd)*tid;
        if (tid < nb_C%ntd){
          start_off += tid;
          end_off = start_off + nb_C/ntd + 1;
        } else {
          start_off += nb_C%ntd;
          end_off = start_off + nb_C/ntd;
        }

        spctr * tid_rec_ctr;
        if (tid > 0)
          tid_rec_ctr = rec_ctr->clone();
        else
          tid_rec_ctr = rec_ctr;

        tid_rec_ctr->num_lyr = this->num_lyr;
        tid_rec_ctr->idx_lyr = this->idx_lyr;

        off_A = 0, off_B = 0, off_C = 0;
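        /* Odometer iteration over the virtual block grid: tidx_arr keeps one
           counter per contraction index, and the block offsets off_A/off_B/
           off_C are updated incrementally through the ilda strides as each
           counter increments and carries. The loop ends when all counters
           wrap back to zero. */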
        for (;;){
          if (off_C >= start_off && off_C < end_off) {
            char * rec_A, * rec_B, * rec_C;
            if (is_sparse_A){
              rec_A = A + sp_offsets_A[off_A];
            } else
              rec_A = A + off_A*blk_sz_A*sr_A->el_size;
            if (is_sparse_B){
              rec_B = B + sp_offsets_B[off_B];
            } else
              rec_B = B + off_B*blk_sz_B*sr_B->el_size;
            if (is_sparse_C){
              rec_C = buckets_C[off_C];
            } else
              rec_C = C + off_C*blk_sz_C*sr_C->el_size;
            if (beta_arr[off_C]>0)
              rec_ctr->beta = sr_C->mulid();
            else
              rec_ctr->beta = this->beta;
            bool do_dealloc = beta_arr[off_C] > 0;
            beta_arr[off_C] = 1;
            char * pass_C = is_sparse_C ? buckets_C[off_C] : rec_C;
            tid_rec_ctr->run(rec_A, 1, size_blk_A+off_A,
                             rec_B, 1, size_blk_B+off_B,
                             rec_C, 1, new_sp_szs_C+off_C,
                             pass_C);
            if (is_sparse_C){
              if (do_dealloc) cdealloc(buckets_C[off_C]);
              buckets_C[off_C] = pass_C;
            }
          }


          for (i=0; i<num_dim; i++){
            off_A -= ilda_A[i]*tidx_arr[i];
            off_B -= ilda_B[i]*tidx_arr[i];
            off_C -= ilda_C[i]*tidx_arr[i];
            tidx_arr[i]++;
            if (tidx_arr[i] >= virt_dim[i])
              tidx_arr[i] = 0;
            off_A += ilda_A[i]*tidx_arr[i];
            off_B += ilda_B[i]*tidx_arr[i];
            off_C += ilda_C[i]*tidx_arr[i];
            if (tidx_arr[i] != 0) break;
          }
#ifdef MICROBENCH
          break;
#else
          if (i==num_dim) break;
#endif
        }
        if (tid > 0){
          delete tid_rec_ctr;
        }
      }
    }
    if (is_sparse_C){
      int64_t new_size_C = 0;
      for (int i=0; i<nb_C; i++){
        new_size_C += new_sp_szs_C[i];
      }
      new_C = (char*)alloc(new_size_C);
      int64_t pfx = 0;
      for (int i=0; i<nb_C; i++){
        memcpy(new_C+pfx, buckets_C[i], new_sp_szs_C[i]);
        pfx += new_sp_szs_C[i];
        if (beta_arr[i] > 0) cdealloc(buckets_C[i]);
      }
      //FIXME: how to pass C back generally
      //cdealloc(C);
      cdealloc(buckets_C);
    } else new_C = C;
    if (is_sparse_A) cdealloc(sp_offsets_A);
    if (is_sparse_B) cdealloc(sp_offsets_B);
    if (is_sparse_C) cdealloc(sp_offsets_C);
    if (alloced){
      CTF_int::cdealloc(idx_arr);
    }
    CTF_int::cdealloc(beta_arr);
    TAU_FSTOP(spctr_virt);
  }

  spctr_pin_keys::~spctr_pin_keys(){
    delete rec_ctr;
    cdealloc(divisor);
    cdealloc(virt_dim);
    cdealloc(phys_rank);
  }

  spctr_pin_keys::spctr_pin_keys(spctr * other) : spctr(other) {
    spctr_pin_keys * o = (spctr_pin_keys*)other;

    rec_ctr   = o->rec_ctr->clone();
    AxBxC     = o->AxBxC;
    order     = o->order;
    lens      = o->lens;
    divisor   = (int*)alloc(sizeof(int)*order);
    phys_rank = (int*)alloc(sizeof(int)*order);
    virt_dim  = (int*)alloc(sizeof(int)*order);
    memcpy(divisor, o->divisor, sizeof(int)*order);
    memcpy(phys_rank, o->phys_rank, sizeof(int)*order);
    memcpy(virt_dim, o->virt_dim, sizeof(int)*order);
  }

  spctr * spctr_pin_keys::clone() {
    return new spctr_pin_keys(this);
  }

  void spctr_pin_keys::print(){
    printf("spctr_pin_keys:\n");
    switch (AxBxC){
      case 0:
        printf("transforming global keys of A to local keys\n");
        break;
      case 1:
        printf("transforming global keys of B to local keys\n");
        break;
      case 2:
        printf("transforming global keys of C to local keys\n");
        break;
    }
    rec_ctr->print();
  }

  int64_t spctr_pin_keys::spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
    int64_t mem_usage = 0;
    switch (AxBxC){
      case 0:
      {
        mem_usage += dns_blk_sz*nnz_frac_A*sr_A->pair_size();
      }
      case 1:
      {
        mem_usage += dns_blk_sz*nnz_frac_B*sr_B->pair_size();
      }
      case 2:
      {
        mem_usage += dns_blk_sz*nnz_frac_C*sr_C->pair_size();
      }
    }
    return mem_usage;
  }

  int64_t spctr_pin_keys::spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
    return spmem_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C) + rec_ctr->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C);
  }

  spctr_pin_keys::spctr_pin_keys(contraction const * s, int AxBxC_) : spctr(s) {
    tensor * X = NULL;
    AxBxC = AxBxC_;
    switch (AxBxC){
      case 0:
        X = s->A;
        dns_blk_sz = s->A->size;
        break;
      case 1:
        X = s->B;
        dns_blk_sz = s->B->size;
        break;
      case 2:
        X = s->C;
        dns_blk_sz = s->C->size;
        break;
    }

    order = X->order;
    lens  = X->lens;

    divisor   = (int*)alloc(sizeof(int)*order);
    phys_rank = (int*)alloc(sizeof(int)*order);
    virt_dim  = (int*)alloc(sizeof(int)*order);

    for (int i=0; i<order; i++){
      divisor[i]   = X->edge_map[i].calc_phase();
      phys_rank[i] = X->edge_map[i].calc_phys_rank(X->topo);
      virt_dim[i]  = divisor[i]/X->edge_map[i].calc_phys_phase();
    }
  }

  LinModel<2> pin_keys_mdl(pin_keys_mdl_init,"pin_keys_mdl");

  double spctr_pin_keys::est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
    switch (AxBxC){
      case 0:
      {
        double ps[] = {1.0, dns_blk_sz*nnz_frac_A};
        return pin_keys_mdl.est_time(ps);
      }
      case 1:
      {
        double ps[] = {1.0, dns_blk_sz*nnz_frac_B};
        return pin_keys_mdl.est_time(ps);
      }
      case 2:
      {
        double ps[] = {1.0, dns_blk_sz*nnz_frac_C};
        return 2.*pin_keys_mdl.est_time(ps);
      }
    }
    return 0;
  }

  double spctr_pin_keys::est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
    return est_time_fp(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C)+rec_ctr->est_time_rec(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
  }

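  /* run() below copies the pairs of the pinned operand into scratch space and
     rewrites their global keys as local keys (pin) given the processor grid
     divisors before recursing; for the output (AxBxC == 2) the keys of new_C
     are translated back afterwards (depin). Both phases record observations
     in pin_keys_mdl. */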
  void spctr_pin_keys::run(char * A, int nblk_A, int64_t const * size_blk_A,
                           char * B, int nblk_B, int64_t const * size_blk_B,
                           char * C, int nblk_C, int64_t * size_blk_C,
                           char *& new_C){
    TAU_FSTART(spctr_pin_keys);
    char * X = NULL;
    algstrct const * sr = NULL;
    int64_t nnz = 0;
    switch (AxBxC){
      case 0:
        X = A;
        sr = this->sr_A;
        for (int i=0; i<nblk_A; i++){
          ASSERT(size_blk_A[i]%sr_A->pair_size() == 0);
          nnz += size_blk_A[i]/sr_A->pair_size();
        }
        break;
      case 1:
        X = B;
        sr = this->sr_B;
        for (int i=0; i<nblk_B; i++){
          ASSERT(size_blk_B[i]%sr_B->pair_size() == 0);
          nnz += size_blk_B[i]/sr_B->pair_size();
        }
        break;
      case 2:
        X = C;
        sr = this->sr_C;
        for (int i=0; i<nblk_C; i++){
          ASSERT(size_blk_C[i]%sr_C->pair_size() == 0);
          nnz += size_blk_C[i]/sr_C->pair_size();
        }
        break;
    }

/*    int * div_lens;
    alloc_ptr(order*sizeof(int), (void**)&div_lens);
    for (int j=0; j<order; j++){
      div_lens[j] = (lens[j]/divisor[j] + (lens[j]%divisor[j] > 0));
//      printf("lens[%d] = %d divisor[%d] = %d div_lens[%d] = %d\n",j,lens[j],j,divisor[j],j,div_lens[j]);
    }*/
    double st_time = MPI_Wtime();
    char * nA, * nB, * nC;
    nA = A; nB = B; nC = C;
    char * nX = NULL;
    switch (AxBxC){
      case 0:
        mst_alloc_ptr(sr->pair_size()*nnz, (void**)&nX);
        memcpy(nX, X, sr->pair_size()*nnz);
        nA = nX;
        break;
      case 1:
        mst_alloc_ptr(sr->pair_size()*nnz, (void**)&nX);
        memcpy(nX, X, sr->pair_size()*nnz);
        nB = nX;
        break;
    }
    ConstPairIterator pi(sr, X);
    PairIterator pi_new(sr, nX);

    pi.pin(nnz, order, lens, divisor, pi_new);

    double exe_time = MPI_Wtime()-st_time;
    double tps[] = {exe_time, 1.0, (double)nnz};
    pin_keys_mdl.observe(tps);

    TAU_FSTOP(spctr_pin_keys);
    rec_ctr->run(nA, nblk_A, size_blk_A,
                 nB, nblk_B, size_blk_B,
                 nC, nblk_C, size_blk_C,
                 new_C);
    TAU_FSTART(spctr_pin_keys);

    switch (AxBxC){
      case 0:
      case 1:
        cdealloc(nX);
        break;
      case 2:
        st_time = MPI_Wtime();
        int64_t new_nnz_C=0;
        for (int i=0; i<nblk_C; i++){
          ASSERT(size_blk_C[i] % sr_C->pair_size() == 0);
          new_nnz_C += size_blk_C[i] / sr_C->pair_size();
        }
        depin(sr_C, order, lens, divisor, nblk_C, virt_dim, phys_rank, new_C, new_nnz_C, size_blk_C, new_C, true);
        double exe_time = MPI_Wtime()-st_time;
        double tps[] = {exe_time, 1.0, (double)nnz};
        pin_keys_mdl.observe(tps);
        break;
    }
    TAU_FSTOP(spctr_pin_keys);
  }
}