Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
ctr_tsr.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "../shared/util.h"
4 #include "ctr_comm.h"
5 #include "ctr_tsr.h"
6 #include "sym_seq_ctr.h"
7 #include "contraction.h"
8 #include "../tensor/untyped_tensor.h"
9 #include "../shared/model.h"
10 #ifdef USE_OMP
11 #include <omp.h>
12 #endif
13 
14 namespace CTF_int {
15 
16  #ifndef VIRT_NTD
17  #define VIRT_NTD 1
18  #endif
19 
21  int num_tot,
22  int * virt_dim,
23  int64_t vrt_sz_A,
24  int64_t vrt_sz_B,
25  int64_t vrt_sz_C)
26  : ctr(c) {
27  this->num_dim = num_tot;
28  this->virt_dim = virt_dim;
29  this->order_A = c->A->order;
30  this->blk_sz_A = vrt_sz_A;
31  this->idx_map_A = c->idx_A;
32  this->order_B = c->B->order;
33  this->blk_sz_B = vrt_sz_B;
34  this->idx_map_B = c->idx_B;
35  this->order_C = c->C->order;
36  this->blk_sz_C = vrt_sz_C;
37  this->idx_map_C = c->idx_C;
38  }
39 
40 
43  delete rec_ctr;
44  }
45 
46  ctr_virt::ctr_virt(ctr * other) : ctr(other) {
47  ctr_virt * o = (ctr_virt*)other;
48  rec_ctr = o->rec_ctr->clone();
49  num_dim = o->num_dim;
50  virt_dim = (int*)CTF_int::alloc(sizeof(int)*num_dim);
51  memcpy(virt_dim, o->virt_dim, sizeof(int)*num_dim);
52 
53  order_A = o->order_A;
54  blk_sz_A = o->blk_sz_A;
55  idx_map_A = o->idx_map_A;
56 
57  order_B = o->order_B;
58  blk_sz_B = o->blk_sz_B;
59  idx_map_B = o->idx_map_B;
60 
61  order_C = o->order_C;
62  blk_sz_C = o->blk_sz_C;
63  idx_map_C = o->idx_map_C;
64  }
65 
67  return new ctr_virt(this);
68  }
69 
70  void ctr_virt::print() {
71  int i;
72  printf("ctr_virt:\n");
73  printf("blk_sz_A = %ld, blk_sz_B = %ld, blk_sz_C = %ld\n",
75  for (i=0; i<num_dim; i++){
76  printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
77  }
78  rec_ctr->print();
79  }
80 
81 
82  double ctr_virt::est_time_rec(int nlyr) {
83  /* FIXME: for now treat flops like comm, later make proper cost */
84  int64_t nvirt = 1;
85  for (int dim=0; dim<num_dim; dim++){
86  nvirt *= virt_dim[dim];
87  }
88  return nvirt*rec_ctr->est_time_rec(nlyr);
89  }
90 
91 
92  int64_t ctr_virt::mem_fp(){
93  return (order_A+order_B+order_C+(3+VIRT_NTD)*num_dim)*sizeof(int);
94  }
95 
96  int64_t ctr_virt::mem_rec() {
97  return rec_ctr->mem_rec() + mem_fp();
98  }
99 
100 
101  void ctr_virt::run(char * A, char * B, char * C){
103  int * idx_arr, * tidx_arr, * lda_A, * lda_B, * lda_C, * beta_arr;
104  int * ilda_A, * ilda_B, * ilda_C;
105  int64_t i, off_A, off_B, off_C;
106  int nb_A, nb_B, nb_C, alloced, ret;
107 
108  /*if (this->buffer != NULL){
109  alloced = 0;
110  idx_arr = (int*)this->buffer;
111  } else {*/
112  alloced = 1;
113  ret = CTF_int::alloc_ptr(mem_fp(), (void**)&idx_arr);
114  ASSERT(ret==0);
115 // }
116 
117 
118  lda_A = idx_arr + VIRT_NTD*num_dim;
119  lda_B = lda_A + order_A;
120  lda_C = lda_B + order_B;
121  ilda_A = lda_C + order_C;
122  ilda_B = ilda_A + num_dim;
123  ilda_C = ilda_B + num_dim;
124 
125  #define SET_LDA_X(__X) \
126  do { \
127  nb_##__X = 1; \
128  for (i=0; i<order_##__X; i++){ \
129  lda_##__X[i] = nb_##__X; \
130  nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]]; \
131  } \
132  memset(ilda_##__X, 0, num_dim*sizeof(int)); \
133  for (i=0; i<order_##__X; i++){ \
134  ilda_##__X[idx_map_##__X[i]] += lda_##__X[i]; \
135  } \
136  } while (0)
137  SET_LDA_X(A);
138  SET_LDA_X(B);
139  SET_LDA_X(C);
140  #undef SET_LDA_X
141 
142  /* dynammically determined size */
143  beta_arr = (int*)CTF_int::alloc(sizeof(int)*nb_C);
144  memset(beta_arr, 0, nb_C*sizeof(int));
145  #if (VIRT_NTD>1)
146  #pragma omp parallel private(off_A,off_B,off_C,tidx_arr,i)
147  #endif
148  {
149  int tid, ntd, start_off, end_off;
150  #if (VIRT_NTD>1)
151  tid = omp_get_thread_num();
152  ntd = MIN(VIRT_NTD, omp_get_num_threads());
153  #else
154  tid = 0;
155  ntd = 1;
156  #endif
157  #if (VIRT_NTD>1)
158  DPRINTF(2,"%d/%d %d %d\n",tid,ntd,VIRT_NTD,omp_get_num_threads());
159  #endif
160  if (tid < ntd){
161  tidx_arr = idx_arr + tid*num_dim;
162  memset(tidx_arr, 0, num_dim*sizeof(int));
163 
164  start_off = (nb_C/ntd)*tid;
165  if (tid < nb_C%ntd){
166  start_off += tid;
167  end_off = start_off + nb_C/ntd + 1;
168  } else {
169  start_off += nb_C%ntd;
170  end_off = start_off + nb_C/ntd;
171  }
172 
173  ctr * tid_rec_ctr;
174  if (tid > 0)
175  tid_rec_ctr = rec_ctr->clone();
176  else
177  tid_rec_ctr = rec_ctr;
178 
179  tid_rec_ctr->num_lyr = this->num_lyr;
180  tid_rec_ctr->idx_lyr = this->idx_lyr;
181 
182  off_A = 0, off_B = 0, off_C = 0;
183  for (;;){
184  if (off_C >= start_off && off_C < end_off) {
185  if (beta_arr[off_C]>0)
186  rec_ctr->beta = sr_C->mulid();
187  else
188  rec_ctr->beta = this->beta;
189  beta_arr[off_C] = 1;
190  tid_rec_ctr->run(
191  A + off_A*blk_sz_A*sr_A->el_size,
192  B + off_B*blk_sz_B*sr_A->el_size,
193  C + off_C*blk_sz_C*sr_A->el_size);
194  }
195 
196  for (i=0; i<num_dim; i++){
197  off_A -= ilda_A[i]*tidx_arr[i];
198  off_B -= ilda_B[i]*tidx_arr[i];
199  off_C -= ilda_C[i]*tidx_arr[i];
200  tidx_arr[i]++;
201  if (tidx_arr[i] >= virt_dim[i])
202  tidx_arr[i] = 0;
203  off_A += ilda_A[i]*tidx_arr[i];
204  off_B += ilda_B[i]*tidx_arr[i];
205  off_C += ilda_C[i]*tidx_arr[i];
206  if (tidx_arr[i] != 0) break;
207  }
208 #ifdef MICROBENCH
209  break;
210 #else
211  if (i==num_dim) break;
212 #endif
213  }
214  if (tid > 0){
215  delete tid_rec_ctr;
216  }
217  }
218  }
219  if (alloced){
220  CTF_int::cdealloc(idx_arr);
221  }
222  CTF_int::cdealloc(beta_arr);
224  }
225 
226 
227 
229  bool is_inner,
230  iparam const * inner_params,
231  int * virt_blk_len_A,
232  int * virt_blk_len_B,
233  int * virt_blk_len_C,
234  int64_t vrt_sz_C)
235  : ctr(c) {
236 
237  int i, j, k;
238  int * new_sym_A, * new_sym_B, * new_sym_C;
239  CTF_int::alloc_ptr(sizeof(int)*c->A->order, (void**)&new_sym_A);
240  memcpy(new_sym_A, c->A->sym, sizeof(int)*c->A->order);
241  CTF_int::alloc_ptr(sizeof(int)*c->B->order, (void**)&new_sym_B);
242  memcpy(new_sym_B, c->B->sym, sizeof(int)*c->B->order);
243  CTF_int::alloc_ptr(sizeof(int)*c->C->order, (void**)&new_sym_C);
244  memcpy(new_sym_C, c->C->sym, sizeof(int)*c->C->order);
245 
246  this->inner_params = *inner_params;
247  if (!is_inner){
248  this->is_inner = 0;
249  } else if (is_inner == 1) {
250  if (c->A->wrld->cdt.rank == 0){
251  DPRINTF(3,"Folded tensor l=%ld n=%ld m=%ld k=%ld\n", inner_params->l, inner_params->n,
252  inner_params->m, inner_params->k);
253  }
254 
255  this->is_inner = 1;
256  this->inner_params.sz_C = vrt_sz_C;
257  tensor * itsr;
258  itsr = c->A->rec_tsr;
259  for (i=0; i<itsr->order; i++){
260  j = c->A->inner_ordering[i];
261  for (k=0; k<c->A->order; k++){
262  if (c->A->sym[k] == NS) j--;
263  if (j<0) break;
264  }
265  j = k;
266  while (k>0 && c->A->sym[k-1] != NS){
267  k--;
268  }
269  for (; k<=j; k++){
270  /* printf("inner_ordering[%d]=%d setting dim %d of A, to len %d from len %d\n",
271  i, c->A->inner_ordering[i], k, 1, virt_blk_len_A[k]);*/
272  virt_blk_len_A[k] = 1;
273  new_sym_A[k] = NS;
274  }
275  }
276  itsr = c->B->rec_tsr;
277  for (i=0; i<itsr->order; i++){
278  j = c->B->inner_ordering[i];
279  for (k=0; k<c->B->order; k++){
280  if (c->B->sym[k] == NS) j--;
281  if (j<0) break;
282  }
283  j = k;
284  while (k>0 && c->B->sym[k-1] != NS){
285  k--;
286  }
287  for (; k<=j; k++){
288  /* printf("inner_ordering[%d]=%d setting dim %d of B, to len %d from len %d\n",
289  i, c->B->inner_ordering[i], k, 1, virt_blk_len_B[k]);*/
290  virt_blk_len_B[k] = 1;
291  new_sym_B[k] = NS;
292  }
293  }
294  itsr = c->C->rec_tsr;
295  for (i=0; i<itsr->order; i++){
296  j = c->C->inner_ordering[i];
297  for (k=0; k<c->C->order; k++){
298  if (c->C->sym[k] == NS) j--;
299  if (j<0) break;
300  }
301  j = k;
302  while (k>0 && c->C->sym[k-1] != NS){
303  k--;
304  }
305  for (; k<=j; k++){
306  /* printf("inner_ordering[%d]=%d setting dim %d of C, to len %d from len %d\n",
307  i, c->C->inner_ordering[i], k, 1, virt_blk_len_C[k]);*/
308  virt_blk_len_C[k] = 1;
309  new_sym_C[k] = NS;
310  }
311  }
312  }
313  this->is_custom = c->is_custom;
314  this->alpha = c->alpha;
315  if (is_custom){
316  this->func = c->func;
317  } else {
318  this->func = NULL;
319  }
320  this->order_A = c->A->order;
321  this->idx_map_A = c->idx_A;
322  this->edge_len_A = virt_blk_len_A;
323  this->sym_A = new_sym_A;
324  this->order_B = c->B->order;
325  this->idx_map_B = c->idx_B;
326  this->edge_len_B = virt_blk_len_B;
327  this->sym_B = new_sym_B;
328  this->order_C = c->C->order;
329  this->idx_map_C = c->idx_C;
330  this->edge_len_C = virt_blk_len_C;
331  this->sym_C = new_sym_C;
332 
333 
334  }
335 
336 
338  int i;
339  printf("seq_tsr_ctr:\n");
340  for (i=0; i<order_A; i++){
341  printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
342  }
343  for (i=0; i<order_B; i++){
344  printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
345  }
346  for (i=0; i<order_C; i++){
347  printf("edge_len_C[%d]=%d\n",i,edge_len_C[i]);
348  }
349  printf("is inner = %d\n", is_inner);
350  if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld\n",
352  }
353 
354  seq_tsr_ctr::seq_tsr_ctr(ctr * other) : ctr(other) {
355  seq_tsr_ctr * o = (seq_tsr_ctr*)other;
356  alpha = o->alpha;
357 
358  order_A = o->order_A;
359  idx_map_A = o->idx_map_A;
360  sym_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
361  memcpy(sym_A, o->sym_A, sizeof(int)*order_A);
362  edge_len_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
363  memcpy(edge_len_A, o->edge_len_A, sizeof(int)*order_A);
364 
365  order_B = o->order_B;
366  idx_map_B = o->idx_map_B;
367  sym_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
368  memcpy(sym_B, o->sym_B, sizeof(int)*order_B);
369  edge_len_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
370  memcpy(edge_len_B, o->edge_len_B, sizeof(int)*order_B);
371 
372  order_C = o->order_C;
373  idx_map_C = o->idx_map_C;
374  sym_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
375  memcpy(sym_C, o->sym_C, sizeof(int)*order_C);
376  edge_len_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
377  memcpy(edge_len_C, o->edge_len_C, sizeof(int)*order_C);
378 
379  is_inner = o->is_inner;
381  is_custom = o->is_custom;
382  func = o->func;
383  }
384 
386  return new seq_tsr_ctr(this);
387  }
388 
389  int64_t seq_tsr_ctr::mem_fp(){ return 0; }
390 
391  //double seq_tsr_ctr_mig[] = {1e-6, 9.30e-11, 5.61e-10};
398 
400  uint64_t size_A = sy_packed_size(order_A, edge_len_A, sym_A)*sr_A->el_size;
401  uint64_t size_B = sy_packed_size(order_B, edge_len_B, sym_B)*sr_B->el_size;
402  uint64_t size_C = sy_packed_size(order_C, edge_len_C, sym_C)*sr_C->el_size;
403  if (is_inner) size_A *= inner_params.m*inner_params.k;
404  if (is_inner) size_B *= inner_params.n*inner_params.k;
405  if (is_inner) size_C *= inner_params.m*inner_params.n;
406 
407  ASSERT(size_A > 0);
408  ASSERT(size_B > 0);
409  ASSERT(size_C > 0);
410  return size_A+size_B+size_C;
411  }
412 
414  int idx_max, * rev_idx_map;
418  &idx_max, &rev_idx_map);
419 
420  double flops = 2.0;
421  if (is_inner) {
422  flops *= inner_params.m;
423  flops *= inner_params.n;
424  flops *= inner_params.k;
425  }
426  for (int i=0; i<idx_max; i++){
427  if (rev_idx_map[3*i+0] != -1) flops*=edge_len_A[rev_idx_map[3*i+0]];
428  else if (rev_idx_map[3*i+1] != -1) flops*=edge_len_B[rev_idx_map[3*i+1]];
429  else if (rev_idx_map[3*i+2] != -1) flops*=edge_len_C[rev_idx_map[3*i+2]];
430  }
431  ASSERT(flops >= 0.0);
432  CTF_int::cdealloc(rev_idx_map);
433  return flops;
434  }
435 
436  double seq_tsr_ctr::est_time_fp(int nlyr){
437  //return COST_MEMBW*(size_A+size_B+size_C)+COST_FLOP*flops;
438  double ps[] = {1.0, (double)est_membw(), est_fp()};
439 // printf("time estimate is %lf\n", seq_tsr_ctr_mdl.est_time(ps));
440  if (is_custom && !is_inner){
441  return seq_tsr_ctr_mdl_cst.est_time(ps);
442  } else if (is_inner){
443  if (is_custom){
444  if (inner_params.offload)
445  return seq_tsr_ctr_mdl_cst_off.est_time(ps);
446  else
447  return seq_tsr_ctr_mdl_cst_inr.est_time(ps);
448  } else {
449  if (inner_params.offload)
450  return seq_tsr_ctr_mdl_off.est_time(ps);
451  else
452  return seq_tsr_ctr_mdl_inr.est_time(ps);
453  }
454  } else
455  return seq_tsr_ctr_mdl_ref.est_time(ps);
456  assert(0); //wont make it here
457  return 0.0;
458  }
459 
460  double seq_tsr_ctr::est_time_rec(int nlyr){
461  return est_time_fp(nlyr);
462  }
463 
  /**
   * \brief executes the sequential contraction on one set of blocks,
   *        dispatching to the custom, inner-GEMM, or reference kernel and
   *        feeding the measured time back into the matching timing model
   * \param[in] A raw data of left operand block
   * \param[in] B raw data of right operand block
   * \param[in,out] C raw data of output block
   */
  void seq_tsr_ctr::run(char * A, char * B, char * C){
    // layer parallelism must be fully resolved above this leaf level
    ASSERT(idx_lyr == 0 && num_lyr == 1);

#ifdef TUNE
    // Check if we need to execute this function for the sake of training
    bool sr;
    if (is_custom && !is_inner){
      double tps[] = {0, 1.0, (double)est_membw(), est_fp()};
      sr = seq_tsr_ctr_mdl_cst.should_observe(tps);
    } else if (is_inner){
      ASSERT(is_custom || func == NULL);
      double tps[] = {0.0, 1.0, (double)est_membw(), est_fp()};
      if (is_custom){
        if (inner_params.offload)
          sr = seq_tsr_ctr_mdl_cst_off.should_observe(tps);
        else
          sr = seq_tsr_ctr_mdl_cst_inr.should_observe(tps);
      } else {
        if (inner_params.offload)
          sr = seq_tsr_ctr_mdl_off.should_observe(tps);
        else
          sr = seq_tsr_ctr_mdl_inr.should_observe(tps);
      }

    } else {
      double tps[] = {0.0, 1.0, (double)est_membw(), est_fp()};
      sr = seq_tsr_ctr_mdl_ref.should_observe(tps);
    }

    // model does not want another sample of this problem size: skip the run
    if (!sr) return;
#endif
    if (is_custom && !is_inner){
      // elementwise custom function on unfolded blocks
      double st_time = MPI_Wtime();
      ASSERT(is_inner == 0);
      sym_seq_ctr_cust(this->alpha,
                       A,
                       sr_A,
                       order_A,
                       edge_len_A,
                       sym_A,
                       idx_map_A,
                       B,
                       sr_B,
                       order_B,
                       edge_len_B,
                       sym_B,
                       idx_map_B,
                       this->beta,
                       C,
                       sr_C,
                       order_C,
                       edge_len_C,
                       sym_C,
                       idx_map_C,
                       func);
      double exe_time = MPI_Wtime()-st_time;
      // record (time, const, membw, flops) for model training
      double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()};
      seq_tsr_ctr_mdl_cst.observe(tps);
    } else if (is_inner){
      // folded blocks: contract via blocked GEMM with inner_params dims
      ASSERT(is_custom || func == NULL);
//      double ps[] = {1.0, (double)est_membw(), est_fp()};
//      double est_time = seq_tsr_ctr_mdl_inr.est_time(ps);
      double st_time = MPI_Wtime();
      sym_seq_ctr_inr(this->alpha,
                      A,
                      sr_A,
                      order_A,
                      edge_len_A,
                      sym_A,
                      idx_map_A,
                      B,
                      sr_B,
                      order_B,
                      edge_len_B,
                      sym_B,
                      idx_map_B,
                      this->beta,
                      C,
                      sr_C,
                      order_C,
                      edge_len_C,
                      sym_C,
                      idx_map_C,
                      &inner_params,
                      func);
      double exe_time = MPI_Wtime()-st_time;
      //  printf("exe_time = %E est_time = %E abs_err = %e rel_err = %lf\n", exe_time,est_time,fabs(exe_time-est_time),fabs(exe_time-est_time)/exe_time);
      double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()};
      // feed the observation to the model matching this kernel flavor
      if (is_custom){
        if (inner_params.offload)
          seq_tsr_ctr_mdl_cst_off.observe(tps);
        else
          seq_tsr_ctr_mdl_cst_inr.observe(tps);
      } else {
        if (inner_params.offload)
          seq_tsr_ctr_mdl_off.observe(tps);
        else
          seq_tsr_ctr_mdl_inr.observe(tps);
      }
//      seq_tsr_ctr_mdl_inr.print_param_guess();
    } else {
      // plain unfolded contraction via the reference kernel
      double st_time = MPI_Wtime();
      sym_seq_ctr_ref(this->alpha,
                      A,
                      sr_A,
                      order_A,
                      edge_len_A,
                      sym_A,
                      idx_map_A,
                      B,
                      sr_B,
                      order_B,
                      edge_len_B,
                      sym_B,
                      idx_map_B,
                      this->beta,
                      C,
                      sr_C,
                      order_C,
                      edge_len_C,
                      sym_C,
                      idx_map_C);
      double exe_time = MPI_Wtime()-st_time;
      double tps[] = {exe_time, 1.0, (double)est_membw(), est_fp()};
      seq_tsr_ctr_mdl_ref.observe(tps);
    }
  }
591 
592  void inv_idx(int order_A,
593  int const * idx_A,
594  int order_B,
595  int const * idx_B,
596  int order_C,
597  int const * idx_C,
598  int * order_tot,
599  int ** idx_arr){
600  int i, dim_max;
601 
602  dim_max = -1;
603  for (i=0; i<order_A; i++){
604  if (idx_A[i] > dim_max) dim_max = idx_A[i];
605  }
606  for (i=0; i<order_B; i++){
607  if (idx_B[i] > dim_max) dim_max = idx_B[i];
608  }
609  for (i=0; i<order_C; i++){
610  if (idx_C[i] > dim_max) dim_max = idx_C[i];
611  }
612  dim_max++;
613  *order_tot = dim_max;
614  *idx_arr = (int*)CTF_int::alloc(sizeof(int)*3*dim_max);
615  std::fill((*idx_arr), (*idx_arr)+3*dim_max, -1);
616 
617  for (i=0; i<order_A; i++){
618  (*idx_arr)[3*idx_A[i]] = i;
619  }
620  for (i=0; i<order_B; i++){
621  (*idx_arr)[3*idx_B[i]+1] = i;
622  }
623  for (i=0; i<order_C; i++){
624  (*idx_arr)[3*idx_C[i]+2] = i;
625  }
626  }
627 
628 
629 /* ctr_dgemm::~ctr_dgemm() { }
630 
631  ctr_dgemm::ctr_dgemm(ctr * other) : ctr(other) {
632  ctr_dgemm * o = (ctr_dgemm*)other;
633  n = o->n;
634  m = o->m;
635  k = o->k;
636  alpha = o->alpha;
637  transp_A = o->transp_A;
638  transp_B = o->transp_B;
639  }
640  ctr * ctr_dgemm::clone() {
641  return new ctr_dgemm(this);
642  }
643 
644 
645  int64_t ctr_dgemm::mem_fp(){
646  return 0;
647  }
648 
649 
650  double ctr_dgemm::est_time_fp(int nlyr) {
651  // FIXME make cost proper, for now return sizes of each submatrix scaled by .2
652  ASSERT(0);
653  return n*m+m*k+n*k;
654  }
655 
656  double ctr_dgemm::est_time_rec(int nlyr) {
657  return est_time_fp(nlyr);
658  }*/
659 /*
660  template<> inline
661  void ctr_dgemm< std::complex<double> >::run(){
662  const int lda_A = transp_A == 'n' ? m : k;
663  const int lda_B = transp_B == 'n' ? k : n;
664  const int lda_C = m;
665  if (this->idx_lyr == 0){
666  czgemm(transp_A,
667  transp_B,
668  m,
669  n,
670  k,
671  alpha,
672  A,
673  lda_A,
674  B,
675  lda_B,
676  this->beta,
677  C,
678  lda_C);
679  }
680  }
681 
682  void ctr_dgemm::run(){
683  const int lda_A = transp_A == 'n' ? m : k;
684  const int lda_B = transp_B == 'n' ? k : n;
685  const int lda_C = m;
686  if (this->idx_lyr == 0){
687  cdgemm(transp_A,
688  transp_B,
689  m,
690  n,
691  k,
692  alpha,
693  A,
694  lda_A,
695  B,
696  lda_B,
697  this->beta,
698  C,
699  lda_C);
700  }
701  }*/
702 
703 
704 }
int64_t blk_sz_B
Definition: ctr_tsr.h:18
int const * idx_map_B
Definition: ctr_tsr.h:96
CTF_int::CommData cdt
communicator data for MPI comm defining this world
Definition: world.h:32
int * sym
symmetries among tensor dimensions
double est_time(double const *param)
estimates model time based on observarions
Definition: model.cxx:530
tensor * A
left operand
Definition: contraction.h:19
#define DPRINTF(...)
Definition: util.h:235
void observe(double const *time_param)
records observation consisting of execution time and nparam paramter values
Definition: model.cxx:168
double seq_tsr_ctr_mdl_ref_init[]
Definition: init_models.cxx:20
bool offload
Definition: ctr_tsr.h:84
int64_t k
Definition: ctr_tsr.h:79
virtual int64_t mem_rec()
Definition: ctr_comm.h:177
double seq_tsr_ctr_mdl_inr_init[]
Definition: init_models.cxx:21
int * inner_ordering
ordering of the dimensions according to which the tensori s folded
tensor * B
right operand
Definition: contraction.h:21
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
invert index map
Definition: ctr_tsr.cxx:592
int const * idx_map_C
Definition: ctr_tsr.h:100
int64_t mem_fp()
Definition: ctr_tsr.cxx:92
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
uint64_t est_membw()
Definition: ctr_tsr.cxx:399
Definition: common.h:37
int64_t m
Definition: ctr_tsr.h:78
~ctr_virt()
deallocates ctr_virt object
Definition: ctr_tsr.cxx:41
seq_tsr_ctr(ctr *other)
clones ctr object
Definition: ctr_tsr.cxx:354
LinModel< 3 > seq_tsr_ctr_mdl_cst(seq_tsr_ctr_mdl_cst_init,"seq_tsr_ctr_mdl_cst")
bivar_function const * func
function to execute on elements
Definition: contraction.h:39
int const * idx_map_C
Definition: ctr_tsr.h:22
int order
number of tensor dimensions
bool is_custom
whether there is a elementwise custom function
Definition: contraction.h:37
int64_t blk_sz_C
Definition: ctr_tsr.h:21
algstrct const * sr_B
Definition: ctr_comm.h:168
CTF::World * wrld
distributed processor context on which tensor is defined
ctr_virt(ctr *other)
copies ctr_virt object
Definition: ctr_tsr.cxx:46
algstrct const * sr_C
Definition: ctr_comm.h:169
class for execution distributed contraction of tensors
Definition: contraction.h:16
int const * idx_map_B
Definition: ctr_tsr.h:19
int64_t n
Definition: ctr_tsr.h:77
virtual void print()
Definition: ctr_comm.h:175
int64_t sz_C
Definition: ctr_tsr.h:80
int * idx_B
indices of right operand
Definition: contraction.h:33
tensor * C
output
Definition: contraction.h:23
void run(char *A, char *B, char *C)
iterates over the dense virtualization block grid and contracts
Definition: ctr_tsr.cxx:101
Linear performance models, which given measurements, provides new model guess.
Definition: model.h:32
double seq_tsr_ctr_mdl_cst_inr_init[]
Definition: init_models.cxx:23
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
algstrct const * sr_A
Definition: ctr_comm.h:167
int const * idx_map_A
Definition: ctr_tsr.h:92
int sym_seq_ctr_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, bivar_function const *func)
performs symmetric contraction with custom elementwise function
double seq_tsr_ctr_mdl_off_init[]
Definition: init_models.cxx:22
int64_t l
Definition: ctr_tsr.h:76
virtual ctr * clone()
Definition: ctr_comm.h:180
double est_time_rec(int nlyr)
Definition: ctr_tsr.cxx:460
int sym_seq_ctr_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C)
performs symmetric contraction with reference (unblocked) kernel
LinModel< 3 > seq_tsr_ctr_mdl_cst_inr(seq_tsr_ctr_mdl_cst_inr_init,"seq_tsr_ctr_mdl_cst_inr")
tensor * rec_tsr
representation of folded tensor (shares data pointer)
LinModel< 3 > seq_tsr_ctr_mdl_ref(seq_tsr_ctr_mdl_ref_init,"seq_tsr_ctr_mdl_ref")
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
LinModel< 3 > seq_tsr_ctr_mdl_off(seq_tsr_ctr_mdl_off_init,"seq_tsr_ctr_mdl_off")
double est_time_rec(int nlyr)
Definition: ctr_tsr.cxx:82
virtual double est_time_rec(int nlyr)
Definition: ctr_comm.h:179
bool should_observe(double const *time_param)
decides whether the current instance should be observed
Definition: model.cxx:215
int * idx_C
indices of output
Definition: contraction.h:35
int sym_seq_ctr_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, iparam const *prm, bivar_function const *func)
performs symmetric contraction with blocked gemm
bivar_function const * func
Definition: ctr_tsr.h:108
double seq_tsr_ctr_mdl_cst_off_init[]
Definition: init_models.cxx:24
virtual void run(char *A, char *B, char *C)
Definition: ctr_comm.h:174
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
int * virt_dim
Definition: ctr_tsr.h:13
void run(char *A, char *B, char *C)
wraps user sequential function signature
Definition: ctr_tsr.cxx:464
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
#define SET_LDA_X(__X)
char const * alpha
Definition: ctr_tsr.h:89
internal distributed tensor class
LinModel< 3 > seq_tsr_ctr_mdl_cst_off(seq_tsr_ctr_mdl_cst_off_init,"seq_tsr_ctr_mdl_cst_off")
double seq_tsr_ctr_mdl_cst_init[]
Definition: init_models.cxx:19
int64_t blk_sz_A
Definition: ctr_tsr.h:15
int64_t mem_rec()
Definition: ctr_tsr.cxx:96
LinModel< 3 > seq_tsr_ctr_mdl_inr(seq_tsr_ctr_mdl_inr_init,"seq_tsr_ctr_mdl_inr")
char const * alpha
scaling of A*B
Definition: contraction.h:26
#define MIN(a, b)
Definition: util.h:176
#define VIRT_NTD
Definition: ctr_tsr.cxx:17
int * idx_A
indices of left operand
Definition: contraction.h:31
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
double est_time_fp(int nlyr)
Definition: ctr_tsr.cxx:436
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
Definition: util.cxx:10
char const * beta
Definition: ctr_comm.h:170
int const * idx_map_A
Definition: ctr_tsr.h:16