Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
ctr_offload.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "../shared/util.h"
4 #include "ctr_offload.h"
5 #ifdef OFFLOAD
6 namespace CTF_int {
7  ctr_offload::ctr_offload(contraction const * c,
8  int64_t size_A_,
9  int64_t size_B_,
10  int64_t size_C_,
11  int total_iter_,
12  int upload_phase_A_,
13  int upload_phase_B_,
14  int download_phase_C_) : ctr(c) {
15  size_A = size_A_;
16  size_B = size_B_;
17  size_C = size_C_;
18  total_iter = total_iter_;
19  upload_phase_A = upload_phase_A_;
20  upload_phase_B = upload_phase_B_;
21  download_phase_C = download_phase_C_;
22 
23  iter_counter = 0;
24  ptr_A = NULL;
25  ptr_B = NULL;
26  ptr_C = NULL;
27  }
28 
29  ctr_offload::~ctr_offload(){
30  delete rec_ctr;
31  }
32 
33  ctr_offload::ctr_offload(ctr * other) : ctr(other) {
34  ctr_offload * o = (ctr_offload*)other;
35  rec_ctr = o->rec_ctr->clone();
36  size_A = o->size_A;
37  size_B = o->size_B;
38  size_C = o->size_C;
39  iter_counter = o->iter_counter;
40  total_iter = o->total_iter;
41  upload_phase_A = o->upload_phase_A;
42  upload_phase_B = o->upload_phase_B;
43  download_phase_C = o->download_phase_C;
44  ptr_A = o->ptr_A;
45  ptr_B = o->ptr_B;
46  ptr_C = o->ptr_C;
47  }
48 
49  ctr * ctr_offload::clone() {
50  return new ctr_offload(this);
51  }
52 
53  void ctr_offload::print() {
54  printf("ctr_offload: \n");
55  printf("total_iter = %d\n", total_iter);
56  printf("size_A = %ld, upload_phase_A = %d\n",
57  size_A, upload_phase_A);
58  printf("size_B = %ld, upload_phase_B = %d\n",
59  size_B, upload_phase_B);
60  printf("size_C = %ld, download_phase_C = %d\n",
61  size_C, download_phase_C);
62  rec_ctr->print();
63  }
64 
65  double ctr_offload::est_time_fp(int nlyr){
66  double tot_time = 0.0;
67  tot_time += estimate_upload_time(size_A*sr_A->el_size)*(total_iter/upload_phase_A);
68  tot_time += estimate_upload_time(size_B*sr_B->el_size)*(total_iter/upload_phase_B);
69  tot_time += estimate_download_time(size_C*sr_C->el_size)*(total_iter/download_phase_C);
70  return tot_time;
71  }
72 
73  double ctr_offload::est_time_rec(int nlyr) {
74  return rec_ctr->est_time_rec(nlyr) + est_time_fp(nlyr);
75  }
76 
77  int64_t ctr_offload::mem_fp(){
78  return size_C*sr_C->el_size;
79  }
80 
81  int64_t ctr_offload::mem_rec() {
82  return rec_ctr->mem_rec() + mem_fp();
83  }
84 
85  void ctr_offload::run(char * A, char * B, char * C){
86  TAU_FSTART(ctr_offload);
87  ASSERT(iter_counter < total_iter);
88  if (iter_counter == 0){
89  ptr_A = new offload_tsr(sr_A, size_A);
90  ptr_B = new offload_tsr(sr_B, size_B);
91  ptr_C = new offload_tsr(sr_C, size_C);
92 
93  ptr_A->upload(A);
94  ptr_B->upload(B);
95 
96  ptr_C->set_zero();
97  } else {
98  if (iter_counter % upload_phase_A == 0)
99  ptr_A->upload(A);
100  if (iter_counter % upload_phase_B == 0)
101  ptr_B->upload(B);
102  }
103  if (!sr_C->isequal(this->beta, sr_C->mulid())){
104  ASSERT(iter_counter % download_phase_C == 0);
105  //FIXME daxpy
107  if (sr_C->isequal(this->beta, sr_C->addid()))
108  sr_C->set(C, sr_C->addid(), size_C);
109  else
110  sr_C->scal(size_C, this->beta, C, 1);
111  /*for (int i=0; i<size_C; i++){
112  this->C[i] = this->C[i]*this->beta;
113  }*/
114  }
115 
116  rec_ctr->beta = sr_C->mulid();
117  rec_ctr->num_lyr = this->num_lyr;
118  rec_ctr->idx_lyr = this->idx_lyr;
119 
120  TAU_FSTOP(ctr_offload);
121  rec_ctr->run(ptr_A->dev_spr, ptr_B->dev_spr, ptr_C->dev_spr);
122  TAU_FSTART(ctr_offload);
123 
124  iter_counter++;
125 
126  if (iter_counter % download_phase_C == 0){
127  char * C_host_ptr;
128  host_pinned_alloc((void**)&C_host_ptr, size_C*sr_C->el_size);
129  ptr_C->download(C_host_ptr);
130  sr_C->axpy(size_C, sr_C->mulid(), C_host_ptr, 1, C, 1);
131 /* for (int i=0; i<size_C; i++){
132  this->C[i] += C_host_ptr[i];
133  }*/
134  host_pinned_free(C_host_ptr);
135  if (iter_counter != total_iter)
136  ptr_C->set_zero();
137  }
138 
139 
140  if (iter_counter == total_iter){
141  delete ptr_A;
142  delete ptr_B;
143  delete ptr_C;
144  iter_counter = 0;
145  }
146  TAU_FSTOP(ctr_offload);
147  }
148 }
149 #endif
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
Definition: algstrct.cxx:340
virtual int64_t mem_rec()
Definition: ctr_comm.h:177
int64_t mem_fp()
returns the number of bytes of buffer space we need
Definition: ctr_comm.cxx:196
void host_pinned_alloc(void **ptr, int64_t size)
allocate a pinned host buffer
#define ASSERT(...)
Definition: util.h:88
virtual char const * addid() const
identity element for addition i.e. 0
Definition: algstrct.cxx:89
#define CTF_FLOPS_ADD(n)
Definition: util.h:138
algstrct const * sr_B
Definition: ctr_comm.h:168
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
ctr(ctr *other)
copies generic ctr object
Definition: ctr_comm.cxx:31
algstrct const * sr_C
Definition: ctr_comm.h:169
void host_pinned_free(void *ptr)
free a pinned host buffer
virtual void print()
Definition: ctr_comm.h:175
algstrct const * sr_A
Definition: ctr_comm.h:167
virtual ctr * clone()
Definition: ctr_comm.h:180
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
virtual void scal(int n, char const *alpha, char *X, int incX) const
X["i"]=alpha*X["i"];.
Definition: algstrct.cxx:262
virtual double est_time_rec(int nlyr)
Definition: ctr_comm.h:179
virtual void axpy(int n, char const *alpha, char const *X, int incX, char *Y, int incY) const
Y["i"]+=alpha*X["i"];.
Definition: algstrct.cxx:280
virtual void run(char *A, char *B, char *C)
Definition: ctr_comm.h:174
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
double estimate_upload_time(int64_t size)
estimate time it takes to upload
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
char const * beta
Definition: ctr_comm.h:170
double estimate_download_time(int64_t size)
estimate time it takes to download
double est_time_fp(int nlyr)
returns the execution time the local part of this kernel is estimated to take
Definition: ctr_comm.cxx:173