Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
spctr_offload.cxx
Go to the documentation of this file.
1 
2 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
3 
4 #include "../shared/util.h"
5 #include "spctr_offload.h"
6 #ifdef OFFLOAD
7 namespace CTF_int {
8  spctr_offload::spctr_offload(contraction const * c,
9  int64_t size_A_,
10  int64_t size_B_,
11  int64_t size_C_,
12  int total_iter_,
13  int upload_phase_A_,
14  int upload_phase_B_,
15  int download_phase_C_) : spctr(c) {
16  size_A = size_A_;
17  size_B = size_B_;
18  size_C = size_C_;
19  total_iter = total_iter_;
20  upload_phase_A = upload_phase_A_;
21  upload_phase_B = upload_phase_B_;
22  download_phase_C = download_phase_C_;
23 
24  iter_counter = 0;
25  spr_A = NULL;
26  spr_B = NULL;
27  spr_C = NULL;
28  }
29 
30  spctr_offload::~spctr_offload(){
31  delete rec_ctr;
32  }
33 
34  spctr_offload::spctr_offload(spctr * other) : spctr(other) {
35  spctr_offload * o = (spctr_offload*)other;
36  rec_ctr = o->rec_ctr->clone();
37  size_A = o->size_A;
38  size_B = o->size_B;
39  size_C = o->size_C;
40  iter_counter = o->iter_counter;
41  total_iter = o->total_iter;
42  upload_phase_A = o->upload_phase_A;
43  upload_phase_B = o->upload_phase_B;
44  download_phase_C = o->download_phase_C;
45  spr_A = o->spr_A;
46  spr_B = o->spr_B;
47  spr_C = o->spr_C;
48  }
49 
50  spctr * spctr_offload::clone() {
51  return new spctr_offload(this);
52  }
53 
54  void spctr_offload::print() {
55  printf("spctr_offload: \n");
56  printf("total_iter = %d\n", total_iter);
57  printf("upload_phase_A = %d\n",
58  upload_phase_A);
59  printf("upload_phase_B = %d\n",
60  upload_phase_B);
61  printf("download_phase_C = %d\n",
62  download_phase_C);
63  rec_ctr->print();
64  }
65 
66  double spctr_offload::est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
67  double tot_time = 0.0;
68  tot_time += estimate_upload_time(nnz_frac_A*size_A*sr_A->el_size)*(total_iter/upload_phase_A);
69  tot_time += estimate_upload_time(nnz_frac_B*size_B*sr_B->el_size)*(total_iter/upload_phase_B);
70  tot_time += estimate_download_time(nnz_frac_C*size_C*sr_C->el_size)*(total_iter/download_phase_C);
71  tot_time += estimate_download_time(nnz_frac_C*size_C*sr_C->el_size)*(total_iter/download_phase_C);
72  //tot_time += 1.E-9*2.*nnz_frac_C*size_C*sr_C->el_size*(total_iter/download_phase_C);
73  return tot_time;
74  }
75 
76  double spctr_offload::est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
77  return rec_ctr->est_time_rec(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C) + est_time_fp(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
78  }
79 
80  int64_t spctr_offload::spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
81  return 0;
82  }
83 
84  int64_t spctr_offload::mem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
85  return rec_ctr->mem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C) + spmem_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C);
86  }
87 
/**
 * \brief performs one iteration of the offloaded contraction
 *
 * One full pass consists of total_iter calls; iter_counter tracks the position
 * within the pass and is reset to 0 after the last call. On each call this
 * function (1) uploads A and B into their offload buffers when the iteration
 * is a multiple of the respective upload phase, (2) creates the C offload
 * buffer on the first iteration, (3) applies beta to the host copy of C,
 * (4) runs rec_ctr on the dev_spr pointers of the three buffers, and
 * (5) every download_phase_C iterations downloads C and adds it into the host C.
 *
 * \param[in] A host data of A, passed to spr_A->upload
 * \param[in] nblk_A number of entries of size_blk_A
 * \param[in] size_blk_A per-block sizes of A; summed to size the sparse A buffer
 * \param[in] B host data of B, passed to spr_B->upload
 * \param[in] nblk_B number of entries of size_blk_B
 * \param[in] size_blk_B per-block sizes of B; summed to size the sparse B buffer
 * \param[in,out] C host data of C; scaled by beta and accumulated into
 * \param[in] nblk_C number of entries of size_blk_C
 * \param[in] size_blk_C per-block sizes of C; forwarded to rec_ctr->run
 * \param[in,out] new_C forwarded unchanged to rec_ctr->run
 */
88  void spctr_offload::run(char * A, int nblk_A, int64_t const * size_blk_A,
89  char * B, int nblk_B, int64_t const * size_blk_B,
90  char * C, int nblk_C, int64_t * size_blk_C,
91  char *& new_C){
92  TAU_FSTART(spctr_offload);
// calling run() more than total_iter times without the counter reset below is a usage error
93  ASSERT(iter_counter < total_iter);
// A is (re)uploaded on every upload_phase_A-th iteration, starting with iteration 0
94  if (iter_counter % upload_phase_A == 0){
95  if (is_sparse_A){
// sparse A: the buffer is rebuilt on every upload because its size is
// recomputed from the block sizes of the current call; the buffer of the
// previous upload is released first (there is none on iteration 0)
96  if (iter_counter != 0){
97  delete spr_A;
98  }
99  int64_t sp_size_A = 0;
100  for (int i=0; i<nblk_A; i++){
101  sp_size_A += size_blk_A[i];
102  }
103  spr_A = new offload_arr(sp_size_A);
104  } else {
// dense A: the buffer is created once, on iteration 0, and reused by later uploads
105  if (iter_counter == 0){
106  spr_A = new offload_tsr(sr_A, size_A);
107  }
108  }
109  spr_A->upload(A);
110  }
// B follows exactly the same scheme as A, driven by upload_phase_B
111  if (iter_counter % upload_phase_B == 0){
112  if (is_sparse_B){
113  if (iter_counter != 0){
114  delete spr_B;
115  }
116  int64_t sp_size_B = 0;
117  for (int i=0; i<nblk_B; i++){
118  sp_size_B += size_blk_B[i];
119  }
120  spr_B = new offload_arr(sp_size_B);
121  } else {
122  if (iter_counter == 0){
123  spr_B = new offload_tsr(sr_B, size_B);
124  }
125  }
126  spr_B->upload(B);
127  }
// the C buffer is created only on the first iteration of a pass
128  if (iter_counter == 0){
129  if (is_sparse_C){
130  int64_t sp_size_C = 0;
131  for (int i=0; i<nblk_C; i++){
132  sp_size_C += size_blk_C[i];
133  }
134  spr_C = new offload_arr(sp_size_C);
// this path always trips the assertions right after allocating
// NOTE(review): sparse C output looks unsupported here -- confirm
135  ASSERT(0); assert(0);
136  } else {
// dense C: the buffer starts zeroed; results are accumulated into it by rec_ctr
// (which is run with beta set to the multiplicative identity below)
137  offload_tsr * tspr_C = new offload_tsr(sr_C, size_C);
138  spr_C = tspr_C;
139  tspr_C->set_zero();
140  }
141  }
142 
143  TAU_FSTART(offload_scale);
// beta is applied to the host copy of C here rather than by rec_ctr; this is
// skipped when beta equals the multiplicative identity
144  if (!sr_C->isequal(this->beta, sr_C->mulid())){
// scaling is only expected on iterations aligned with the download phase
145  ASSERT(iter_counter % download_phase_C == 0);
146  //FIXME daxpy
// NOTE(review): the listing's own numbering skips a line at this point and the
// tooltip index references CTF_FLOPS_ADD, which is not visible in this code --
// confirm against the repository whether a statement is missing here
// beta equal to the additive identity: overwrite C with that identity instead of scaling
148  if (sr_C->isequal(this->beta, sr_C->addid()))
149  sr_C->set(C, sr_C->addid(), size_C);
150  else
151  sr_C->scal(size_C, this->beta, C, 1);
152  /*for (int i=0; i<size_C; i++){
153  this->C[i] = this->C[i]*this->beta;
154  }*/
155  }
156  TAU_FSTOP(offload_scale);
157 
// beta has been handled above, so the recursive kernel gets the multiplicative identity
158  rec_ctr->beta = sr_C->mulid();
159  rec_ctr->num_lyr = this->num_lyr;
160  rec_ctr->idx_lyr = this->idx_lyr;
161 
// this kernel's timer is stopped around the recursive call and restarted afterwards
162  TAU_FSTOP(spctr_offload);
// the recursive kernel operates on the dev_spr pointers of the offload buffers,
// not on the host pointers A, B, C
163  rec_ctr->run(spr_A->dev_spr, nblk_A, size_blk_A,
164  spr_B->dev_spr, nblk_B, size_blk_B,
165  spr_C->dev_spr, nblk_C, size_blk_C,
166  new_C);
167  TAU_FSTART(spctr_offload);
168 
169  iter_counter++;
170 
// every download_phase_C iterations the accumulated C is brought back to the host
171  if (iter_counter % download_phase_C == 0){
// NOTE(review): the listing's numbering also skips a line here (see note above)
173  char * C_host_ptr;
// temporary pinned host buffer of size_C elements of el_size bytes each
174  host_pinned_alloc((void**)&C_host_ptr, size_C*sr_C->el_size);
175  spr_C->download(C_host_ptr);
176  /*for (int i=0; i<size_C; i++){
177  memcpy(C_host_ptr+i*sr_C->el_size, sr_C->addid(), sr_C->el_size);
178  memcpy(C+i*sr_C->el_size, sr_C->addid(), sr_C->el_size);
179  }*/
180  TAU_FSTART(offload_axpy);
// C += mulid * downloaded data, i.e. the downloaded result is added into the host C
181  sr_C->axpy(size_C, sr_C->mulid(), C_host_ptr, 1, C, 1);
182  TAU_FSTOP(offload_axpy);
183 /* for (int i=0; i<size_C; i++){
184  this->C[i] += C_host_ptr[i];
185  }*/
186  host_pinned_free(C_host_ptr);
// the C buffer is cleared so the next phase starts accumulating from zero;
// not needed after the last iteration because the buffer is deleted below
187  if (iter_counter != total_iter)
188  ((offload_tsr*)spr_C)->set_zero();
189  }
190 
191 
// end of the pass: release all three buffers and rewind the counter so the
// object can be run again
// NOTE(review): spr_A/spr_B/spr_C keep their old values after the delete
192  if (iter_counter == total_iter){
193  delete spr_A;
194  delete spr_B;
195  delete spr_C;
196  iter_counter = 0;
197  }
198  TAU_FSTOP(spctr_offload);
199  }
200 }
201 #endif
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
Definition: algstrct.cxx:340
virtual int64_t mem_rec()
Definition: ctr_comm.h:177
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
Definition: spctr_tsr.h:39
void host_pinned_alloc(void **ptr, int64_t size)
allocate a pinned host buffer
#define ASSERT(...)
Definition: util.h:88
virtual char const * addid() const
identity element for addition i.e. 0
Definition: algstrct.cxx:89
bool is_sparse_C
Definition: spctr_tsr.h:14
#define CTF_FLOPS_ADD(n)
Definition: util.h:138
bool is_sparse_A
Definition: spctr_tsr.h:12
algstrct const * sr_B
Definition: ctr_comm.h:168
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need
Definition: spctr_comm.cxx:173
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
algstrct const * sr_C
Definition: ctr_comm.h:169
void host_pinned_free(void *ptr)
free a pinned host buffer
virtual void print()
Definition: ctr_comm.h:175
char * new_C
Definition: spctr_tsr.h:15
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time the local part this kernel is estimated to take
Definition: spctr_comm.cxx:140
algstrct const * sr_A
Definition: ctr_comm.h:167
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
virtual void scal(int n, char const *alpha, char *X, int incX) const
X["i"]=alpha*X["i"];.
Definition: algstrct.cxx:262
virtual void axpy(int n, char const *alpha, char const *X, int incX, char *Y, int incY) const
Y["i"]+=alpha*X["i"];.
Definition: algstrct.cxx:280
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
double estimate_upload_time(int64_t size)
estimate time it takes to upload
virtual spctr * clone()
Definition: spctr_tsr.h:19
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
void run(char *A, char *B, char *C)
Definition: spctr_tsr.h:48
char const * beta
Definition: ctr_comm.h:170
double estimate_download_time(int64_t size)
estimate time it takes to download
bool is_sparse_B
Definition: spctr_tsr.h:13
spctr(spctr *other)
Definition: spctr_tsr.cxx:20