Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
spctr_comm.cxx
Go to the documentation of this file.
1 
2 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
3 
4 #include "../shared/util.h"
5 #include "spctr_comm.h"
6 #include "contraction.h"
7 #include "../tensor/untyped_tensor.h"
8 
9 namespace CTF_int {
11  int const * phys_mapped,
12  int64_t blk_sz_A,
13  int64_t blk_sz_B,
14  int64_t blk_sz_C)
15  : spctr(c) {
16  int i;
17  int nphys_dim = c->A->topo->order;
18  this->ncdt_A = 0;
19  this->ncdt_B = 0;
20  this->ncdt_C = 0;
21  this->size_A = blk_sz_A;
22  this->size_B = blk_sz_B;
23  this->size_C = blk_sz_C;
24  this->cdt_A = NULL;
25  this->cdt_B = NULL;
26  this->cdt_C = NULL;
27  for (i=0; i<nphys_dim; i++){
28  if (phys_mapped[3*i+0] == 0 &&
29  phys_mapped[3*i+1] == 0 &&
30  phys_mapped[3*i+2] == 0){
31  /* printf("ERROR: ALL-TENSOR REPLICATION NO LONGER DONE\n");
32  ABORT;
33  ASSERT(this->num_lyr == 1);
34  hspctr->idx_lyr = A->topo->dim_comm[i].rank;
35  hspctr->num_lyr = A->topo->dim_comm[i]->np;
36  this->idx_lyr = A->topo->dim_comm[i].rank;
37  this->num_lyr = A->topo->dim_comm[i]->np;*/
38  } else {
39  if (phys_mapped[3*i+0] == 0){
40  this->ncdt_A++;
41  }
42  if (phys_mapped[3*i+1] == 0){
43  this->ncdt_B++;
44  }
45  if (phys_mapped[3*i+2] == 0){
46  this->ncdt_C++;
47  }
48  }
49  }
50  if (this->ncdt_A > 0)
51  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_A, (void**)&this->cdt_A);
52  if (this->ncdt_B > 0)
53  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_B, (void**)&this->cdt_B);
54  if (this->ncdt_C > 0)
55  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_C, (void**)&this->cdt_C);
56  this->ncdt_A = 0;
57  this->ncdt_B = 0;
58  this->ncdt_C = 0;
59  for (i=0; i<nphys_dim; i++){
60  if (!(phys_mapped[3*i+0] == 0 &&
61  phys_mapped[3*i+1] == 0 &&
62  phys_mapped[3*i+2] == 0)){
63  if (phys_mapped[3*i+0] == 0){
64  this->cdt_A[this->ncdt_A] = &c->A->topo->dim_comm[i];
65  /* if (is_used && this->cdt_A[this->ncdt_A].alive == 0)
66  this->cdt_A[this->ncdt_A].activate(global_comm.cm);*/
67  this->ncdt_A++;
68  }
69  if (phys_mapped[3*i+1] == 0){
70  this->cdt_B[this->ncdt_B] = &c->B->topo->dim_comm[i];
71  /* if (is_used && this->cdt_B[this->ncdt_B].alive == 0)
72  this->cdt_B[this->ncdt_B].activate(global_comm.cm);*/
73  this->ncdt_B++;
74  }
75  if (phys_mapped[3*i+2] == 0){
76  this->cdt_C[this->ncdt_C] = &c->C->topo->dim_comm[i];
77  /* if (is_used && this->cdt_C[this->ncdt_C].alive == 0)
78  this->cdt_C[this->ncdt_C].activate(global_comm.cm);*/
79  this->ncdt_C++;
80  }
81  }
82  }
83  }
84 
86  delete rec_ctr;
87 /* for (int i=0; i<ncdt_A; i++){
88  cdt_A[i]->deactivate();
89  }*/
90  if (ncdt_A > 0)
92 /* for (int i=0; i<ncdt_B; i++){
93  cdt_B[i]->deactivate();
94  }*/
95  if (ncdt_B > 0)
97 /* for (int i=0; i<ncdt_C; i++){
98  cdt_C[i]->deactivate();
99  }*/
100  if (ncdt_C > 0)
102  }
103 
105  spctr_replicate * o = (spctr_replicate*)other;
106  rec_ctr = o->rec_ctr->clone();
107  size_A = o->size_A;
108  size_B = o->size_B;
109  size_C = o->size_C;
110  ncdt_A = o->ncdt_A;
111  ncdt_B = o->ncdt_B;
112  ncdt_C = o->ncdt_C;
113  }
114 
116  return new spctr_replicate(this);
117  }
118 
120  int i;
121  printf("spctr_replicate: \n");
122  printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
123  cdt_A, size_A, ncdt_A);
124  for (i=0; i<ncdt_A; i++){
125  printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
126  }
127  printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
128  cdt_B, size_B, ncdt_B);
129  for (i=0; i<ncdt_B; i++){
130  printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
131  }
132  printf("cdt_C = %p, size_C = %ld, ncdt_C = %d\n",
133  cdt_C, size_C, ncdt_C);
134  for (i=0; i<ncdt_C; i++){
135  printf("cdt_C[%d] length = %d\n",i,cdt_C[i]->np);
136  }
137  rec_ctr->print();
138  }
139 
140  double spctr_replicate::est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
141  int i;
142  double tot_sz;
143  tot_sz = 0.0;
144  for (i=0; i<ncdt_A; i++){
145  ASSERT(cdt_A[i]->np > 0);
146  //FIXME estimates off since multiplying by element size vs pair/CSR size
147  if (is_sparse_A)
148  tot_sz += cdt_A[i]->estimate_bcast_time(nnz_frac_A*size_A*sr_A->pair_size());
149  else
150  tot_sz += cdt_A[i]->estimate_bcast_time(nnz_frac_A*size_A*sr_A->el_size);
151  }
152  for (i=0; i<ncdt_B; i++){
153  ASSERT(cdt_B[i]->np > 0);
154  if (is_sparse_B)
155  tot_sz += cdt_B[i]->estimate_bcast_time(nnz_frac_B*size_B*sr_B->pair_size());
156  else
157  tot_sz += cdt_B[i]->estimate_bcast_time(nnz_frac_B*size_B*sr_B->el_size);
158  }
159  for (i=0; i<ncdt_C; i++){
160  ASSERT(cdt_C[i]->np > 0);
161  if (is_sparse_C)
162  tot_sz += sr_C->estimate_csr_red_time(nnz_frac_C*size_C*sr_C->pair_size(), cdt_C[i]);
163  else
164  tot_sz += cdt_C[i]->estimate_red_time(nnz_frac_C*size_C*sr_C->el_size, sr_C->addmop());
165  }
166  return tot_sz;
167  }
168 
169  double spctr_replicate::est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
170  return rec_ctr->est_time_rec(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C) + est_time_fp(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
171  }
172 
173  int64_t spctr_replicate::spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
174  int64_t mem_usage = 0;
175  if (is_sparse_A) mem_usage += nnz_frac_A*size_A*sr_A->pair_size();
176  if (is_sparse_B) mem_usage += nnz_frac_B*size_B*sr_B->pair_size();
177  if (is_sparse_C) mem_usage += nnz_frac_C*size_C*sr_C->pair_size();
178  return mem_usage;
179  }
180 
181  int64_t spctr_replicate::spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C){
182  return rec_ctr->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C) + spmem_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C);
183  }
184 
185 
186  void spctr_replicate::run(char * A, int nblk_A, int64_t const * size_blk_A,
187  char * B, int nblk_B, int64_t const * size_blk_B,
188  char * C, int nblk_C, int64_t * size_blk_C,
189  char *& new_C){
190  int arank, brank, crank, i;
192  arank = 0, brank = 0, crank = 0;
193  for (i=0; i<ncdt_A; i++)
194  arank += cdt_A[i]->rank;
195  for (i=0; i<ncdt_B; i++)
196  brank += cdt_B[i]->rank;
197  for (i=0; i<ncdt_C; i++)
198  crank += cdt_C[i]->rank;
199  char * buf_A = A;
200  char * buf_B = B;
201  int64_t new_size_blk_A[nblk_A];
202  int64_t new_size_blk_B[nblk_B];
203  int64_t new_size_blk_C[nblk_C];
204  if (is_sparse_A){
205  memcpy(new_size_blk_A, size_blk_A, nblk_A*sizeof(int64_t));
206  /*if (ncdt_A > 0){
207  save_size_blk_A = (int64_t*)alloc(sizeof(int64_t)*nblk_A);
208  memcpy(save_size_blk_A,size_blk_A,sizeof(int64_t)*nblk_A);
209  }*/
210  for (i=0; i<ncdt_A; i++){
211  cdt_A[i]->bcast(new_size_blk_A, nblk_A, MPI_INT64_T, 0);
212  }
213  int64_t new_size_A = 0;
214  for (i=0; i<nblk_A; i++){
215  new_size_A += new_size_blk_A[i];
216  }
217  if (arank != 0)
218  buf_A = (char*)alloc(new_size_A);
219  for (i=0; i<ncdt_A; i++){
220  cdt_A[i]->bcast(buf_A, new_size_A, MPI_CHAR, 0);
221  }
222  } else {
223  for (i=0; i<ncdt_A; i++){
224  cdt_A[i]->bcast(A, size_A, sr_A->mdtype(), 0);
225  }
226  }
227  if (is_sparse_B){
228  memcpy(new_size_blk_B, size_blk_B, nblk_B*sizeof(int64_t));
229  /*if (ncdt_B > 0){
230  save_size_blk_B = (int64_t*)alloc(sizeof(int64_t)*nblk_B);
231  memcpy(save_size_blk_B,size_blk_B,sizeof(int64_t)*nblk_B);
232  }*/
233  for (i=0; i<ncdt_B; i++){
234  cdt_B[i]->bcast(new_size_blk_B, nblk_B, MPI_INT64_T, 0);
235  }
236  int64_t new_size_B = 0;
237  for (i=0; i<nblk_B; i++){
238  new_size_B += new_size_blk_B[i];
239  }
240  if (brank != 0)
241  buf_B = (char*)alloc(new_size_B);
242  for (i=0; i<ncdt_B; i++){
243  cdt_B[i]->bcast(buf_B, new_size_B, MPI_CHAR, 0);
244  }
245  } else {
246  for (i=0; i<ncdt_B; i++){
247  cdt_B[i]->bcast(B, size_B, sr_B->mdtype(), 0);
248  }
249  }
250  if (is_sparse_C){
251  memset(new_size_blk_C, 0, sizeof(int64_t)*nblk_C);
252  }
253 // if (crank != 0) this->sr_C->set(C, this->sr_C->addid(), size_C);
254  if (crank == 0 && !sr_C->isequal(this->beta, sr_C->mulid())){
256  if (sr_C->isequal(sr_C->addid(), beta))
257  sr_C->set(C, sr_C->addid(), size_C);
258  else sr_C->scal(size_C, this->beta, C, 1);
259 /* for (i=0; i<size_C; i++){
260  sr_C->mul(this->beta, C+i*sr_C->el_size, C+i*sr_C->el_size);
261  }*/
262  }
263  if (crank != 0)
264  rec_ctr->beta = sr_C->addid();
265  else
266  rec_ctr->beta = sr_C->mulid();
267 
268  rec_ctr->num_lyr = this->num_lyr;
269  rec_ctr->idx_lyr = this->idx_lyr;
270 
272  rec_ctr->run(buf_A, nblk_A, new_size_blk_A,
273  buf_B, nblk_B, new_size_blk_B,
274  C, nblk_C, size_blk_C,
275  new_C);
277  /*for (i=0; i<size_C; i++){
278  printf("P%d C[%d] = %lf\n",crank,i, ((double*)C)[i]);
279  }*/
280  for (i=0; i<ncdt_C; i++){
281  if (i>0 && cdt_C[i-1]->rank != 0) break;
282  //ALLREDUCE(MPI_IN_PLACE, C, size_C, sr_C->mdtype(), sr_C->addmop(), cdt_C[i]->;
283  if (is_sparse_C){
284  int64_t csr_sz_acc = 0;
285  int64_t new_csr_sz_acc = 0;
286  char * new_Cs[nblk_C];
287  for (int blk=0; blk<nblk_C; blk++){
288  new_Cs[blk] = sr_C->csr_reduce(new_C+csr_sz_acc, 0, cdt_C[i]->cm);
289  // printf("%d out of %d reds complete, size = %ld should be %ld\n",blk,nblk_C,size_blk_C[blk],((CSR_Matrix)(new_C+csr_sz_acc)).size());
290 
291  csr_sz_acc += size_blk_C[blk];
292  size_blk_C[blk] = cdt_C[i]->rank == 0 ? ((CSR_Matrix)(new_Cs[blk])).size() : 0;
293  new_csr_sz_acc += size_blk_C[blk];
294 // printf("rank %d blk size %ld tot sz %ld\n", cdt_C[i]->rank, size_blk_C[blk], new_csr_sz_acc);
295  }
296  cdealloc(new_C);
297  alloc_ptr(new_csr_sz_acc, (void**)&new_C);
298  new_csr_sz_acc = 0;
299  for (int blk=0; blk<nblk_C; blk++){
300  memcpy(new_C+new_csr_sz_acc, new_Cs[blk], size_blk_C[blk]);
301  cdealloc(new_Cs[blk]);
302  new_csr_sz_acc += size_blk_C[blk];
303  }
304  } else {
305  if (cdt_C[i]->rank == 0){
306  cdt_C[i]->red(MPI_IN_PLACE, new_C, size_C, sr_C->mdtype(), sr_C->addmop(), 0);
307  } else {
308  cdt_C[i]->red(new_C, NULL, size_C, sr_C->mdtype(), sr_C->addmop(), 0);
309  }
310  }
311  }
312 
313  if (is_sparse_A && buf_A != A) cdealloc(buf_A);
314  if (is_sparse_B && buf_B != B) cdealloc(buf_B);
315  if (!is_sparse_A && arank != 0){
316  this->sr_A->set(A, this->sr_A->addid(), size_A);
317  }
318  if (!is_sparse_B && brank != 0){
319  this->sr_B->set(B, this->sr_B->addid(), size_B);
320  }
322  }
323 }
virtual int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes need by each processor in this kernel and its recursive calls ...
Definition: spctr_tsr.h:45
tensor * A
left operand
Definition: contraction.h:19
virtual int pair_size() const
gets pair size el_size plus the key size
Definition: algstrct.h:46
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
reduce, same interface as MPI_Reduce, but excluding the comm
Definition: common.cxx:392
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
Definition: algstrct.cxx:340
double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
Definition: spctr_comm.cxx:169
def rank(self)
Definition: core.pyx:312
tensor * B
right operand
Definition: contraction.h:21
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
Definition: spctr_tsr.h:39
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
virtual char const * addid() const
MPI datatype for pairs.
Definition: algstrct.cxx:89
bool is_sparse_C
Definition: spctr_tsr.h:14
void run(char *A, int nblk_A, int64_t const *size_blk_A, char *B, int nblk_B, int64_t const *size_blk_B, char *C, int nblk_C, int64_t *size_blk_C, char *&new_C)
Definition: spctr_comm.cxx:186
bool is_sparse_A
Definition: spctr_tsr.h:12
spctr_replicate(spctr *other)
Definition: spctr_comm.cxx:104
algstrct const * sr_B
Definition: ctr_comm.h:168
double estimate_red_time(int64_t msg_sz, MPI_Op op)
Definition: common.cxx:308
double estimate_csr_red_time(int64_t msg_sz, CommData const *cdt) const
Definition: algstrct.cxx:508
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need
Definition: spctr_comm.cxx:173
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
algstrct const * sr_C
Definition: ctr_comm.h:169
class for execution distributed contraction of tensors
Definition: contraction.h:16
virtual void print()
Definition: ctr_comm.h:175
char * new_C
Definition: spctr_tsr.h:15
tensor * C
output
Definition: contraction.h:23
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time the local part this kernel is estimated to take
Definition: spctr_comm.cxx:140
algstrct const * sr_A
Definition: ctr_comm.h:167
abstraction for a serialized sparse matrix stored in column-sparse-row (CSR) layout ...
Definition: csr.h:22
virtual char * csr_reduce(char *cA, int root, MPI_Comm cm) const
reduces CSR matrices stored in cA on each processor in cm and returns result on processor root ...
Definition: algstrct.cxx:367
double estimate_bcast_time(int64_t msg_sz)
Definition: common.cxx:295
virtual MPI_Op addmop() const
MPI addition operation for reductions.
Definition: algstrct.cxx:73
CommData * dim_comm
Definition: topology.h:20
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
virtual void scal(int n, char const *alpha, char *X, int incX) const
X["i"]=alpha*X["i"];.
Definition: algstrct.cxx:262
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
Definition: common.cxx:336
int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes need by each processor in this kernel and its recursive calls ...
Definition: spctr_comm.cxx:181
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
virtual spctr * clone()
Definition: spctr_tsr.h:19
topology * topo
topology to which the tensor is mapped
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
void run(char *A, char *B, char *C)
Definition: spctr_tsr.h:48
char const * beta
Definition: ctr_comm.h:170
bool is_sparse_B
Definition: spctr_tsr.h:13
virtual MPI_Datatype mdtype() const
MPI datatype.
Definition: algstrct.cxx:80
def np(self)
Definition: core.pyx:315