Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
ctr_comm.cxx
/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/

#include "../shared/util.h"
#include "ctr_comm.h"
#include "contraction.h"
#include "../interface/fun_term.h"
#include "../interface/idx_tensor.h"
#include "../tensor/untyped_tensor.h"

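// Communication wrappers for distributed tensor contractions: the generic
// ctr base kernel and ctr_replicate, which broadcasts replicated blocks of
// the operands A and B, invokes a recursive contraction kernel, and reduces
// the partial results of the output C with the addition operator of its
// algebraic structure.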
namespace CTF_int {
  // builds a lazily evaluated term representing this function applied to A and B
  Bifun_Term bivar_function::operator()(Term const & A, Term const & B) const {
    return Bifun_Term(A.clone(), B.clone(), this);
  }

  // evaluates C += f(A,B) (or f(A,B,C) if this function is a transform)
  void bivar_function::operator()(Term const & A, Term const & B, Term const & C) const {
    Bifun_Term ft(A.clone(), B.clone(), this);
    ft.execute(C.execute(C.get_uniq_inds()));
  }

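  // the generic ctr kernel records the algebraic structures of the three
  // operands and the scaling factor beta applied to the existing contents of
  // C; derived kernels such as ctr_replicate layer communication and
  // recursion on top of this state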
  ctr::ctr(contraction const * c){
    sr_A = c->A->sr;
    sr_B = c->B->sr;
    sr_C = c->C->sr;
    beta = c->beta;
    idx_lyr = 0;
    num_lyr = 1;
  }

  ctr::ctr(ctr * other){
    sr_A = other->sr_A;
    sr_B = other->sr_B;
    sr_C = other->sr_C;
    beta = other->beta;
    num_lyr = other->num_lyr;
    idx_lyr = other->idx_lyr;
  }

  ctr::~ctr(){
  }

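  // ctr_replicate is built from the physical mapping of the contraction:
  // phys_mapped holds three flags per processor-grid dimension (one each for
  // A, B and C); a zero flag marks a tensor that is not mapped along that
  // grid dimension, so its block must be broadcast (A, B) or reduced (C)
  // over that dimension's communicator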
  ctr_replicate::ctr_replicate(contraction const * c,
                               int const *         phys_mapped,
                               int64_t             blk_sz_A,
                               int64_t             blk_sz_B,
                               int64_t             blk_sz_C)
       : ctr(c) {
    int i;
    int nphys_dim = c->A->topo->order;
    this->ncdt_A = 0;
    this->ncdt_B = 0;
    this->ncdt_C = 0;
    this->size_A = blk_sz_A;
    this->size_B = blk_sz_B;
    this->size_C = blk_sz_C;
    this->cdt_A = NULL;
    this->cdt_B = NULL;
    this->cdt_C = NULL;
    // first pass: count the communicators needed for each tensor
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[3*i+0] == 0 &&
          phys_mapped[3*i+1] == 0 &&
          phys_mapped[3*i+2] == 0){
/*      printf("ERROR: ALL-TENSOR REPLICATION NO LONGER DONE\n");
        ABORT;
        ASSERT(this->num_lyr == 1);
        hctr->idx_lyr = A->topo->dim_comm[i].rank;
        hctr->num_lyr = A->topo->dim_comm[i]->np;
        this->idx_lyr = A->topo->dim_comm[i].rank;
        this->num_lyr = A->topo->dim_comm[i]->np;*/
      } else {
        if (phys_mapped[3*i+0] == 0){
          this->ncdt_A++;
        }
        if (phys_mapped[3*i+1] == 0){
          this->ncdt_B++;
        }
        if (phys_mapped[3*i+2] == 0){
          this->ncdt_C++;
        }
      }
    }
    if (this->ncdt_A > 0)
      CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_A, (void**)&this->cdt_A);
    if (this->ncdt_B > 0)
      CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_B, (void**)&this->cdt_B);
    if (this->ncdt_C > 0)
      CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_C, (void**)&this->cdt_C);
    this->ncdt_A = 0;
    this->ncdt_B = 0;
    this->ncdt_C = 0;
    // second pass: record the grid communicators over which each tensor is
    // replicated
    for (i=0; i<nphys_dim; i++){
      if (!(phys_mapped[3*i+0] == 0 &&
            phys_mapped[3*i+1] == 0 &&
            phys_mapped[3*i+2] == 0)){
        if (phys_mapped[3*i+0] == 0){
          this->cdt_A[this->ncdt_A] = &c->A->topo->dim_comm[i];
/*        if (is_used && this->cdt_A[this->ncdt_A].alive == 0)
            this->cdt_A[this->ncdt_A].activate(global_comm.cm);*/
          this->ncdt_A++;
        }
        if (phys_mapped[3*i+1] == 0){
          this->cdt_B[this->ncdt_B] = &c->B->topo->dim_comm[i];
/*        if (is_used && this->cdt_B[this->ncdt_B].alive == 0)
            this->cdt_B[this->ncdt_B].activate(global_comm.cm);*/
          this->ncdt_B++;
        }
        if (phys_mapped[3*i+2] == 0){
          this->cdt_C[this->ncdt_C] = &c->C->topo->dim_comm[i];
/*        if (is_used && this->cdt_C[this->ncdt_C].alive == 0)
            this->cdt_C[this->ncdt_C].activate(global_comm.cm);*/
          this->ncdt_C++;
        }
      }
    }
  }

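  // the destructor releases the recursive kernel and the arrays of
  // communicator pointers allocated in the constructor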
  ctr_replicate::~ctr_replicate() {
    delete rec_ctr;
/*  for (int i=0; i<ncdt_A; i++){
      cdt_A[i]->deactivate();
    }*/
    if (ncdt_A > 0)
      CTF_int::cdealloc(cdt_A);
/*  for (int i=0; i<ncdt_B; i++){
      cdt_B[i]->deactivate();
    }*/
    if (ncdt_B > 0)
      CTF_int::cdealloc(cdt_B);
/*  for (int i=0; i<ncdt_C; i++){
      cdt_C[i]->deactivate();
    }*/
    if (ncdt_C > 0)
      CTF_int::cdealloc(cdt_C);
  }

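  // copy-constructs from another ctr_replicate kernel; the recursive kernel
  // is cloned so the copy owns its own recursion tree, while block sizes and
  // communicator counts are copied directly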
  ctr_replicate::ctr_replicate(ctr * other) : ctr(other) {
    ctr_replicate * o = (ctr_replicate*)other;
    rec_ctr = o->rec_ctr->clone();
    size_A = o->size_A;
    size_B = o->size_B;
    size_C = o->size_C;
    ncdt_A = o->ncdt_A;
    ncdt_B = o->ncdt_B;
    ncdt_C = o->ncdt_C;
  }

  ctr * ctr_replicate::clone() {
    return new ctr_replicate(this);
  }

  void ctr_replicate::print() {
    int i;
    printf("ctr_replicate: \n");
    printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
            cdt_A, size_A, ncdt_A);
    for (i=0; i<ncdt_A; i++){
      printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
    }
    printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
            cdt_B, size_B, ncdt_B);
    for (i=0; i<ncdt_B; i++){
      printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
    }
    printf("cdt_C = %p, size_C = %ld, ncdt_C = %d\n",
            cdt_C, size_C, ncdt_C);
    for (i=0; i<ncdt_C; i++){
      printf("cdt_C[%d] length = %d\n",i,cdt_C[i]->np);
    }
    rec_ctr->print();
  }

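  // estimated time of the communication performed by this kernel alone: the
  // sum over every replication communicator of the broadcast time for the A
  // and B blocks plus the reduction time for the C block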
  double ctr_replicate::est_time_fp(int nlyr){
    int i;
    double tot_sz;
    tot_sz = 0.0;
    for (i=0; i<ncdt_A; i++){
      ASSERT(cdt_A[i]->np > 0);
      tot_sz += cdt_A[i]->estimate_bcast_time(size_A*sr_A->el_size);
    }
    for (i=0; i<ncdt_B; i++){
      ASSERT(cdt_B[i]->np > 0);
      tot_sz += cdt_B[i]->estimate_bcast_time(size_B*sr_B->el_size);
    }
    for (i=0; i<ncdt_C; i++){
      ASSERT(cdt_C[i]->np > 0);
      tot_sz += cdt_C[i]->estimate_red_time(size_C*sr_C->el_size, sr_C->addmop());
    }
    return tot_sz;
  }

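  // estimated execution time of this kernel plus all of its recursive calls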
  double ctr_replicate::est_time_rec(int nlyr) {
    return rec_ctr->est_time_rec(nlyr) + est_time_fp(nlyr);
  }

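  // replication needs no auxiliary buffer space of its own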
  int64_t ctr_replicate::mem_fp(){
    return 0;
  }

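  // total buffer space needed by this kernel and all of its recursive calls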
  int64_t ctr_replicate::mem_rec() {
    return rec_ctr->mem_rec() + mem_fp();
  }

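  // runs the replicated contraction: broadcast the local blocks of A and B
  // from the root of every replication communicator, adjust beta so that
  // only the reduction roots accumulate into the existing contents of C,
  // invoke the recursive contraction kernel, and finally reduce the partial
  // C blocks back onto the roots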
  void ctr_replicate::run(char * A, char * B, char * C){
    int arank, brank, crank, i;

    arank = 0, brank = 0, crank = 0;
    // broadcast the blocks of A and B over every communicator along which
    // they are replicated
    for (i=0; i<ncdt_A; i++){
      arank += cdt_A[i]->rank;
//    POST_BCAST(A, size_A*sr_A->el_size, COMM_CHAR_T, 0, cdt_A[i]-> 0);
      cdt_A[i]->bcast(A, size_A, sr_A->mdtype(), 0);
    }
    for (i=0; i<ncdt_B; i++){
      brank += cdt_B[i]->rank;
//    POST_BCAST(B, size_B*sr_B->el_size, COMM_CHAR_T, 0, cdt_B[i]-> 0);
      cdt_B[i]->bcast(B, size_B, sr_B->mdtype(), 0);
    }
    for (i=0; i<ncdt_C; i++){
      crank += cdt_C[i]->rank;
    }
//  if (crank != 0) this->sr_C->set(C, this->sr_C->addid(), size_C);
//  else {
    // only the reduction root scales the existing contents of C by beta
    if (crank == 0 && !sr_C->isequal(this->beta, sr_C->mulid())){
      if (sr_C->isequal(this->beta, sr_C->addid())){
        sr_C->set(C, sr_C->addid(), size_C);
      } else {
        sr_C->scal(size_C, this->beta, C, 1);
      }

/*    for (i=0; i<size_C; i++){
        sr_C->mul(this->beta, C+i*sr_C->el_size, C+i*sr_C->el_size);
      }*/
    }
//
    //sr_C->set(C, sr_C->addid(), size_C);
    // non-root copies overwrite C (beta = additive identity) so that the
    // reduction below does not double-count its previous contents
    if (crank != 0)
      rec_ctr->beta = sr_C->addid();
    else
      rec_ctr->beta = sr_C->mulid();

    rec_ctr->num_lyr = this->num_lyr;
    rec_ctr->idx_lyr = this->idx_lyr;

    rec_ctr->run(A, B, C);

    /*for (i=0; i<size_C; i++){
      printf("P%d C[%d] = %lf\n",crank,i, ((double*)C)[i]);
    }*/
    // reduce the partial results of C onto the root of each replication
    // communicator using the addition operator of sr_C
    for (i=0; i<ncdt_C; i++){
      //ALLREDUCE(MPI_IN_PLACE, C, size_C, sr_C->mdtype(), sr_C->addmop(), cdt_C[i]->;
      if (cdt_C[i]->rank == 0)
        cdt_C[i]->red(MPI_IN_PLACE, C, size_C, sr_C->mdtype(), sr_C->addmop(), 0);
      else
        cdt_C[i]->red(C, NULL, size_C, sr_C->mdtype(), sr_C->addmop(), 0);
    }

    // zero out the replicated copies of A and B on non-root processes
    if (arank != 0 && this->sr_A->addid() != NULL){
      this->sr_A->set(A, this->sr_A->addid(), size_A);
    }
    if (brank != 0 && this->sr_B->addid() != NULL){
      this->sr_B->set(B, this->sr_B->addid(), size_B);
    }
  }
}
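In MPI terms, the replication pattern implemented by ctr_replicate::run reduces to a broadcast of each replicated input block followed by a sum-reduction of the output block. The sketch below illustrates that pattern with plain MPI on double-precision data over a single replication communicator; it is a minimal, self-contained illustration and not part of CTF (the function name replicate_contract, the local_contract callback, and the double-typed buffers are invented for the example).

#include <mpi.h>
#include <algorithm>

// Illustrative only: broadcast replicated copies of the local blocks of A and
// B over one replication communicator, call a local contraction kernel, then
// sum-reduce the partial C blocks onto the root. local_contract stands in for
// the recursive kernel (rec_ctr) and is not a CTF function.
void replicate_contract(double * A, int nA,
                        double * B, int nB,
                        double * C, int nC,
                        MPI_Comm repl_comm,
                        void (*local_contract)(double const *, double const *, double *)) {
  int rank;
  MPI_Comm_rank(repl_comm, &rank);
  // every process obtains the root's copy of the replicated operands
  MPI_Bcast(A, nA, MPI_DOUBLE, 0, repl_comm);
  MPI_Bcast(B, nB, MPI_DOUBLE, 0, repl_comm);
  // non-root processes accumulate into a zeroed buffer, mirroring the
  // rec_ctr->beta = addid() branch above, so the reduction does not
  // double-count the prior contents of C
  if (rank != 0) std::fill(C, C + nC, 0.0);
  local_contract(A, B, C);
  // combine the partial results onto the root
  if (rank == 0)
    MPI_Reduce(MPI_IN_PLACE, C, nC, MPI_DOUBLE, MPI_SUM, 0, repl_comm);
  else
    MPI_Reduce(C, nullptr, nC, MPI_DOUBLE, MPI_SUM, 0, repl_comm);
}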