Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
sum_tsr.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "../shared/util.h"
4 #include "sum_tsr.h"
5 #include "sym_seq_sum.h"
6 #include "../interface/fun_term.h"
7 #include "../interface/idx_tensor.h"
8 
9 namespace CTF_int {
11  return Unifun_Term(A.clone(), this);
12  }
13 
14  void univar_function::operator()(Term const & A, Term const & B) const {
15  Unifun_Term ft(A.clone(), this);
16  ft.execute(B.execute(B.get_uniq_inds()));
17  }
18 
19  tsum::tsum(tsum * other){
20  A = other->A;
21  sr_A = other->sr_A;
22  alpha = other->alpha;
23  B = other->B;
24  beta = other->beta;
25  sr_B = other->sr_B;
26 
27  buffer = NULL;
28  }
29 
31  if (buffer != NULL) cdealloc(buffer);
32  }
33 
34  tsum::tsum(summation const * s){
35  A = s->A->data;
36  sr_A = s->A->sr;
37  alpha = s->alpha;
38 
39  B = s->B->data;
40  sr_B = s->B->sr;
41  beta = s->beta;
42 
43  buffer = NULL;
44  }
45 
47  cdealloc(virt_dim);
48  delete rec_tsum;
49  }
50 
51  tsum_virt::tsum_virt(tsum * other) : tsum(other) {
52  tsum_virt * o = (tsum_virt*)other;
53  rec_tsum = o->rec_tsum->clone();
54  num_dim = o->num_dim;
55  virt_dim = (int*)alloc(sizeof(int)*num_dim);
56  memcpy(virt_dim, o->virt_dim, sizeof(int)*num_dim);
57  }
58 
60  order_A = s->A->order;
61  idx_map_A = s->idx_A;
62  order_B = s->B->order;
63  idx_map_B = s->idx_B;
64  }
65 
67  return new tsum_virt(this);
68  }
69 
71  int i;
72  printf("tsum_virt:\n");
73  printf("blk_sz_A = %ld, blk_sz_B = %ld\n",
75  for (i=0; i<num_dim; i++){
76  printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
77  }
78  rec_tsum->print();
79  }
80 
81  int64_t tsum_virt::mem_fp(){
82  return (order_A+order_B+3*num_dim)*sizeof(int);
83  }
84 
86  int * idx_arr, * lda_A, * lda_B, * beta_arr;
87  int * ilda_A, * ilda_B;
88  int64_t i, off_A, off_B;
89  int nb_A, nb_B, alloced, ret;
90  TAU_FSTART(sum_virt);
91 
92  if (this->buffer != NULL){
93  alloced = 0;
94  idx_arr = (int*)this->buffer;
95  } else {
96  alloced = 1;
97  ret = alloc_ptr(mem_fp(), (void**)&idx_arr);
98  ASSERT(ret==0);
99  }
100 
101  lda_A = idx_arr + num_dim;
102  lda_B = lda_A + order_A;
103  ilda_A = lda_B + order_B;
104  ilda_B = ilda_A + num_dim;
105 
106 
107  #define SET_LDA_X(__X) \
108  do { \
109  nb_##__X = 1; \
110  for (i=0; i<order_##__X; i++){ \
111  lda_##__X[i] = nb_##__X; \
112  nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]]; \
113  } \
114  memset(ilda_##__X, 0, num_dim*sizeof(int)); \
115  for (i=0; i<order_##__X; i++){ \
116  ilda_##__X[idx_map_##__X[i]] += lda_##__X[i]; \
117  } \
118  } while (0)
119  SET_LDA_X(A);
120  SET_LDA_X(B);
121  #undef SET_LDA_X
122 
123  /* dynammically determined size */
124  beta_arr = (int*)alloc(sizeof(int)*nb_B);
125 
126  memset(idx_arr, 0, num_dim*sizeof(int));
127  memset(beta_arr, 0, nb_B*sizeof(int));
128  off_A = 0, off_B = 0;
129  rec_tsum->alpha = this->alpha;
130  rec_tsum->beta = this->beta;
131  for (;;){
132  rec_tsum->A = this->A + off_A*blk_sz_A*this->sr_A->el_size;
133  rec_tsum->B = this->B + off_B*blk_sz_B*this->sr_B->el_size;
134 // sr_B->copy(rec_tsum->beta, sr_B->mulid());
135  if (beta_arr[off_B]>0)
136  rec_tsum->beta = sr_B->mulid();
137  else
138  rec_tsum->beta = this->beta;
139 
140  rec_tsum->run();
141  beta_arr[off_B] = 1;
142 
143  for (i=0; i<num_dim; i++){
144  off_A -= ilda_A[i]*idx_arr[i];
145  off_B -= ilda_B[i]*idx_arr[i];
146  idx_arr[i]++;
147  if (idx_arr[i] >= virt_dim[i])
148  idx_arr[i] = 0;
149  off_A += ilda_A[i]*idx_arr[i];
150  off_B += ilda_B[i]*idx_arr[i];
151  if (idx_arr[i] != 0) break;
152  }
153  if (i==num_dim) break;
154  }
155  if (alloced){
156  cdealloc(idx_arr);
157  }
158  cdealloc(beta_arr);
159  TAU_FSTOP(sum_virt);
160  }
161 
163  int i;
164  printf("tsum_replicate: \n");
165  printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
166  cdt_A, size_A, ncdt_A);
167  for (i=0; i<ncdt_A; i++){
168  printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
169  }
170  printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
171  cdt_B, size_B, ncdt_B);
172  for (i=0; i<ncdt_B; i++){
173  printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
174  }
175 
176  rec_tsum->print();
177  }
178 
180  delete rec_tsum;
181 /* for (int i=0; i<ncdt_A; i++){
182  cdt_A[i]->deactivate();
183  }*/
184  if (ncdt_A > 0)
185  cdealloc(cdt_A);
186 /* for (int i=0; i<ncdt_B; i++){
187  cdt_B[i]->deactivate();
188  }*/
189  if (ncdt_B > 0)
190  cdealloc(cdt_B);
191  }
192 
194  tsum_replicate * o = (tsum_replicate*)other;
195  rec_tsum = o->rec_tsum->clone();
196  size_A = o->size_A;
197  size_B = o->size_B;
198  ncdt_A = o->ncdt_A;
199  ncdt_B = o->ncdt_B;
200  }
201 
202 
204  int const * phys_mapped,
205  int64_t blk_sz_A,
206  int64_t blk_sz_B) : tsum(s) {
207  int i;
208  int nphys_dim = s->A->topo->order;
209  this->ncdt_A = 0;
210  this->ncdt_B = 0;
211  this->size_A = blk_sz_A;
212  this->size_B = blk_sz_B;
213  this->cdt_A = NULL;
214  this->cdt_B = NULL;
215  for (i=0; i<nphys_dim; i++){
216  if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
217  this->ncdt_A++;
218  }
219  if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
220  this->ncdt_B++;
221  }
222  }
223  if (this->ncdt_A > 0)
224  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_A, (void**)&this->cdt_A);
225  if (this->ncdt_B > 0)
226  CTF_int::alloc_ptr(sizeof(CommData*)*this->ncdt_B, (void**)&this->cdt_B);
227  this->ncdt_A = 0;
228  this->ncdt_B = 0;
229  for (i=0; i<nphys_dim; i++){
230  if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
231  this->cdt_A[this->ncdt_A] = &s->A->topo->dim_comm[i];
232  this->ncdt_A++;
233  }
234  if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
235  this->cdt_B[this->ncdt_B] = &s->B->topo->dim_comm[i];
236  this->ncdt_B++;
237  }
238  }
239  ASSERT(this->ncdt_A == 0 || this->cdt_B == 0);
240 
241  }
242 
244  return new tsum_replicate(this);
245  }
246 
248  return 0;
249  }
250 
252  int brank, i;
253  char * buf = this->A;
254  for (i=0; i<ncdt_A; i++){
255  cdt_A[i]->bcast(this->A, size_A, sr_A->mdtype(), 0);
256  }
257 
258  /* for (i=0; i<ncdt_B; i++){
259  POST_BCAST(this->B, size_B*sizeof(dtype), COMM_CHAR_T, 0, cdt_B[i]-> 0);
260  }*/
261  brank = 0;
262  for (i=0; i<ncdt_B; i++){
263  brank += cdt_B[i]->rank;
264  }
265  if (brank != 0) sr_B->set(this->B, sr_B->addid(), size_B);
266 
267  rec_tsum->A = buf;
268  rec_tsum->B = this->B;
269  rec_tsum->alpha = this->alpha;
270  if (brank != 0)
271  rec_tsum->beta = sr_B->mulid();
272  else
273  rec_tsum->beta = this->beta;
274 
275  rec_tsum->run();
276 
277  if (buf != this->A) cdealloc(buf);
278 
279  for (i=0; i<ncdt_B; i++){
280  cdt_B[i]->allred(MPI_IN_PLACE, this->B, size_B, sr_B->mdtype(), sr_B->addmop());
281  }
282 
283  }
284 
285 
286  seq_tsr_sum::seq_tsr_sum(tsum * other) : tsum(other) {
287  seq_tsr_sum * o = (seq_tsr_sum*)other;
288 
289  order_A = o->order_A;
290  idx_map_A = o->idx_map_A;
291  sym_A = o->sym_A;
292  edge_len_A = (int*)alloc(sizeof(int)*order_A);
293  memcpy(edge_len_A, o->edge_len_A, sizeof(int)*order_A);
294 
295  order_B = o->order_B;
296  idx_map_B = o->idx_map_B;
297  sym_B = o->sym_B;
298  edge_len_B = (int*)alloc(sizeof(int)*order_B);
299  memcpy(edge_len_B, o->edge_len_B, sizeof(int)*order_B);
300 
301  is_inner = o->is_inner;
302  inr_stride = o->inr_stride;
303 
304  map_pfx = o->map_pfx;
305 
306  is_custom = o->is_custom;
307  func = o->func;
308  }
309 
311  order_A = s->A->order;
312  sym_A = s->A->sym;
313  idx_map_A = s->idx_A;
314  order_B = s->B->order;
315  sym_B = s->B->sym;
316  idx_map_B = s->idx_B;
317  is_custom = s->is_custom;
318 
319  map_pfx = 1;
320  }
321 
323  int i;
324  printf("seq_tsr_sum:\n");
325  for (i=0; i<order_A; i++){
326  printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
327  }
328  for (i=0; i<order_B; i++){
329  printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
330  }
331  printf("is inner = %d\n", is_inner);
332  if (is_inner) printf("inner stride = %d\n", inr_stride);
333  printf("map_pfx = %ld\n", map_pfx);
334  }
335 
337  return new seq_tsr_sum(this);
338  }
339 
340  int64_t seq_tsr_sum::mem_fp(){ return 0; }
341 
343  if (is_custom){
344  ASSERT(is_inner == 0);
345  sym_seq_sum_cust(this->alpha,
346  this->A,
347  this->sr_A,
348  order_A,
349  edge_len_A,
350  sym_A,
351  idx_map_A,
352  this->beta,
353  this->B,
354  this->sr_B,
355  order_B,
356  edge_len_B,
357  sym_B,
358  idx_map_B,
359  func);
360  } else if (is_inner){
361  sym_seq_sum_inr(this->alpha,
362  this->A,
363  this->sr_A,
364  order_A,
365  edge_len_A,
366  sym_A,
367  idx_map_A,
368  this->beta,
369  this->B,
370  this->sr_B,
371  order_B,
372  edge_len_B,
373  sym_B,
374  idx_map_B,
375  inr_stride);
376  } else {
377  sym_seq_sum_ref(this->alpha,
378  this->A,
379  this->sr_A,
380  order_A,
381  edge_len_A,
382  sym_A,
383  idx_map_A,
384  this->beta,
385  this->B,
386  this->sr_B,
387  order_B,
388  edge_len_B,
389  sym_B,
390  idx_map_B);
391  }
392 
393  }
394 }
a term is an abstract object representing some expression of tensors
Definition: term.h:33
bool is_custom
whether there is a elementwise custom function
Definition: summation.h:32
void execute(CTF::Idx_Tensor output) const
evalues the expression, which just scales by default
Definition: fun_term.cxx:33
CommData ** cdt_A
Definition: sum_tsr.h:129
algstrct const * sr_A
Definition: sum_tsr.h:70
void * buffer
Definition: sum_tsr.h:75
virtual void execute(CTF::Idx_Tensor output) const =0
evalues the expression, which just scales by default
int * sym
symmetries among tensor dimensions
int * idx_A
indices of left operand
Definition: summation.h:28
int sym_seq_sum_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B)
performs symmetric contraction with unblocked reference kernel
Unifun_Term operator()(Term const &A) const
evaluate B=f(A)
Definition: sum_tsr.cxx:10
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm
Definition: common.cxx:360
int const * idx_map_B
Definition: sum_tsr.h:155
int const * idx_map_A
Definition: sum_tsr.h:151
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
tsum * clone()
Definition: sum_tsr.cxx:66
#define SET_LDA_X(__X)
tsum * rec_tsum
Definition: sum_tsr.h:94
void run()
wraps user sequential function signature
Definition: sum_tsr.cxx:342
virtual char const * addid() const
MPI datatype for pairs.
Definition: algstrct.cxx:89
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: sum_tsr.cxx:81
int64_t blk_sz_B
Definition: sum_tsr.h:102
int order
number of tensor dimensions
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: sum_tsr.cxx:340
int const * idx_map_A
Definition: sum_tsr.h:100
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
performs replication along a dimension, generates 2.5D algs
Definition: sum_tsr.h:122
virtual Term * clone(std::map< tensor *, tensor * > *remap=NULL) const =0
base classes must implement this copy function to retrieve pointer
tsum_replicate(tsum *other)
Definition: sum_tsr.cxx:193
char const * alpha
scaling of A
Definition: summation.h:23
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
virtual void run()
Definition: sum_tsr.h:77
int64_t mem_fp()
returns the number of bytes of buffer space needed
Definition: sum_tsr.cxx:247
algstrct * sr
algstrct on which tensor elements and operations are defined
tsum_virt(tsum *other)
iterates over the dense virtualization block grid and contracts
Definition: sum_tsr.cxx:51
tsum(tsum *other)
Definition: sum_tsr.cxx:19
virtual MPI_Op addmop() const
MPI addition operation for reductions.
Definition: algstrct.cxx:73
univar_function const * func
Definition: sum_tsr.h:165
CommData * dim_comm
Definition: topology.h:20
int * idx_B
indices of output
Definition: summation.h:30
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
char const * alpha
Definition: sum_tsr.h:71
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
Definition: common.cxx:336
virtual std::vector< char > get_uniq_inds() const =0
find list of unique indices that are involved in this term
char * B
Definition: sum_tsr.h:72
CommData ** cdt_B
Definition: sum_tsr.h:130
seq_tsr_sum(tsum *other)
copies sum object
Definition: sum_tsr.cxx:286
int sym_seq_sum_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, univar_function const *func)
performs symmetric summation with custom elementwise function
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
char const * beta
scaling of existing B
Definition: summation.h:25
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
tensor * B
output
Definition: summation.h:20
char * data
tensor data, either the data or the key-value pairs should exist at any given time ...
algstrct const * sr_B
Definition: sum_tsr.h:73
virtual tsum * clone()
Definition: sum_tsr.h:85
virtual ~tsum()
Definition: sum_tsr.cxx:30
topology * topo
topology to which the tensor is mapped
int sym_seq_sum_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, int inr_stride)
performs symmetric summation with blocked daxpy
tensor * A
left operand
Definition: summation.h:18
class for execution distributed summation of tensors
Definition: summation.h:15
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
int64_t blk_sz_A
Definition: sum_tsr.h:99
int const * idx_map_B
Definition: sum_tsr.h:103
char * A
Definition: sum_tsr.h:69
virtual void print()
Definition: sum_tsr.h:78
virtual MPI_Datatype mdtype() const
MPI datatype.
Definition: algstrct.cxx:80
def np(self)
Definition: core.pyx:315
char const * beta
Definition: sum_tsr.h:74