Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
strp_tsr.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "../shared/util.h"
4 #include "strp_tsr.h"
5 
6 namespace CTF_int {
7 
9  alloced = o->alloced;
10  order = o->order;
11  blk_sz = o->blk_sz;
12  edge_len = o->edge_len;
13  strip_dim = o->strip_dim;
14  strip_idx = o->strip_idx;
15  A = o->A;
16  buffer = NULL;
17  }
18 
20  return new strp_tsr(this);
21  }
22 
23  int64_t strp_tsr::mem_fp(){
24  int i;
25  int64_t sub_sz;
26  sub_sz = blk_sz;
27  for (i=0; i<order; i++){
28  sub_sz = sub_sz * edge_len[i] / strip_dim[i];
29  }
30  return sub_sz*sr_A->el_size;
31  }
32 
// strips out part of tensor to be operated on
//
// dir == 0 : moves the selected strip of A into buffer (allocating buffer
//            first when it is still NULL).
// dir != 0 : moves buffer back into the same strip of A; when dir == 1 and
//            this object allocated the buffer, the buffer pointer is reset.
//
// The strip along dimension i covers the index range
//   [strip_idx[i]*(edge_len[i]/strip_dim[i]), (strip_idx[i]+1)*(edge_len[i]/strip_dim[i])).
// Offsets toff/boff are counted in units of blk_sz elements.
//
// NOTE(review): the listing's own line numbering skips 34, 85 and 91 inside
// this function, so statements appear to be missing from this copy (the
// page's cross-reference list mentions TAU_FSTART/TAU_FSTOP and a buffer
// deallocation) -- confirm against the repository before relying on it.
33  void strp_tsr::run(int const dir){
35  int i, ilda, toff, boff, ret;
36  int * idx_arr, * lda;
37 
// On the forward pass, remember whether the buffer was supplied from outside
// (alloced = 0) or had to be allocated here with mem_fp() bytes (alloced = 1).
38  if (dir == 0) {
39  if (buffer != NULL){
40  alloced = 0;
41  } else {
42  alloced = 1;
43  ret = CTF_int::alloc_ptr(mem_fp(), (void**)&this->buffer);
44  ASSERT(ret==0);
45  }
46  }
// Scratch arrays: idx_arr holds the current index per dimension, lda the
// stride of each dimension in the full (unstripped) tensor.
47  idx_arr = (int*)CTF_int::alloc(sizeof(int)*order);
48  lda = (int*)CTF_int::alloc(sizeof(int)*order);
49  memset(idx_arr, 0, sizeof(int)*order);
50 
// lda[i] is the product of edge_len[0..i-1]; idx_arr[i] starts at the first
// index of the strip in dimension i; toff accumulates the matching offset
// into A.
51  ilda = 1, toff = 0;
52  for (i=0; i<order; i++){
53  lda[i] = ilda;
54  ilda *= edge_len[i];
55  idx_arr[i] = strip_idx[i]*(edge_len[i]/strip_dim[i]);
56  toff += idx_arr[i]*lda[i];
57  DPRINTF(3,"[%d] sidx = %d, sdim = %d, edge_len = %d\n", i, strip_idx[i], strip_dim[i], edge_len[i]);
58  }
59 
// boff is the running offset into the packed buffer.
60  boff = 0;
61  for (;;){
// Each iteration moves one run along dimension 0 of
// (edge_len[0]/strip_dim[0])*blk_sz elements. Presumably the three-argument
// copy moves that many elements from its second argument to its first, by
// analogy with copy(a, b) -- TODO confirm in algstrct.
62  if (dir)
63  sr_A->copy(A+sr_A->el_size*toff*blk_sz, buffer+sr_A->el_size*boff*blk_sz, (edge_len[0]/strip_dim[0])*blk_sz);
64  else {
65  /* printf("boff = %d, toff = %d blk_sz = " PRId64 " mv_ez=" PRId64 "\n",boff,toff,blk_sz,
66  (edge_len[0]/strip_dim[0])*blk_sz*sr_A->el_size);*/
67  sr_A->copy(buffer+sr_A->el_size*boff*blk_sz, A+sr_A->el_size*toff*blk_sz, (edge_len[0]/strip_dim[0])*blk_sz);
68  }
69  boff += (edge_len[0]/strip_dim[0]);
70 
// Advance the indices of dimensions 1..order-1 like an odometer restricted to
// the strip range: bump dimension i, wrap it to the start of its strip when
// it runs past the end, and carry into the next dimension only if it
// wrapped. toff is kept in sync by removing the old contribution and adding
// the new one.
71  for (i=1; i<order; i++){
72  toff -= idx_arr[i]*lda[i];
73  idx_arr[i]++;
74  if (idx_arr[i] >= (strip_idx[i]+1)*(edge_len[i]/strip_dim[i]))
75  idx_arr[i] = strip_idx[i]*(edge_len[i]/strip_dim[i]);
76  toff += idx_arr[i]*lda[i];
77  if (idx_arr[i] != strip_idx[i]*(edge_len[i]/strip_dim[i])) break;
78  }
// Every dimension wrapped, so the whole strip has been visited.
79  if (i==order) break;
80  }
81 
82 
// After writing back, forget a buffer this object allocated itself.
// NOTE(review): listing line 85 is absent here; as shown, only the pointer
// is cleared -- confirm whether a deallocation call belongs before it.
83  if (dir == 1) {
84  if (alloced){
86  buffer = NULL;
87  }
88  }
89  CTF_int::cdealloc(idx_arr);
90  CTF_int::cdealloc(lda);
92  }
93 
95  if (alloced){
97  buffer = NULL;
98  }
99  }
100 
102  delete rec_tsum;
103  if (strip_A)
104  delete rec_strp_A;
105  if (strip_B)
106  delete rec_strp_B;
107  }
108 
109  strp_sum::strp_sum(tsum * other) : tsum(other) {
110  strp_sum * o = (strp_sum*)other;
111  rec_tsum = o->rec_tsum->clone();
112  rec_strp_A = o->rec_strp_A->clone();
113  rec_strp_B = o->rec_strp_B->clone();
114  strip_A = o->strip_A;
115  strip_B = o->strip_B;
116  }
117 
118  strp_sum::strp_sum(summation const * s) : tsum(s) { }
119 
121  return new strp_sum(this);
122  }
123 
124  int64_t strp_sum::mem_fp(){
125  return 0;
126  }
127 
129  char * bA, * bB;
130 
131  if (strip_A) {
132  rec_strp_A->A = this->A;
133  rec_strp_A->run(0);
134  bA = rec_strp_A->buffer;
135  } else {
136  bA = this->A;
137  }
138  if (strip_B) {
139  rec_strp_B->A = this->B;
140  rec_strp_B->run(0);
141  bB = rec_strp_B->buffer;
142  } else {
143  bB = this->B;
144  }
145 
146  rec_tsum->A = bA;
147  rec_tsum->B = bB;
148  rec_tsum->alpha = this->alpha;
149  rec_tsum->beta = this->beta;
150  rec_tsum->run();
151 
152  if (strip_A) rec_strp_A->free_exp();
153  if (strip_B) rec_strp_B->run(1);
154 
155  }
156 
157 
159  delete rec_ctr;
160  if (strip_A)
161  delete rec_strp_A;
162  if (strip_B)
163  delete rec_strp_B;
164  if (strip_C)
165  delete rec_strp_C;
166  }
167 
168  strp_ctr::strp_ctr(ctr * other) : ctr(other) {
169  strp_ctr * o = (strp_ctr*)other;
170  rec_ctr = o->rec_ctr->clone();
171  rec_strp_A = o->rec_strp_A->clone();
172  rec_strp_B = o->rec_strp_B->clone();
173  rec_strp_C = o->rec_strp_C->clone();
174  strip_A = o->strip_A;
175  strip_B = o->strip_B;
176  strip_C = o->strip_C;
177  }
178 
180  return new strp_ctr(this);
181  }
182 
183  int64_t strp_ctr::mem_fp(){
184  return 0;
185  }
186 
187  int64_t strp_ctr::mem_rec() {
188  return rec_ctr->mem_rec() + mem_fp();
189  }
190 
191  double strp_ctr::est_time_rec(int nlyr) {
192  return rec_ctr->est_time_rec(nlyr);
193  }
194 
195 
196  void strp_ctr::run(char * A, char * B, char * C){
197  char * bA, * bB, * bC;
198 
199  if (strip_A) {
200  rec_strp_A->A = A;
201  rec_strp_A->run(0);
202  bA = rec_strp_A->buffer;
203  } else {
204  bA = A;
205  }
206  if (strip_B) {
207  rec_strp_B->A = B;
208  rec_strp_B->run(0);
209  bB = rec_strp_B->buffer;
210  } else {
211  bB = B;
212  }
213  if (strip_C) {
214  rec_strp_C->A = C;
215  rec_strp_C->run(0);
216  bC = rec_strp_C->buffer;
217  } else {
218  bC = C;
219  }
220 
221 
222  rec_ctr->num_lyr = this->num_lyr;
223  rec_ctr->idx_lyr = this->idx_lyr;
224  rec_ctr->beta = this->beta;
225  rec_ctr->run(bA, bB, bC);
226 
227  if (strip_A) rec_strp_A->free_exp();
228  if (strip_B) rec_strp_B->free_exp();
229  if (strip_C) rec_strp_C->run(1);
230 
231  }
232 
234  delete rec_scl;
235  delete rec_strp;
236  }
237 
238  strp_scl::strp_scl(scl * other) : scl(other) {
239  strp_scl * o = (strp_scl*)other;
240  rec_scl = o->rec_scl->clone();
241  rec_strp = o->rec_strp->clone();
242  }
243 
245  return new strp_scl(this);
246  }
247 
248  int64_t strp_scl::mem_fp(){
249  return 0;
250  }
251 
253  char * bA;
254 
255  rec_strp->A = this->A;
256  rec_strp->run(0);
257  bA = rec_strp->buffer;
258 
259  /* printf("alpha = %lf %lf\n",
260  ((std::complex<double>)this->alpha).real(),
261  ((std::complex<double>)this->alpha).imag());
262  printf("A[0] = %lf %lf\n",
263  ((std::complex<double>)bA[0]).real(),
264  ((std::complex<double>)bA[0]).imag());*/
265 
266  rec_scl->A = bA;
267  rec_scl->alpha = this->alpha;
268  rec_scl->run();
269 
270  rec_strp->run(1);
271  }
272 
273  int strip_diag(int order,
274  int order_tot,
275  int const * idx_map,
276  int64_t vrt_sz,
277  mapping const * edge_map,
278  topology const * topo,
279  algstrct const * sr,
280  int * blk_edge_len,
281  int64_t * blk_sz,
282  strp_tsr ** stpr){
283  int64_t i;
284  int need_strip;
285  int * pmap, * edge_len, * sdim, * sidx;
286  strp_tsr * stripper;
287 
288  CTF_int::alloc_ptr(order_tot*sizeof(int), (void**)&pmap);
289 
290  std::fill(pmap, pmap+order_tot, -1);
291 
292  need_strip = 0;
293 
294  for (i=0; i<order; i++){
295  if (edge_map[i].type == PHYSICAL_MAP) {
296  ASSERT(pmap[idx_map[i]] == -1);
297  pmap[idx_map[i]] = i;
298  }
299  }
300  for (i=0; i<order; i++){
301  if (edge_map[i].type == VIRTUAL_MAP && pmap[idx_map[i]] != -1)
302  need_strip = 1;
303  }
304  if (need_strip == 0) {
305  CTF_int::cdealloc(pmap);
306  return 0;
307  }
308 
309  CTF_int::alloc_ptr(order*sizeof(int), (void**)&edge_len);
310  CTF_int::alloc_ptr(order*sizeof(int), (void**)&sdim);
311  CTF_int::alloc_ptr(order*sizeof(int), (void**)&sidx);
312  stripper = new strp_tsr;
313 
314  std::fill(sdim, sdim+order, 1);
315  std::fill(sidx, sidx+order, 0);
316 
317  for (i=0; i<order; i++){
318  edge_len[i] = edge_map[i].calc_phase()/edge_map[i].calc_phys_phase();
319  //if (edge_map[i].type == VIRTUAL_MAP) {
320  // edge_len[i] = edge_map[i].np;
321  //}
322  //if (edge_map[i].type == PHYSICAL_MAP && edge_map[i].has_child) {
323  //dont allow recursive mappings for self indices
324  // or things get weird here
325  //ASSERT(edge_map[i].child->type == VIRTUAL_MAP);
326  // edge_len[i] = edge_map[i].child->np;
327  // }
328  if (edge_map[i].type == VIRTUAL_MAP && pmap[idx_map[i]] != -1) {
329  sdim[i] = edge_len[i];
330  sidx[i] = edge_map[pmap[idx_map[i]]].calc_phys_rank(topo);
331  ASSERT(edge_map[i].np == edge_map[pmap[idx_map[i]]].np);
332  }
333  blk_edge_len[i] = blk_edge_len[i] / sdim[i];
334  *blk_sz = (*blk_sz) / sdim[i];
335  }
336 
337  stripper->alloced = 0;
338  stripper->order = order;
339  stripper->edge_len = edge_len;
340  stripper->strip_dim = sdim;
341  stripper->strip_idx = sidx;
342  stripper->buffer = NULL;
343  stripper->blk_sz = vrt_sz;
344  stripper->sr_A = sr;
345 
346  *stpr = stripper;
347 
348  CTF_int::cdealloc(pmap);
349 
350  return 1;
351  }
352 
353 
354 }
scl * clone()
copies strp_scl object
Definition: strp_tsr.cxx:244
int calc_phys_rank(topology const *topo) const
compute the physical rank of a mapping
Definition: mapping.cxx:74
char const * alpha
Definition: scale_tsr.h:16
int64_t mem_fp()
gets memory usage of op
Definition: strp_tsr.cxx:248
strp_tsr * clone()
copies strp_tsr object
Definition: strp_tsr.cxx:19
int64_t mem_fp()
returns the number of bytes of buffer space we need recursively
Definition: strp_tsr.cxx:183
void run(char *A, char *B, char *C)
runs strip for contraction of tensors
Definition: strp_tsr.cxx:196
#define DPRINTF(...)
Definition: util.h:235
int calc_phase() const
compute the phase of a mapping
Definition: mapping.cxx:39
strp_tsr * rec_strp_B
Definition: strp_tsr.h:95
char * A
Definition: scale_tsr.h:14
virtual int64_t mem_rec()
Definition: ctr_comm.h:177
algstrct const * sr_A
Definition: strp_tsr.h:24
int calc_phys_phase() const
compute the physical phase of a mapping
Definition: mapping.cxx:57
virtual void copy(char *a, char const *b) const
copies element b to element a
Definition: algstrct.cxx:538
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
strp_tsr * rec_strp_A
Definition: strp_tsr.h:128
strp_tsr * rec_strp_A
Definition: strp_tsr.h:94
tsum * rec_tsum
Definition: strp_tsr.h:92
~strp_scl()
destructor
Definition: strp_tsr.cxx:233
tsum * clone()
copies strp_sum object
Definition: strp_tsr.cxx:120
int strip_diag(int order, int order_tot, int const *idx_map, int64_t vrt_sz, mapping const *edge_map, topology const *topo, algstrct const *sr, int *blk_edge_len, int64_t *blk_sz, strp_tsr **stpr)
build stack required for stripping out diagonals of tensor
Definition: strp_tsr.cxx:273
virtual void run()
Definition: scale_tsr.h:19
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
strp_tsr * rec_strp_B
Definition: strp_tsr.h:129
void run()
runs strip for scale of tensor
Definition: strp_tsr.cxx:252
virtual void run()
Definition: sum_tsr.h:77
virtual ctr * clone()
Definition: ctr_comm.h:180
strp_ctr(ctr *other)
copies strp_ctr object
Definition: strp_tsr.cxx:168
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
char const * alpha
Definition: sum_tsr.h:71
virtual double est_time_rec(int nlyr)
Definition: ctr_comm.h:179
void run()
runs strip for sum of tensors
Definition: strp_tsr.cxx:128
char * B
Definition: sum_tsr.h:72
int64_t blk_sz
Definition: strp_tsr.h:18
strp_tsr * rec_strp_C
Definition: strp_tsr.h:130
virtual void run(char *A, char *B, char *C)
Definition: ctr_comm.h:174
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
Definition: algstrct.h:34
virtual tsum * clone()
Definition: sum_tsr.h:85
strp_sum(tsum *other)
Definition: strp_tsr.cxx:109
virtual scl * clone()
Definition: scale_tsr.h:21
int64_t mem_fp()
gets memory usage of op
Definition: strp_tsr.cxx:124
class for execution distributed summation of tensors
Definition: summation.h:15
void free_exp()
deallocates buffer
Definition: strp_tsr.cxx:94
strp_tsr * rec_strp
Definition: strp_tsr.h:61
~strp_ctr()
destructor
Definition: strp_tsr.cxx:158
~strp_sum()
destructor
Definition: strp_tsr.cxx:101
char * A
Definition: sum_tsr.h:69
ctr * clone()
copies strp_ctr object
Definition: strp_tsr.cxx:179
char const * beta
Definition: ctr_comm.h:170
int64_t mem_rec()
Definition: strp_tsr.cxx:187
def np(self)
Definition: core.pyx:315
int64_t mem_fp()
returns the number of bytes of buffer space we need
Definition: strp_tsr.cxx:23
double est_time_rec(int nlyr)
returns the time estimate of the recursive contraction (forwarded from rec_ctr)
Definition: strp_tsr.cxx:191
void run(int const dir)
strips out part of tensor to be operated on
Definition: strp_tsr.cxx:33
char const * beta
Definition: sum_tsr.h:74