Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
spctr_2d_general.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "spctr_2d_general.h"
4 #include "../tensor/untyped_tensor.h"
5 #include "../mapping/mapping.h"
6 #include "../shared/util.h"
7 #include <climits>
8 
9 namespace CTF_int {
10 
12  /*if (move_A) cdt_A->deactivate();
13  if (move_B) cdt_B->deactivate();
14  if (move_C) cdt_C->deactivate();*/
15  if (rec_ctr != NULL)
16  delete rec_ctr;
17  }
18 
20  spctr_2d_general * o = (spctr_2d_general*)other;
21  rec_ctr = o->rec_ctr->clone();
22  edge_len = o->edge_len;
23  ctr_lda_A = o->ctr_lda_A;
25  cdt_A = o->cdt_A;
26  move_A = o->move_A;
27  ctr_lda_B = o->ctr_lda_B;
29  cdt_B = o->cdt_B;
30  move_B = o->move_B;
31  ctr_lda_C = o->ctr_lda_C;
33  cdt_C = o->cdt_C;
34  move_C = o->move_C;
35 
39 #if 0 //def OFFLOAD
40  alloc_host_buf = o->alloc_host_buf;
41 #endif
42  }
43 
45  printf("spctr_2d_general: edge_len = %d\n", edge_len);
46  printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n",
48  if (move_A) printf("cdt_A length = %d\n",cdt_A->np);
49  printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n",
51  if (move_B) printf("cdt_B length = %d\n",cdt_B->np);
52  printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n",
54  if (move_C) printf("cdt_C length = %d\n",cdt_C->np);
55 #if 0 //def OFFLOAD
56  if (alloc_host_buf)
57  printf("alloc_host_buf is true\n");
58  else
59  printf("alloc_host_buf is false\n");
60 #endif
61  rec_ctr->print();
62  }
63 
65  return new spctr_2d_general(this);
66  }
67 
// Determine per-operand broadcast block counts and buffer sizes for run():
//   b_X   - number of slices of the contracted edge owned per processor when
//           operand X moves (0 when it does not move)
//   s_X   - elements of buffer space needed for operand X's staged panel
//   aux_size - bytes for the auxiliary scratch buffer (max over moved operands)
// NOTE(review): this Doxygen listing elides original lines 76-78, which is
// presumably where s_A/s_B/s_C are initialized before use below -- confirm
// against the repository source before relying on this excerpt.
 68  void spctr_2d_general::find_bsizes(int64_t & b_A,
 69  int64_t & b_B,
 70  int64_t & b_C,
 71  int64_t & s_A,
 72  int64_t & s_B,
 73  int64_t & s_C,
 74  int64_t & aux_size){
 75  b_A = 0, b_B = 0, b_C = 0;
// a moved operand is cyclically distributed over its communicator, so each
// processor owns edge_len/np slices of the contracted dimension
 79  if (move_A){
 80  b_A = edge_len/cdt_A->np;
 81  }
 82  if (move_B){
 83  b_B = edge_len/cdt_B->np;
 84  }
 85  if (move_C){
 86  b_C = edge_len/cdt_C->np;
 87  }
 88 
// move_X is a bool, so move_X*... zeroes the term for operands that stay put
 89  aux_size = MAX(move_A*sr_A->el_size*s_A, MAX(move_B*sr_B->el_size*s_B, move_C*sr_C->el_size*s_C));
 90  }
91 
92  double spctr_2d_general::est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
93  int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
94  find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
95  double est_bcast_time = 0.0;
96  if (move_A){
97  if (is_sparse_A)
98  est_bcast_time += cdt_A->estimate_bcast_time(sr_A->pair_size()*s_A*nnz_frac_A*dns_vrt_sz_A);
99  else
100  est_bcast_time += cdt_A->estimate_bcast_time(sr_A->el_size*s_A*nnz_frac_A);
101  }
102  if (move_B){
103  if (is_sparse_B)
104  est_bcast_time += cdt_B->estimate_bcast_time(sr_B->pair_size()*s_B*nnz_frac_B*dns_vrt_sz_B);
105  else
106  est_bcast_time += cdt_B->estimate_bcast_time(sr_B->el_size*s_B*nnz_frac_B);
107  }
108  if (move_C){
109  if (is_sparse_C)
110  est_bcast_time += sr_C->estimate_csr_red_time(sr_C->pair_size()*s_C*nnz_frac_C*dns_vrt_sz_C, cdt_C);
111  else
112  est_bcast_time += cdt_C->estimate_red_time(sr_C->el_size*s_C*nnz_frac_C, sr_C->addmop());
113  }
114  return (est_bcast_time*(double)edge_len)/MIN(nlyr,edge_len);
115  }
116 
117  double spctr_2d_general::est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
118  return rec_ctr->est_time_rec(1, nnz_frac_A, nnz_frac_B, nnz_frac_C)*(double)edge_len/MIN(nlyr,edge_len) + est_time_fp(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
119  }
120 
121  int64_t spctr_2d_general::spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
122  int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
123  find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
124  int64_t mem_usage = 0;
125  if (is_sparse_A) mem_usage += sr_A->pair_size()*s_A*nnz_frac_A;
126  else mem_usage += sr_A->el_size*s_A;
127  if (is_sparse_B) mem_usage += sr_B->pair_size()*s_B*nnz_frac_B;
128  else mem_usage += sr_B->el_size*s_B;
129  if (is_sparse_C) mem_usage += sr_C->pair_size()*s_C*nnz_frac_C;
130  else mem_usage += sr_C->el_size*s_C;
131  return mem_usage;
132  }
133 
134  int64_t spctr_2d_general::spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C) {
135  return rec_ctr->spmem_rec(nnz_frac_A, nnz_frac_B, nnz_frac_C) + spmem_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C);
136  }
137 
// Stage the A-side (also reused for B) operand panel for SUMMA iteration ib
// and return a pointer to the data the recursive contraction should read.
//  - move_A: processor (ib % cdt_A->np) owns slice ib; it packs its local
//    blocks (strided sr_A->copy for dense, socopy/spcopy for sparse) and the
//    panel is broadcast over cdt_A. For sparse data the per-block byte
//    counts are broadcast first so receivers can size their buffers.
//  - !move_A: a local view of A, or a strided local copy into buf_A, is
//    formed instead; no communication occurs on this path.
// Outputs: new_nblk_A (block count of the staged panel) and new_size_blk_A
// (per-block byte sizes; may alias the input size_blk_A or be freshly
// allocated -- the caller compares pointers to decide whether to free it).
// NOTE(review): for sparse inputs the returned pointer may be a freshly
// mst_alloc'd buffer; spctr_2d_general::run frees it after the iteration.
 138  char * bcast_step(int edge_len, char * A, bool is_sparse_A, bool move_A, algstrct const * sr_A, int64_t b_A, int64_t s_A, char * buf_A, CommData * cdt_A, int64_t ctr_sub_lda_A, int64_t ctr_lda_A, int nblk_A, int64_t const * size_blk_A, int & new_nblk_A, int64_t *& new_size_blk_A, int64_t * offsets_A, int ib){
 139  int ret;
 140  char * op_A = NULL;
// default: staged blocks keep the caller's sizes (cast drops const; the
// array is only overwritten on paths that reallocate it)
 141  new_size_blk_A = (int64_t*)size_blk_A;
// --- operand moves: owner packs, then broadcasts over cdt_A ---
 142  if (move_A){
 143  new_nblk_A = nblk_A/b_A;
 144  int owner_A = ib % cdt_A->np;
 145  if (cdt_A->rank == owner_A){
// with one slice per processor the local data is already the whole panel
 146  if (b_A == 1){
 147  op_A = A;
 148  } else {
 149  if (is_sparse_A){
// compute packed per-block sizes/offsets, then gather slice ib's blocks
// (stride b_A apart in A) contiguously into a fresh buffer
 150  int64_t * new_offsets_A;
 151  socopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*b_A, ctr_sub_lda_A,
 152  size_blk_A+(ib/cdt_A->np)*ctr_sub_lda_A,
 153  new_size_blk_A, new_offsets_A);
 154 
 155  int64_t bc_size_A = 0;
 156  for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
 157  ret = CTF_int::mst_alloc_ptr(bc_size_A, (void**)&buf_A);
 158  ASSERT(ret==0);
 159  op_A = buf_A;
 160  spcopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*b_A, ctr_sub_lda_A,
 161  size_blk_A+(ib/cdt_A->np)*ctr_sub_lda_A,
 162  offsets_A+(ib/cdt_A->np)*ctr_sub_lda_A,
 163  A,
 164  new_size_blk_A, new_offsets_A, op_A);
 165  cdealloc(new_offsets_A);
 166  } else {
// dense: strided gather of slice ib into the preallocated send buffer
 167  op_A = buf_A;
 168  sr_A->copy(ctr_sub_lda_A, ctr_lda_A,
 169  A+sr_A->el_size*(ib/cdt_A->np)*ctr_sub_lda_A, ctr_sub_lda_A*b_A,
 170  op_A, ctr_sub_lda_A);
 171  }
 172  }
 173  } else {
// non-owners only prepare receive storage for the incoming sizes/panel
 174  if (is_sparse_A)
 175  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_A/b_A, (void**)&new_size_blk_A);
 176  else
 177  op_A = buf_A;
 178  }
// --- communication: sizes first (sparse only), then the panel itself ---
 179  if (is_sparse_A){
 180  cdt_A->bcast(new_size_blk_A, new_nblk_A, MPI_INT64_T, owner_A);
 181  int64_t bc_size_A = 0;
 182  for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
 183 
// receivers size their buffer from the just-broadcast block sizes
 184  if (cdt_A->rank != owner_A){
 185  ret = CTF_int::mst_alloc_ptr(bc_size_A, (void**)&buf_A);
 186  ASSERT(ret==0);
 187  op_A = buf_A;
 188  }
 189  cdt_A->bcast(op_A, bc_size_A, MPI_CHAR, owner_A);
 190  /*int rrank;
 191  MPI_Comm_rank(MPI_COMM_WORLD, &rrank);
 192  printf("rrank = %d new_nblk_A = %d rank = %d owner = %d new_nnz_A = %ld old_nnz_A = %ld\n",rrank,new_nblk_A,cdt_A->rank, owner_A, new_nnz_A, nnz_A);
 193  for (int rr=0; rr<new_nblk_A; rr++){
 194  printf("rrank = %d new_nblk_A = %d new_size_blk_A[%d] = %ld\n", rrank, new_nblk_A, rr, new_size_blk_A[rr]);
 195  }*/
 196  } else {
// dense: a single typed broadcast of s_A elements suffices
 197  cdt_A->bcast(op_A, s_A, sr_A->mdtype(), owner_A);
 198  }
// --- operand does not move: form a local view or local strided copy ---
 199  } else {
 200  if (ctr_sub_lda_A == 0)
 201  op_A = A;
 202  else {
 203  new_nblk_A = nblk_A/edge_len;
// ctr_lda_A == 1 means slice ib's blocks are already contiguous in A
 204  if (ctr_lda_A == 1){
 205  if (is_sparse_A){
 206  op_A = A+offsets_A[ib*ctr_sub_lda_A];
 207  CTF_int::alloc_ptr(sizeof(int64_t)*new_nblk_A, (void**)&new_size_blk_A);
 208  memcpy(new_size_blk_A, size_blk_A+ib*ctr_sub_lda_A, sizeof(int64_t)*new_nblk_A);
 209 /* int rrank;
 210  MPI_Comm_rank(MPI_COMM_WORLD, &rrank);
 211  printf("rrank = %d ib = %ld new_nblk_A = %d, new_nnz_A = %ld offset = %ld\n", rrank, ib, new_nblk_A, new_nnz_A, offsets_A[ib*ctr_sub_lda_A]);*/
 212  } else {
 213  op_A = A+sr_A->el_size*ib*ctr_sub_lda_A;
 214  }
 215  } else {
// strided case: gather slice ib (stride edge_len blocks) into a buffer
 216  if (is_sparse_A){
 217  int64_t * new_offsets_A;
 218  socopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*edge_len, ctr_sub_lda_A,
 219  size_blk_A+ib*ctr_sub_lda_A,
 220  new_size_blk_A, new_offsets_A);
 221  int64_t bc_size_A = 0;
 222  for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
 223 
 224  ret = CTF_int::mst_alloc_ptr(bc_size_A, (void**)&buf_A);
 225  ASSERT(ret==0);
 226  op_A = buf_A;
 227  spcopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*edge_len, ctr_sub_lda_A,
 228  size_blk_A+ib*ctr_sub_lda_A, offsets_A+ib*ctr_sub_lda_A, A,
 229  new_size_blk_A, new_offsets_A, op_A);
 230  cdealloc(new_offsets_A);
 231  } else {
 232  op_A = buf_A;
 233  sr_A->copy(ctr_sub_lda_A, ctr_lda_A,
 234  A+sr_A->el_size*ib*ctr_sub_lda_A, ctr_sub_lda_A*edge_len,
 235  buf_A, ctr_sub_lda_A);
 236  }
 237  }
 238  }
 239  }
 240  return op_A;
 241  }
242 
243 
244  char * reduce_step_pre(int edge_len, char * C, bool is_sparse_C, bool move_C, algstrct const * sr_C, int64_t b_C, int64_t s_C, char * buf_C, CommData * cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t const * size_blk_C, int & new_nblk_C, int64_t *& new_size_blk_C, int64_t * offsets_C, int ib, char const *& rec_beta){
245  char * op_C;
246  new_size_blk_C = (int64_t*)size_blk_C;
247  if (move_C){
248  op_C = buf_C;
249  rec_beta = sr_C->addid();
250  new_nblk_C = nblk_C/b_C;
251  if (is_sparse_C){
252  CTF_int::alloc_ptr(sizeof(int64_t)*new_nblk_C, (void**)&new_size_blk_C);
253  memset(new_size_blk_C, 0, sizeof(int64_t)*new_nblk_C);
254  }
255  } else {
256  if (ctr_sub_lda_C == 0){
257  new_nblk_C = nblk_C;
258  op_C = C;
259  } else {
260  new_nblk_C = nblk_C/edge_len;
261  if (ctr_lda_C == 1){
262  if (is_sparse_C){
263  CTF_int::alloc_ptr(sizeof(int64_t)*new_nblk_C, (void**)&new_size_blk_C);
264  memcpy(new_size_blk_C, size_blk_C+ib*ctr_sub_lda_C, sizeof(int64_t)*new_nblk_C);
265  op_C = C+offsets_C[ib*ctr_sub_lda_C];
266  } else {
267  op_C = C+sr_C->el_size*ib*ctr_sub_lda_C;
268  }
269  } else {
270  op_C = buf_C;
271  rec_beta = sr_C->addid();
272  CTF_int::alloc_ptr(sizeof(int64_t)*new_nblk_C, (void**)&new_size_blk_C);
273  memset(new_size_blk_C, 0, sizeof(int64_t)*new_nblk_C);
274  }
275  }
276  }
277  return op_C;
278  }
279 
280 
// Complete the C-panel update for SUMMA iteration ib, after the recursive
// contraction has produced its contribution (up_C / buf_C).
//  - move_C: partial results are reduced onto the owner (ib % cdt_C->np):
//    a CSR tree-reduce (sr_C->csr_reduce) for sparse C, an MPI reduction
//    (cdt_C->red) for dense C. The owner then folds the reduced panel back
//    into C (dense) or stashes it in new_C_grps / new_C (sparse).
//  - !move_C: the buffered panel is scattered locally back into C (dense
//    strided case) or recorded as a sparse output group.
// Also updates rec_beta so later iterations accumulate correctly, and keeps
// size_blk_C / new_size_blk_C consistent with the (possibly resized) sparse
// output blocks.
 281  void reduce_step_post(int edge_len, char * C, bool is_sparse_C, bool move_C, algstrct const * sr_C, int64_t b_C, int64_t s_C, char * buf_C, CommData * cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t * size_blk_C, int & new_nblk_C, int64_t *& new_size_blk_C, int64_t * offsets_C, int ib, char const *& rec_beta, char const * beta, char *& up_C, char *& new_C, int n_new_C_grps, int & i_new_C_grp, char ** new_C_grps){
 282  if (move_C){
 283 #ifdef PROFILE
 284  TAU_FSTART(spctr_2d_general_barrier);
 285  MPI_Barrier(cdt_C->cm);
 286  TAU_FSTOP(spctr_2d_general_barrier);
 287 #endif
 288  int owner_C = ib % cdt_C->np;
 289  if (is_sparse_C){
// reduce each CSR block onto the owner; only the owner gets nonzero
// result sizes back
 290  int64_t csr_sz_acc = 0;
 291  int64_t new_csr_sz_acc = 0;
 292  char * new_Cs[new_nblk_C];
 293  for (int blk=0; blk<new_nblk_C; blk++){
 294  new_Cs[blk] = sr_C->csr_reduce(up_C+csr_sz_acc, owner_C, cdt_C->cm);
 295 
 296  csr_sz_acc += new_size_blk_C[blk];
 297  new_size_blk_C[blk] = cdt_C->rank == owner_C ? ((CSR_Matrix)(new_Cs[blk])).size() : 0;
 298  new_csr_sz_acc += new_size_blk_C[blk];
 299  }
 300  cdealloc(up_C);
 301  if (cdt_C->rank == owner_C){
// single-group output: concatenate the reduced blocks into one buffer
// and make it the new C
 302  if (n_new_C_grps == 1){
 303  alloc_ptr(new_csr_sz_acc, (void**)&up_C);
 304  new_csr_sz_acc = 0;
 305  ASSERT(nblk_C == new_nblk_C);
 306  for (int blk=0; blk<nblk_C; blk++){
 307  memcpy(up_C+new_csr_sz_acc, new_Cs[blk], new_size_blk_C[blk]);
 308  cdealloc(new_Cs[blk]);
 309  new_csr_sz_acc += new_size_blk_C[blk];
 310  }
 311  if (new_C != C) cdealloc(new_C);
 312  new_C = up_C;
 313  } else {
// multi-group output: record this iteration's group; run() stitches the
// groups together after the loop
 314  ASSERT(new_nblk_C == 1);
 315  for (int k=0; k<ctr_lda_C; k++){
 316  for (int j=0; j<ctr_sub_lda_C; j++){
 317  size_blk_C[ctr_sub_lda_C*(k*n_new_C_grps+i_new_C_grp)+j] = new_size_blk_C[ctr_sub_lda_C*k+j];
 318  }
 319  }
 320  new_C_grps[i_new_C_grp] = new_Cs[0];
 321  i_new_C_grp++;
 322  }
 323  } else {
 324  up_C = NULL;
 325  }
 326  } else {
// dense: element-wise MPI reduction onto the owner, then strided
// accumulation (scaled by beta) back into C
 327  if (cdt_C->rank == owner_C)
 328  cdt_C->red(MPI_IN_PLACE, up_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
 329  else
 330  cdt_C->red(up_C, NULL, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
 331  if (cdt_C->rank == owner_C){
 332  sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
 333  up_C, ctr_sub_lda_C, sr_C->mulid(),
 334  C+sr_C->el_size*(ib/cdt_C->np)*ctr_sub_lda_C,
 335  ctr_sub_lda_C*b_C, beta);
 336  }
 337  }
// --- no communication: handle the local-only cases ---
 338  } else {
 339  if (ctr_sub_lda_C != 0){
 340  if (is_sparse_C){
 341  new_C_grps[i_new_C_grp] = up_C;
 342  for (int k=0; k<ctr_lda_C; k++){
 343  for (int j=0; j<ctr_sub_lda_C; j++){
 344  size_blk_C[ctr_sub_lda_C*(k*n_new_C_grps+i_new_C_grp)+j] = new_size_blk_C[ctr_sub_lda_C*k+j];
 345  }
 346  }
 347  i_new_C_grp++;
 348  } else if (ctr_lda_C != 1){
// scatter the buffered strided panel back into C, scaled by beta
 349  sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
 350  buf_C, ctr_sub_lda_C, sr_C->mulid(),
 351  C+sr_C->el_size*ib*ctr_sub_lda_C,
 352  ctr_sub_lda_C*edge_len, beta);
 353  }
 354  } else {
// contraction wrote directly into C: later iterations must accumulate
 355  rec_beta = sr_C->mulid();
 356  if (is_sparse_C){
 357  size_blk_C[0] = new_size_blk_C[0];
 358  if (new_C != C) cdealloc(new_C);
 359  new_C = up_C;
 360  }
 361  }
 362  }
 363  }
364 
// Execute the 2D SUMMA-style (sparse) contraction: for each slice ib of the
// contracted edge assigned to this layer, stage the A and B panels
// (bcast_step), run the recursive contraction into op_C/up_C, then reduce
// or scatter the C panel (reduce_step_post). Sparse C output may be
// produced as several groups that are stitched together with spcopy at the
// end; when C moves and is sparse, the prior contents of C are merged into
// the result via csr_add.
// NOTE(review): this Doxygen listing elides several original lines (395,
// 411-415, 490, 496, 603), including the rec_ctr layer settings for the
// edge_len < num_lyr branch -- consult the repository source for the
// complete body.
 365  void spctr_2d_general::run(char * A, int nblk_A, int64_t const * size_blk_A,
 366  char * B, int nblk_B, int64_t const * size_blk_B,
 367  char * C, int nblk_C, int64_t * size_blk_C,
 368  char *& new_C){
 369  int ret, n_new_C_grps;
 370  int64_t ib;
 371  char * buf_A, * buf_B, * buf_C, * buf_aux, * up_C;
 372  char ** new_C_grps;
 373  char * op_A = NULL;
 374  char * op_B = NULL;
 375  char * op_C = NULL;
 376  int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
 377 
// how many separate sparse output groups will be produced (1 for dense C)
 378  if (is_sparse_C){
 379  if (move_C){
 380  n_new_C_grps = edge_len/cdt_C->np;
 381  } else {
 382  //if (ctr_lda_C != 1 && ctr_sub_lda_C != 0)
 383  if (ctr_sub_lda_C != 0){
 384  n_new_C_grps = edge_len;
 385  } else {
 386  n_new_C_grps = 1;
 387  }
 388  }
 389  } else {
 390  n_new_C_grps = 1;
 391  }
 392  if (n_new_C_grps > 1)
 393  alloc_ptr(n_new_C_grps*sizeof(char*), (void**)&new_C_grps);
 394  int i_new_C_grp = 0;
 396 
 397  /* Must move at most two tensors */
 398  ASSERT(!(move_A && move_B && move_C));
 399 
 400  rec_ctr->beta = this->beta;
 401 
// split the edge_len iterations over the num_lyr processor layers when the
// two divide evenly; otherwise this layer does all iterations itself
 402  int iidx_lyr, inum_lyr;
 403  if (edge_len >= num_lyr && edge_len % num_lyr == 0){
 404  inum_lyr = num_lyr;
 405  iidx_lyr = idx_lyr;
 406  rec_ctr->num_lyr = 1;
 407  rec_ctr->idx_lyr = 0;
 408  } else if (edge_len < num_lyr && num_lyr % edge_len == 0){
 409  inum_lyr = edge_len;
 410  iidx_lyr = idx_lyr%edge_len;
 413  } else {
 416  inum_lyr = 1;
 417  iidx_lyr = 0;
 418  }
 419 
 420 
 421  find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
 422 
 423 #if 0 //def OFFLOAD
 424  if (alloc_host_buf){
 425  host_pinned_alloc((void**)&buf_A, s_A*sr_A->el_size);
 426  host_pinned_alloc((void**)&buf_B, s_B*sr_B->el_size);
 427  host_pinned_alloc((void**)&buf_C, s_C*sr_C->el_size);
 428  }
 429 #endif
// dense operands get fixed staging buffers up front; sparse ones are
// allocated per-iteration inside bcast_step (sizes vary per slice)
 430  if (0){
 431  } else {
 432  if (!is_sparse_A){
 433  ret = CTF_int::mst_alloc_ptr(s_A*sr_A->el_size, (void**)&buf_A);
 434  ASSERT(ret==0);
 435  } else buf_A = NULL;
 436  if (!is_sparse_B){
 437  ret = CTF_int::mst_alloc_ptr(s_B*sr_B->el_size, (void**)&buf_B);
 438  ASSERT(ret==0);
 439  } else buf_B = NULL;
 440  if (!is_sparse_C){
 441  ret = CTF_int::mst_alloc_ptr(s_C*sr_C->el_size, (void**)&buf_C);
 442  ASSERT(ret==0);
 443  } else buf_C = NULL;
 444  }
 445  ret = CTF_int::mst_alloc_ptr(aux_size, (void**)&buf_aux);
 446  ASSERT(ret==0);
 447 
// prefix-sum the per-block byte sizes into byte offsets for each sparse
// operand, so blocks can be addressed inside the packed byte arrays
 448  int64_t * offsets_A;
 449  if (is_sparse_A){
 450  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_A, (void**)&offsets_A);
 451  for (int i=0; i<nblk_A; i++){
 452  if (i==0) offsets_A[0] = 0;
 453  else offsets_A[i] = offsets_A[i-1]+size_blk_A[i-1];
 454  }
 455  }
 456  int64_t * offsets_B;
 457  if (is_sparse_B){
 458  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_B, (void**)&offsets_B);
 459  for (int i=0; i<nblk_B; i++){
 460  if (i==0) offsets_B[0] = 0;
 461  else offsets_B[i] = offsets_B[i-1]+size_blk_B[i-1];
 462  }
 463  }
 464  int64_t * offsets_C;
 465  if (is_sparse_C){
 466  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_C, (void**)&offsets_C);
 467  for (int i=0; i<nblk_C; i++){
 468  if (i==0) offsets_C[0] = 0;
 469  else offsets_C[i] = offsets_C[i-1]+size_blk_C[i-1];
 470  }
 471  }
 472 
 473 
 474  int64_t * new_size_blk_A;
 475  int new_nblk_A = nblk_A;
 476  int64_t * new_size_blk_B;
 477  int new_nblk_B = nblk_B;
 478  int64_t * new_size_blk_C;
 479  int new_nblk_C = nblk_C;
 480 
 481  new_C = C;
 482  up_C = NULL;
 483 
// --- main SUMMA loop over this layer's slices of the contracted edge ---
 484  for (ib=iidx_lyr; ib<edge_len; ib+=inum_lyr){
 485  op_A = bcast_step(edge_len, A, is_sparse_A, move_A, sr_A, b_A, s_A, buf_A, cdt_A, ctr_sub_lda_A, ctr_lda_A, nblk_A, size_blk_A, new_nblk_A, new_size_blk_A, offsets_A, ib);
 486  op_B = bcast_step(edge_len, B, is_sparse_B, move_B, sr_B, b_B, s_B, buf_B, cdt_B, ctr_sub_lda_B, ctr_lda_B, nblk_B, size_blk_B, new_nblk_B, new_size_blk_B, offsets_B, ib);
 487  op_C = reduce_step_pre(edge_len, new_C, is_sparse_C, move_C, sr_C, b_C, s_C, buf_C, cdt_C, ctr_sub_lda_C, ctr_lda_C, nblk_C, size_blk_C, new_nblk_C, new_size_blk_C, offsets_C, ib, rec_ctr->beta);
 488 
 489 
// recursive inner contraction on the staged panels (result in up_C)
 491  rec_ctr->run(op_A, new_nblk_A, new_size_blk_A,
 492  op_B, new_nblk_B, new_size_blk_B,
 493  op_C, new_nblk_C, new_size_blk_C,
 494  up_C);
 495 
 497  /*for (int i=0; i<ctr_sub_lda_C*ctr_sub_lda_C; i++){
 498  printf("[%d] P%d up_C[%d] = %lf\n",ctr_lda_C,idx_lyr,i, ((double*)up_C)[i]);
 499  }*/
// free sparse staging buffers allocated by bcast_step on the paths where
// op_A/op_B is not a direct view into A/B (mirrors bcast_step's branches)
 500  if (is_sparse_A && ((move_A && (cdt_A->rank != (ib % cdt_A->np) || b_A != 1)) || (!move_A && ctr_sub_lda_A != 0 && ctr_lda_A != 1))){
 501  cdealloc(op_A);
 502  }
 503  if (is_sparse_B && ((move_B && (cdt_B->rank != (ib % cdt_B->np) || b_B != 1)) || (!move_B && ctr_sub_lda_B != 0 && ctr_lda_B != 1))){
 504  cdealloc(op_B);
 505  }
 506  reduce_step_post(edge_len, C, is_sparse_C, move_C, sr_C, b_C, s_C, buf_C, cdt_C, ctr_sub_lda_C, ctr_lda_C, nblk_C, size_blk_C, new_nblk_C, new_size_blk_C, offsets_C, ib, rec_ctr->beta, this->beta, up_C, new_C, n_new_C_grps, i_new_C_grp, new_C_grps);
 507 
// size arrays are freed only when they were reallocated (pointer compare)
 508  if (new_size_blk_A != size_blk_A)
 509  cdealloc(new_size_blk_A);
 510  if (new_size_blk_B != size_blk_B)
 511  cdealloc(new_size_blk_B);
 512  if (is_sparse_A && buf_A != NULL){
 513  cdealloc(buf_A);
 514  buf_A = NULL;
 515  }
 516  if (is_sparse_B && buf_B != NULL){
 517  cdealloc(buf_B);
 518  buf_B = NULL;
 519  }
 520  if (new_size_blk_C != size_blk_C)
 521  cdealloc(new_size_blk_C);
 522  }
 523 #if 0 //def OFFLOAD
 524  if (alloc_host_buf){
 525  host_pinned_free(buf_A);
 526  host_pinned_free(buf_B);
 527  host_pinned_free(buf_C);
 528  }
 529 #endif
// --- stitch the per-iteration sparse output groups into one packed C ---
 530  if (n_new_C_grps > 1){
 531  ASSERT(i_new_C_grp == n_new_C_grps);
 532  int64_t new_sz_C = 0;
 533  int64_t * new_offsets_C;
 534  int64_t * grp_offsets_C;
 535  int64_t * grp_sizes_C;
 536  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_C, (void**)&new_offsets_C);
 537  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_C/n_new_C_grps, (void**)&grp_offsets_C);
 538  CTF_int::alloc_ptr(sizeof(int64_t)*nblk_C/n_new_C_grps, (void**)&grp_sizes_C);
 539  for (int i=0; i<nblk_C; i++){
 540  new_offsets_C[i] = new_sz_C;
 541  new_sz_C += size_blk_C[i];
 542  }
 543  alloc_ptr(new_sz_C, (void**)&new_C);
 544  for (int i=0; i<n_new_C_grps; i++){
 545  int64_t last_grp_offset = 0;
 546  for (int j=0; j<ctr_sub_lda_C; j++){
 547  for (int k=0; k<ctr_lda_C; k++){
 548  grp_offsets_C[ctr_sub_lda_C*k+j] = last_grp_offset;
 549  grp_sizes_C[ctr_sub_lda_C*k+j] = size_blk_C[ctr_sub_lda_C*(i+n_new_C_grps*k)+j];
 550  last_grp_offset += grp_sizes_C[ctr_sub_lda_C*k+j];
 551  }
 552  }
 553 // printf("copying %ld %ld elements from matrix of size %ld from offset %ld to offset %ld\n", size_blk_C[0], grp_sizes_C[0], ((CSR_Matrix)new_C_grps[i]).size(), grp_offsets_C[0], new_offsets_C[0]);
 554  spcopy(ctr_sub_lda_C, ctr_lda_C, ctr_sub_lda_C, ctr_sub_lda_C*n_new_C_grps,
 555  grp_sizes_C, grp_offsets_C, new_C_grps[i],
 556  size_blk_C+i*ctr_sub_lda_C, new_offsets_C+i*ctr_sub_lda_C, new_C);
 557  cdealloc(new_C_grps[i]);
 558  }
 559  cdealloc(new_offsets_C);
 560  cdealloc(grp_offsets_C);
 561  cdealloc(grp_sizes_C);
 562  }
// --- merge pre-existing sparse C contents into the computed result ---
 563  if (move_C && is_sparse_C && C != NULL){
 564  char * new_Cs[nblk_C];
 565  int64_t org_offset = 0;
 566  int64_t cmp_offset = 0;
 567  int64_t new_offset = 0;
 568  for (int i=0; i<nblk_C; i++){
 569  new_Cs[i] = sr_C->csr_add(C+org_offset, new_C+cmp_offset);
 570  new_offset += ((CSR_Matrix)new_Cs[i]).size();
 571  org_offset += ((CSR_Matrix)(C+org_offset)).size();
 572  cmp_offset += ((CSR_Matrix)(new_C+cmp_offset)).size();
 573  }
 574  if (new_C != C)
 575  cdealloc(new_C);
 576  new_C = (char*)alloc(new_offset);
 577  new_offset = 0;
 578  for (int i=0; i<nblk_C; i++){
 579  size_blk_C[i] = ((CSR_Matrix)new_Cs[i]).size();
 580  memcpy(new_C+new_offset, new_Cs[i], size_blk_C[i]);
 581  new_offset += size_blk_C[i];
 582  cdealloc(new_Cs[i]);
 583  }
 584  }
// --- cleanup of staging buffers and offset arrays ---
 585  if (0){
 586  } else {
 587  if (buf_A != NULL) CTF_int::cdealloc(buf_A);
 588  if (buf_B != NULL) CTF_int::cdealloc(buf_B);
 589  if (buf_C != NULL) CTF_int::cdealloc(buf_C);
 590  CTF_int::cdealloc(buf_aux);
 591  }
 592  if (is_sparse_A){
 593  cdealloc(offsets_A);
 594  }
 595  if (is_sparse_B){
 596  cdealloc(offsets_B);
 597  }
 598  if (is_sparse_C){
 599  cdealloc(offsets_C);
 600  } else {
// dense output was written in place: report C itself as the result
 601  new_C = C;
 602  }
 604  }
605 }
606 
virtual char * csr_add(char *cA, char *cB) const
adds CSR matrices A (stored in cA) and B (stored in cB) to create matrix C (pointer to all_data retur...
Definition: algstrct.cxx:362
virtual int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes need by each processor in this kernel and its recursive calls ...
Definition: spctr_tsr.h:45
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the time this kernel will take including calls to rec_ctr
virtual int pair_size() const
gets pair size el_size plus the key size
Definition: algstrct.h:46
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
reduce, same interface as MPI_Reduce, but excluding the comm
Definition: common.cxx:392
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
Definition: spctr_tsr.h:39
virtual void copy(char *a, char const *b) const
copies element b to element a
Definition: algstrct.cxx:538
void host_pinned_alloc(void **ptr, int64_t size)
allocate a pinned host buffer
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the time this kernel will take including calls to rec_ctr
virtual char const * addid() const
identity element for addition i.e. 0
Definition: algstrct.cxx:89
bool is_sparse_C
Definition: spctr_tsr.h:14
bool is_sparse_A
Definition: spctr_tsr.h:12
algstrct const * sr_B
Definition: ctr_comm.h:168
double estimate_red_time(int64_t msg_sz, MPI_Op op)
Definition: common.cxx:308
double estimate_csr_red_time(int64_t msg_sz, CommData const *cdt) const
Definition: algstrct.cxx:508
char * bcast_step(int edge_len, char *A, bool is_sparse_A, bool move_A, algstrct const *sr_A, int64_t b_A, int64_t s_A, char *buf_A, CommData *cdt_A, int64_t ctr_sub_lda_A, int64_t ctr_lda_A, int nblk_A, int64_t const *size_blk_A, int &new_nblk_A, int64_t *&new_size_blk_A, int64_t *offsets_A, int ib)
#define MAX(a, b)
Definition: util.h:180
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction
Definition: memcontrol.cxx:269
algstrct const * sr_C
Definition: ctr_comm.h:169
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need
void host_pinned_free(void *ptr)
free a pinned host buffer
virtual void print()
Definition: ctr_comm.h:175
int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need recursively
char * new_C
Definition: spctr_tsr.h:15
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
Definition: memcontrol.cxx:320
algstrct const * sr_A
Definition: ctr_comm.h:167
abstraction for a serialized sparse matrix stored in column-sparse-row (CSR) layout ...
Definition: csr.h:22
virtual char * csr_reduce(char *cA, int root, MPI_Comm cm) const
reduces CSR matrices stored in cA on each processor in cm and returns result on processor root ...
Definition: algstrct.cxx:367
void run(char *A, int nblk_A, int64_t const *size_blk_A, char *B, int nblk_B, int64_t const *size_blk_B, char *C, int nblk_C, int64_t *size_blk_C, char *&new_C)
Basically doing SUMMA, except assumes equal block size on each processor. Performs rank-b updates whe...
double estimate_bcast_time(int64_t msg_sz)
Definition: common.cxx:295
virtual MPI_Op addmop() const
MPI addition operation for reductions.
Definition: algstrct.cxx:73
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
Definition: common.cxx:336
void reduce_step_post(int edge_len, char *C, bool is_sparse_C, bool move_C, algstrct const *sr_C, int64_t b_C, int64_t s_C, char *buf_C, CommData *cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t *size_blk_C, int &new_nblk_C, int64_t *&new_size_blk_C, int64_t *offsets_C, int ib, char const *&rec_beta, char const *beta, char *&up_C, char *&new_C, int n_new_C_grps, int &i_new_C_grp, char **new_C_grps)
MPI_Comm cm
Definition: common.h:129
spctr_2d_general(spctr *other)
copies spctr object
void socopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t *&sizes_b, int64_t *&offsets_b)
Definition: util.cxx:240
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
Definition: algstrct.h:34
virtual spctr * clone()
Definition: spctr_tsr.h:19
void find_bsizes(int64_t &b_A, int64_t &b_B, int64_t &b_C, int64_t &s_A, int64_t &s_B, int64_t &s_C, int64_t &aux_size)
determines buffer and block sizes needed for spctr_2d_general
#define MIN(a, b)
Definition: util.h:176
char * reduce_step_pre(int edge_len, char *C, bool is_sparse_C, bool move_C, algstrct const *sr_C, int64_t b_C, int64_t s_C, char *buf_C, CommData *cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t const *size_blk_C, int &new_nblk_C, int64_t *&new_size_blk_C, int64_t *offsets_C, int ib, char const *&rec_beta)
void print()
print ctr object
~spctr_2d_general()
deallocs spctr_2d_general object
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
void run(char *A, char *B, char *C)
Definition: spctr_tsr.h:48
char const * beta
Definition: ctr_comm.h:170
bool is_sparse_B
Definition: spctr_tsr.h:13
virtual MPI_Datatype mdtype() const
MPI datatype.
Definition: algstrct.cxx:80
void spcopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t const *offsets_a, char const *a, int64_t const *sizes_b, int64_t const *offsets_b, char *b)
Definition: util.cxx:260