Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
sp_seq_ctr.cxx
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #include "../shared/iter_tsr.h"
4 #include <limits.h>
5 #include "sp_seq_ctr.h"
6 #include "sym_seq_ctr.h"
7 #include "../shared/offload.h"
8 #include "../shared/util.h"
9 
10 namespace CTF_int{
// Recursive level of the sequential contraction C = beta*C + alpha*A*B where
// A is sparse (stored as a list of (key,value) pairs walked by a
// ConstPairIterator) and B, C are dense. One template instantiation per
// remaining global index; idim is the index handled at this level. Each level
// computes the admissible range [imin,imax) for index idim from the dense
// edge lengths and the symmetry groups of A, B and C, then recurses with the
// dense operands advanced by precomputed per-index offsets and the sparse
// iterator over A advanced past the pairs consumed by the inner levels.
 11  template<int idim>
 12  void spA_dnB_dnC_ctrloop(char const * alpha,
// NOTE(review): doc line 13 of the original file (the sparse-iterator
// parameter "ConstPairIterator & A,") is missing from this extracted view;
// the full signature appears in the index at the bottom of the file.
 14  int64_t & size_A,
 15  algstrct const * sr_A,
 16  int order_A,
 17  int const * edge_len_A,
 18  int64_t const * lda_A,
 19  int const * sym_A,
 20  int const * idx_map_A,
 21  char const * B,
 22  algstrct const * sr_B,
 23  int order_B,
 24  int const * edge_len_B,
 25  int const * sym_B,
 26  int const * idx_map_B,
 27  uint64_t *const* offsets_B,
 28  char const * beta,
 29  char * C,
 30  algstrct const * sr_C,
 31  int order_C,
 32  int const * edge_len_C,
 33  int const * sym_C,
 34  int const * idx_map_C,
 35  uint64_t *const* offsets_C,
 36  bivar_function const * func,
 37  int const * idx,
 38  int const * rev_idx_map,
 39  int idx_max){
// rA/rB/rC: position of global index idim inside each tensor's index map,
// or -1 when that tensor does not carry this index.
 40  int imax=0;
 41  int rA = rev_idx_map[3*idim+0];
 42  int rB = rev_idx_map[3*idim+1];
 43  int rC = rev_idx_map[3*idim+2];
 44 
// an index carried only by sparse A (no dense counterpart) is not supported
 45  ASSERT(!(rA != -1 && rB == -1 && rC == -1));
 46 
// loop bound comes from whichever dense operand carries this index
 47  if (rB != -1)
 48  imax = edge_len_B[rB];
 49  else if (rC != -1)
 50  imax = edge_len_C[rC];
 51 
// For packed symmetric layouts, this index may not exceed the value already
// chosen for the next index in its symmetric group: walk forward through the
// group and clamp imax to idx[next]+1 for each tensor that is symmetric here.
 52  if (rA != -1 && sym_A[rA] != NS){
 53  int rrA = rA;
 54  do {
 55  if (idx_map_A[rrA+1] > idim)
 56  imax = idx[idx_map_A[rrA+1]]+1;
 57  rrA++;
 58  } while (sym_A[rrA] != NS && idx_map_A[rrA] < idim);
 59  }
 60 
 61  if (rB != -1 && sym_B[rB] != NS){
 62  int rrB = rB;
 63  do {
 64  if (idx_map_B[rrB+1] > idim)
 65  imax = std::min(imax,idx[idx_map_B[rrB+1]]+1);
 66  rrB++;
 67  } while (sym_B[rrB] != NS && idx_map_B[rrB] < idim);
 68  }
 69 
 70  if (rC != -1 && sym_C[rC] != NS){
 71  int rrC = rC;
 72  do {
 73  if (idx_map_C[rrC+1] > idim)
 74  imax = std::min(imax,idx[idx_map_C[rrC+1]]+1);
 75  rrC++;
 76  } while (sym_C[rrC] != NS && idx_map_C[rrC] < idim);
 77  }
 78 
// Symmetric lower bound: this index may not run below the value of the
// previous index in its symmetric group (walk backward and raise imin).
 79  int imin = 0;
 80 
 81  if (rA > 0 && sym_A[rA-1] != NS){
 82  int rrA = rA;
 83  do {
 84  if (idx_map_A[rrA-1] > idim)
 85  imin = idx[idx_map_A[rrA-1]];
 86  rrA--;
 87  } while (rrA>0 && sym_A[rrA-1] != NS && idx_map_A[rrA] < idim);
 88  }
 89 
 90  if (rB > 0 && sym_B[rB-1] != NS){
 91  int rrB = rB;
 92  do {
 93  if (idx_map_B[rrB-1] > idim)
 94  imin = std::max(imin,idx[idx_map_B[rrB-1]]);
 95  rrB--;
 96  } while (rrB>0 && sym_B[rrB-1] != NS && idx_map_B[rrB] < idim);
 97  }
 98 
 99  if (rC > 0 && sym_C[rC-1] != NS){
 100  int rrC = rC;
 101  do {
 102  if (idx_map_C[rrC-1] > idim)
 103  imin = std::max(imin,idx[idx_map_C[rrC-1]]);
 104  rrC--;
 105  } while (rrC>0 && sym_C[rrC-1] != NS && idx_map_C[rrC] < idim);
 106  }
// key_offset: contribution of all A-indices other than rA (already fixed by
// outer loop levels) to the linearized sparse key, using strides lda_A.
 107  int64_t key_offset = 0;
 108  for (int i=0; i<order_A; i++){
 109  if (i != rA){
 110  key_offset += idx[idx_map_A[i]]*lda_A[i];
 111  }
 112  }
 113 
 114  //FIXME: if rC != -1 && rA == -1, thread
 115  for (int i=imin; i<imax; i++){
 116 /* if (size_A != 0){
 117  printf("lda_A[rA]=%ld\n",lda_A[rA]);
 118  printf("A[0].k() == %ld\n",A[0].k());
 119  }*/
// Stop this level when A is exhausted, or when A's next key no longer lies
// inside the block selected by the indices fixed so far (exact key match when
// rA == -1; match modulo the rA digit of the key otherwise).
 120  if (size_A == 0 ||
 121  (rA == -1 && A[0].k()!=key_offset) ||
 122  (rA != -1 && (A[0].k()/lda_A[rA]/edge_len_A[rA])!=key_offset/lda_A[rA]/edge_len_A[rA])){
 123  //printf("i = %d idim = %d breaking rA = %d k=%ld\n", i, idim, rA, A[0].k());
 124  break;
 125  }
 126  // if we should be iterating over A and the next sparse value of A does not match this iteration number
 127  // there will be no actual work to do in the inner iterations, so continue
 128  if (rA != -1 && (A[0].k()/lda_A[rA])%edge_len_A[rA] != i){
 129  ASSERT((A[0].k()/lda_A[rA])%edge_len_A[rA] > i);
 130  continue;
 131  }
// Snapshot the sparse iterator and remaining-pair count, extend the fixed
// index vector with nidx[idim] = i, and recurse to the next level with B and C
// advanced by their precomputed offsets for this index value.
 132  ConstPairIterator cpiA(sr_A, A.ptr);
 133  int64_t new_size_A = size_A;
 134  int nidx[idx_max];
 135  memcpy(nidx, idx, idx_max*sizeof(int));
 136  nidx[idim] = i;
 137  spA_dnB_dnC_ctrloop<idim-1>(alpha, cpiA, new_size_A, sr_A, order_A, edge_len_A, lda_A, sym_A, idx_map_A, B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C+offsets_C[idim][nidx[idim]], sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, func, nidx, rev_idx_map, idx_max);
// Advance A past the pairs consumed by the recursion. If nothing was
// consumed (only possible at rA==0 per the ASSERT), force progress by one pair
// so the enclosing loop cannot spin.
 138  if (rA != -1) {
 139  if (size_A == new_size_A){
 140  ASSERT(rA==0);
 141  //if we did not advance, in recursive loops, it means all rA=-1 for lower idim, and we now want to advance by 1
 142  size_A = new_size_A-1;
 143  A.ptr = cpiA[1].ptr;
 144  } else {
 145  size_A = new_size_A;
 146  A.ptr = cpiA.ptr;
 147  }
 148  }
 149  }
 150  }
151 
// Base case of the recursion (innermost global index 0): this is where the
// actual multiply-accumulate work happens. Two regimes:
//   - rA == -1: A's current pair is fully pinned by outer indices, so loop
//     densely over i in [imin,imax), combining A[0].d() with B and C at their
//     precomputed per-index offsets (via the semiring sr_C or via func).
//   - rA != -1: index 0 lives in A, so walk the remaining sparse pairs,
//     decode i from each key, and accumulate only where imin <= i < imax.
 152  template<>
 153  void spA_dnB_dnC_ctrloop<0>(char const * alpha,
 154  ConstPairIterator & A,
 155  int64_t & size_A,
 156  algstrct const * sr_A,
 157  int order_A,
 158  int const * edge_len_A,
 159  int64_t const * lda_A,
 160  int const * sym_A,
 161  int const * idx_map_A,
 162  char const * B,
 163  algstrct const * sr_B,
 164  int order_B,
 165  int const * edge_len_B,
 166  int const * sym_B,
 167  int const * idx_map_B,
 168  uint64_t *const* offsets_B,
 169  char const * beta,
 170  char * C,
 171  algstrct const * sr_C,
 172  int order_C,
 173  int const * edge_len_C,
 174  int const * sym_C,
 175  int const * idx_map_C,
 176  uint64_t *const* offsets_C,
 177  bivar_function const * func,
 178  int const * idx,
 179  int const * rev_idx_map,
 180  int idx_max){
 181 
 182 
// Same [imin,imax) range computation as the generic level, but simplified:
// at index 0 only the immediately adjacent symmetric neighbors can constrain
// the range, so no group-walking loops are needed.
 183  int imax=0;
 184  int rA = rev_idx_map[0];
 185  int rB = rev_idx_map[1];
 186  int rC = rev_idx_map[2];
 187 
 188  ASSERT(!(rA != -1 && rB == -1 && rC == -1));
 189 
 190  if (rB != -1)
 191  imax = edge_len_B[rB];
 192  else if (rC != -1)
 193  imax = edge_len_C[rC];
 194 
 195  if (rA != -1 && sym_A[rA] != NS)
 196  imax = idx[idx_map_A[rA+1]]+1;
 197  if (rB != -1 && sym_B[rB] != NS)
 198  imax = std::min(imax,idx[idx_map_B[rB+1]]+1);
 199  if (rC != -1 && sym_C[rC] != NS)
 200  imax = std::min(imax,idx[idx_map_C[rC+1]]+1);
 201 
 202  int imin = 0;
 203 
 204  if (rA > 0 && sym_A[rA-1] != NS)
 205  imin = idx[idx_map_A[rA-1]];
 206  if (rB > 0 && sym_B[rB-1] != NS)
 207  imin = std::max(imin,idx[idx_map_B[rB-1]]);
 208  if (rC > 0 && sym_C[rC-1] != NS)
 209  imin = std::max(imin,idx[idx_map_C[rC-1]]);
 210 
 211 /* int tid, ntd;
 212  tid = omp_get_thread_num();
 213  ntd = omp_get_num_threads();
 214  printf("-> %d/%d %d %d %d\n",tid,ntd,func==NULL, alpha==NULL,beta==NULL);*/
 215  if (rA == -1){
// Semiring path: C[i] += (alpha *) A * B[i] using sr_C's mul/add.
 216  if (func == NULL){
 217 /* if (alpha == NULL && beta == NULL){
 218  for (int i=imin; i<imax; i++){
 219  sr_C->mul(A[0].d(),
 220  B+offsets_B[0][i],
 221  C+offsets_C[0][i]);
 222  }
 223  CTF_FLOPS_ADD(imax-imin);
 224  } else if (alpha == NULL)*/
// alpha absent or equal to the multiplicative identity: skip the scaling mul
 225  if (alpha == NULL || sr_C->isequal(alpha,sr_C->mulid())){
 226  for (int i=imin; i<imax; i++){
// tmp is a stack buffer for one element of C's type (VLA of el_size bytes)
 227  char tmp[sr_C->el_size];
 228  sr_C->mul(A[0].d(),
 229  B+offsets_B[0][i],
 230  tmp);
 231  sr_C->add(tmp,
 232  C+offsets_C[0][i],
 233  C+offsets_C[0][i]);
 234  }
 235  CTF_FLOPS_ADD(2*(imax-imin));
 236  } else {
// general alpha: tmp = A*B, tmp *= alpha, C += tmp
 237  for (int i=imin; i<imax; i++){
 238  char tmp[sr_C->el_size];
 239  sr_C->mul(A[0].d(),
 240  B+offsets_B[0][i],
 241  tmp);
 242  sr_C->mul(tmp,
 243  alpha,
 244  tmp);
 245  sr_C->add(tmp,
 246  C+offsets_C[0][i],
 247  C+offsets_C[0][i]);
 248  }
 249  CTF_FLOPS_ADD(3*(imax-imin));
 250  }
 251  } else {
// User-function path: accumulate func(A,B) into C via acc_f.
 252 /* if (alpha == NULL && beta == NULL){
 253  for (int i=imin; i<imax; i++){
 254  func->apply_f(A[0].d(),
 255  B+offsets_B[0][i],
 256  C+offsets_C[0][i]);
 257  }
 258  CTF_FLOPS_ADD(imax-imin);
 259  } else if (alpha == NULL)*/
 260  if (alpha == NULL || sr_C->isequal(alpha,sr_C->mulid())){
 261  for (int i=imin; i<imax; i++){
 262  func->acc_f(A[0].d(),
 263  B+offsets_B[0][i],
 264  C+offsets_C[0][i],
 265  sr_C);
 266 /*
 267  char tmp[sr_C->el_size];
 268  func->apply_f(A[0].d(),
 269  B+offsets_B[0][i],
 270  tmp);
 271  sr_C->add(tmp,
 272  C+offsets_C[0][i],
 273  C+offsets_C[0][i]);*/
 274  }
 275  CTF_FLOPS_ADD(2*(imax-imin));
 276  } else {
// func with a nontrivial alpha is unsupported: both asserts fire before the
// (dead) scaled-accumulate code below
 277  ASSERT(0);
 278  assert(0);
 279  for (int i=imin; i<imax; i++){
 280  char tmp[sr_C->el_size];
 281  func->apply_f(A[0].d(),
 282  B+offsets_B[0][i],
 283  tmp);
 284  sr_C->mul(tmp,
 285  alpha,
 286  tmp);
 287  sr_C->add(tmp,
 288  C+offsets_C[0][i],
 289  C+offsets_C[0][i]);
 290  }
 291  CTF_FLOPS_ADD(3*(imax-imin));
 292  }
 293  }
 294  } else {
// Sparse inner index: decode i for each remaining pair of A from its key,
// relative to the key contribution of the outer fixed indices.
 295  int64_t key_offset = 0;
 296  for (int i=0; i<order_A; i++){
 297  if (i != rA){
 298  key_offset += idx[idx_map_A[i]]*lda_A[i];
 299  }
 300  }
// only the plain semiring path with explicit alpha/beta is supported here
 301  ASSERT(func == NULL && alpha != NULL && beta != NULL);
 302  assert(func == NULL && alpha != NULL && beta != NULL);
 303  do {
 304  int64_t sk = A[0].k()-key_offset;
 305  ASSERT(sk >= 0);
 306  int i = sk/lda_A[rA];
// keys are visited in increasing order, so the first i >= imax ends the scan
 307  if (i>=imax) break;
 308  if (i>=imin){
 309  char tmp[sr_C->el_size];
 310  sr_C->mul(A[0].d(),
 311  B+offsets_B[0][i],
 312  tmp);
 313  sr_C->mul(tmp,
 314  alpha,
 315  tmp);
 316  sr_C->add(tmp,
 317  C+offsets_C[0][i],
 318  C+offsets_C[0][i]);
 319  }
// consume this pair and move the caller-visible iterator forward
 320  A = A[1];
 321  size_A--;
 322  } while (size_A > 0);
 323  }
 324  }
325 
// Explicit instantiation/declaration of the recursive loop at the maximal
// supported depth (MAX_ORD), the entry point selected by SWITCH_ORD_CALL in
// spA_dnB_dnC_seq_ctr below.
// NOTE(review): doc line 327 of the original file (the line naming
// spA_dnB_dnC_ctrloop<MAX_ORD>) is missing from this extracted view; the full
// signature appears in the index at the bottom of the file.
 326  template<>
 328  (char const * alpha,
 329  ConstPairIterator & A,
 330  int64_t & size_A,
 331  algstrct const * sr_A,
 332  int order_A,
 333  int const * edge_len_A,
 334  int64_t const * lda_A,
 335  int const * sym_A,
 336  int const * idx_map_A,
 337  char const * B,
 338  algstrct const * sr_B,
 339  int order_B,
 340  int const * edge_len_B,
 341  int const * sym_B,
 342  int const * idx_map_B,
 343  uint64_t *const* offsets_B,
 344  char const * beta,
 345  char * C,
 346  algstrct const * sr_C,
 347  int order_C,
 348  int const * edge_len_C,
 349  int const * sym_C,
 350  int const * idx_map_C,
 351  uint64_t *const* offsets_C,
 352  bivar_function const * func,
 353  int const * idx,
 354  int const * rev_idx_map,
 355  int idx_max);
356 
357 
// Driver for the sequential sparse-A * dense-B -> dense-C contraction:
// builds the reverse index map, handles the fully-scalar case, pre-scales C
// by beta, precomputes per-index dense offset tables and A's key strides,
// then dispatches (via SWITCH_ORD_CALL on idx_max-1) into the templated
// recursive loop above.
 358  void spA_dnB_dnC_seq_ctr(char const * alpha,
 359  char const * A,
 360  int64_t size_A,
 361  algstrct const * sr_A,
 362  int order_A,
 363  int const * edge_len_A,
 364  int const * sym_A,
 365  int const * idx_map_A,
 366  char const * B,
 367  algstrct const * sr_B,
 368  int order_B,
 369  int const * edge_len_B,
 370  int const * sym_B,
 371  int const * idx_map_B,
 372  char const * beta,
 373  char * C,
 374  algstrct const * sr_C,
 375  int order_C,
 376  int const * edge_len_C,
 377  int const * sym_C,
 378  int const * idx_map_C,
 379  bivar_function const * func){
// NOTE(review): doc line 380 of the original file is missing from this
// extracted view — presumably a TAU_FSTART profiling call; verify against
// the repository source.
 381  int idx_max;
 382  int64_t sz;
 383  int * rev_idx_map;
 384  int * dlen_A, * dlen_B, * dlen_C;
 385 
// invert the three index maps into rev_idx_map: for each global index,
// its position (or -1) within A, B and C
 386  inv_idx(order_A, idx_map_A,
 387  order_B, idx_map_B,
 388  order_C, idx_map_C,
 389  &idx_max, &rev_idx_map);
 390 
// no indices at all: all three tensors are scalars; compute
// C = beta*C + alpha*A*B directly and return
 391  if (idx_max == 0){
 392  if (alpha == NULL && beta == NULL){
 393  sr_C->mul(A, B, C);
 394  CTF_FLOPS_ADD(1);
 395  } else if (alpha == NULL){
 396  char tmp[sr_C->el_size];
 397  sr_C->mul(A, B, tmp);
 398  sr_C->mul(C, beta, C);
 399  sr_C->add(tmp, C, C);
 400  CTF_FLOPS_ADD(2);
 401  } else {
 402  char tmp[sr_C->el_size];
 403  sr_C->mul(A, B, tmp);
 404  sr_C->mul(tmp, alpha, tmp);
 405  sr_C->mul(C, beta, C);
 406  sr_C->add(tmp, C, C);
 407  CTF_FLOPS_ADD(3);
 408  }
// NOTE(review): doc line 409 is missing from this extracted view —
// presumably a TAU_FSTOP matching the missing TAU_FSTART; verify.
 410  return;
 411  }
// NOTE(review): dlen_A/B/C are allocated and filled but never read before
// being freed below — candidates for removal.
 412  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
 413  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
 414  dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
 415  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
 416  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
 417  memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);
 418 
 419 
 420 
 421  /* Scale C immediately. FIXME: wrong for iterators over subset of C */
 422  if (!sr_C->isequal(beta, sr_C->mulid())){
 423  sz = sy_packed_size(order_C, edge_len_C, sym_C);
// beta == 0 (additive identity) or unset: zero C; otherwise scale in place
 424  if (sr_C->isequal(beta, sr_C->addid()) || sr_C->isequal(beta, NULL)){
 425  sr_C->set(C, sr_C->addid(), sz);
 426  } else {
 427  sr_C->scal(sz, beta, C, 1);
 428  /*for (i=0; i<sz; i++){
 429  sr_C->mul(C+i*sr_C->el_size, beta,
 430  C+i*sr_C->el_size);
 431  }*/
 432  }
 433  }
 434  ASSERT(idx_max <= MAX_ORD);
// per-index offset tables into the packed symmetric layouts of A, B, C
// (only offsets_B and offsets_C are consumed by the loop; offsets_A is
// computed and freed without use here)
 435  uint64_t ** offsets_A;
 436  uint64_t ** offsets_B;
 437  uint64_t ** offsets_C;
 438  compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A, sr_B, order_B, edge_len_B, sym_B, idx_map_B, sr_C, order_C, edge_len_C, sym_C, idx_map_C, idx_max, rev_idx_map, offsets_A, offsets_B, offsets_C);
 439 
 440  //if we have something to parallelize without needing to replicate C
 441  //FIXME spawn threads when
 442 // if (order_C > 1 || (order_C > 0 && idx_map_C[0] != 0)){
 443  {
// idx_glb: the vector of currently fixed global index values, all zero at top
 444  int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
 445  memset(idx_glb, 0, sizeof(int)*idx_max);
 446 
 447 
// strides of A's dense index space, used to decode linearized sparse keys
 448  int64_t lda_A[order_A];
 449  for (int i=0; i<order_A; i++){
 450  if (i==0) lda_A[i] = 1;
 451  else lda_A[i] = lda_A[i-1]*edge_len_A[i-1];
 452  }
 453 
 454  ASSERT(idx_max<=MAX_ORD);
 455 
 456  ConstPairIterator pA(sr_A, A);
 457 
 458 
// dispatch to the template instantiation whose depth matches idx_max-1
 459  SWITCH_ORD_CALL(spA_dnB_dnC_ctrloop, idx_max-1, alpha, pA, size_A, sr_A, order_A, edge_len_A, lda_A, sym_A, idx_map_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, func, idx_glb, rev_idx_map, idx_max);
 460  cdealloc(idx_glb);
 461  }
// release offset tables and scratch arrays
 462  for (int l=0; l<idx_max; l++){
 463  cdealloc(offsets_A[l]);
 464  cdealloc(offsets_B[l]);
 465  cdealloc(offsets_C[l]);
 466  }
 467  cdealloc(offsets_A);
 468  cdealloc(offsets_B);
 469  cdealloc(offsets_C);
 470 
 471  CTF_int::cdealloc(dlen_A);
 472  CTF_int::cdealloc(dlen_B);
 473  CTF_int::cdealloc(dlen_C);
 474  CTF_int::cdealloc(rev_idx_map);
// NOTE(review): doc line 475 is missing from this extracted view —
// presumably a TAU_FSTOP profiling call; verify against the repository
// source.
 476  }
477 }
void compute_syoffs(algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, int tot_order, int const *rev_idx_map, uint64_t **&offsets_A, uint64_t **&offsets_B, uint64_t **&offsets_C)
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
Definition: algstrct.cxx:340
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
invert index map
Definition: ctr_tsr.cxx:592
#define MAX_ORD
Definition: util.h:103
#define ASSERT(...)
Definition: util.h:88
void * alloc(int64_t len)
alloc abstraction
Definition: memcontrol.cxx:365
Definition: common.h:37
virtual char const * addid() const
MPI datatype for pairs.
Definition: algstrct.cxx:89
untyped internal class for triply-typed bivariate function
Definition: ctr_comm.h:16
#define CTF_FLOPS_ADD(n)
Definition: util.h:138
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
Definition: algstrct.cxx:629
void spA_dnB_dnC_ctrloop< 0 >(char const *alpha, ConstPairIterator &A, int64_t &size_A, algstrct const *sr_A, int order_A, int const *edge_len_A, int64_t const *lda_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, uint64_t *const *offsets_C, bivar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
Definition: sp_seq_ctr.cxx:153
#define SWITCH_ORD_CALL(F, act_ord,...)
Definition: util.h:119
#define TAU_FSTOP(ARG)
Definition: util.h:281
#define TAU_FSTART(ARG)
Definition: util.h:280
virtual void scal(int n, char const *alpha, char *X, int incX) const
X["i"]=alpha*X["i"];.
Definition: algstrct.cxx:262
void spA_dnB_dnC_seq_ctr(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, bivar_function const *func)
Definition: sp_seq_ctr.cxx:358
virtual void add(char const *a, char const *b, char *c) const
c = a+b
Definition: algstrct.cxx:109
int el_size
size of each element of algstrct in bytes
Definition: algstrct.h:16
int cdealloc(void *ptr)
free abstraction
Definition: memcontrol.cxx:480
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
Definition: algstrct.h:34
void spA_dnB_dnC_ctrloop(char const *alpha, ConstPairIterator &A, int64_t &size_A, algstrct const *sr_A, int order_A, int const *edge_len_A, int64_t const *lda_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, uint64_t *const *offsets_C, bivar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
Definition: sp_seq_ctr.cxx:12
void spA_dnB_dnC_ctrloop< MAX_ORD >(char const *alpha, ConstPairIterator &A, int64_t &size_A, algstrct const *sr_A, int order_A, int const *edge_len_A, int64_t const *lda_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, uint64_t *const *offsets_C, bivar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
virtual void mul(char const *a, char const *b, char *c) const
c = a*b
Definition: algstrct.cxx:120
virtual char const * mulid() const
identity element for multiplication i.e. 1
Definition: algstrct.cxx:93
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
Definition: util.cxx:10