Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
sym_seq_ctr.cxx
/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/

#include "../shared/iter_tsr.h"
#include <limits.h>
#include "sym_seq_ctr.h"
#include "../shared/offload.h"
#include "../shared/util.h"

namespace CTF_int{

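  // Recursive kernel for one level of the contraction loop nest: idim is the
  // dimension iterated at this level. The bounds [imin,imax) start from the
  // edge lengths and are tightened wherever a symmetry (sym != NS) ties this
  // index to an adjacent one in A, B, or C; offsets_* hold precomputed byte
  // offsets so each recursive call advances raw char pointers directly.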
  template <int idim>
  void sym_seq_ctr_loop(char const *     alpha,
                        char const *     A,
                        algstrct const * sr_A,
                        int              order_A,
                        int const *      edge_len_A,
                        int const *      sym_A,
                        int const *      idx_map_A,
                        uint64_t *const* offsets_A,
                        char const *     B,
                        algstrct const * sr_B,
                        int              order_B,
                        int const *      edge_len_B,
                        int const *      sym_B,
                        int const *      idx_map_B,
                        uint64_t *const* offsets_B,
                        char const *     beta,
                        char *           C,
                        algstrct const * sr_C,
                        int              order_C,
                        int const *      edge_len_C,
                        int const *      sym_C,
                        int const *      idx_map_C,
                        uint64_t *const* offsets_C,
                        bivar_function const * func,
                        int const *      idx,
                        int const *      rev_idx_map,
                        int              idx_max){
    int imax=0;
    int rA = rev_idx_map[3*idim+0];
    int rB = rev_idx_map[3*idim+1];
    int rC = rev_idx_map[3*idim+2];

    if (rA != -1)
      imax = edge_len_A[rA];
    else if (rB != -1)
      imax = edge_len_B[rB];
    else if (rC != -1)
      imax = edge_len_C[rC];

    if (rA != -1 && sym_A[rA] != NS){
      int rrA = rA;
      do {
        if (idx_map_A[rrA+1] > idim)
          imax = idx[idx_map_A[rrA+1]]+1;
        rrA++;
      } while (sym_A[rrA] != NS && idx_map_A[rrA] < idim);
    }

    if (rB != -1 && sym_B[rB] != NS){
      int rrB = rB;
      do {
        if (idx_map_B[rrB+1] > idim)
          imax = std::min(imax,idx[idx_map_B[rrB+1]]+1);
        rrB++;
      } while (sym_B[rrB] != NS && idx_map_B[rrB] < idim);
    }

    if (rC != -1 && sym_C[rC] != NS){
      int rrC = rC;
      do {
        if (idx_map_C[rrC+1] > idim)
          imax = std::min(imax,idx[idx_map_C[rrC+1]]+1);
        rrC++;
      } while (sym_C[rrC] != NS && idx_map_C[rrC] < idim);
    }

    int imin = 0;

    if (rA > 0 && sym_A[rA-1] != NS){
      int rrA = rA;
      do {
        if (idx_map_A[rrA-1] > idim)
          imin = idx[idx_map_A[rrA-1]];
        rrA--;
      } while (rrA>0 && sym_A[rrA-1] != NS && idx_map_A[rrA] < idim);
    }

    if (rB > 0 && sym_B[rB-1] != NS){
      int rrB = rB;
      do {
        if (idx_map_B[rrB-1] > idim)
          imin = std::max(imin,idx[idx_map_B[rrB-1]]);
        rrB--;
      } while (rrB>0 && sym_B[rrB-1] != NS && idx_map_B[rrB] < idim);
    }

    if (rC > 0 && sym_C[rC-1] != NS){
      int rrC = rC;
      do {
        if (idx_map_C[rrC-1] > idim)
          imin = std::max(imin,idx[idx_map_C[rrC-1]]);
        rrC--;
      } while (rrC>0 && sym_C[rrC-1] != NS && idx_map_C[rrC] < idim);
    }

    if (rC != -1){
#ifdef USE_OMP
      #pragma omp for
#endif
      for (int i=imin; i<imax; i++){
#ifdef USE_OMP
        #pragma omp parallel
#endif
        {
          int nidx[idx_max];
          memcpy(nidx, idx, idx_max*sizeof(int));
          nidx[idim] = i;
          sym_seq_ctr_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A,
                                   B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B,
                                   beta, C+offsets_C[idim][nidx[idim]], sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C,
                                   func, nidx, rev_idx_map, idx_max);
        }
      }
    } else {
      for (int i=imin; i<imax; i++){
        int nidx[idx_max];
        memcpy(nidx, idx, idx_max*sizeof(int));
        nidx[idim] = i;
        sym_seq_ctr_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A,
                                 B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B,
                                 beta, C+offsets_C[idim][nidx[idim]], sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C,
                                 func, nidx, rev_idx_map, idx_max);
      }
    }
    // idx[idim] = 0;
  }


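  // Base case of the recursion (idim == 0): the innermost index performs the
  // actual multiply-accumulate, either with the algebraic structure's mul/add
  // or with a user-supplied bivariate function.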
  template <>
  void sym_seq_ctr_loop<0>(char const *     alpha,
                           char const *     A,
                           algstrct const * sr_A,
                           int              order_A,
                           int const *      edge_len_A,
                           int const *      sym_A,
                           int const *      idx_map_A,
                           uint64_t *const* offsets_A,
                           char const *     B,
                           algstrct const * sr_B,
                           int              order_B,
                           int const *      edge_len_B,
                           int const *      sym_B,
                           int const *      idx_map_B,
                           uint64_t *const* offsets_B,
                           char const *     beta,
                           char *           C,
                           algstrct const * sr_C,
                           int              order_C,
                           int const *      edge_len_C,
                           int const *      sym_C,
                           int const *      idx_map_C,
                           uint64_t *const* offsets_C,
                           bivar_function const * func,
                           int const *      idx,
                           int const *      rev_idx_map,
                           int              idx_max){
    int imax=0;
    int rA = rev_idx_map[0];
    int rB = rev_idx_map[1];
    int rC = rev_idx_map[2];

    if (rA != -1)
      imax = edge_len_A[rA];
    else if (rB != -1)
      imax = edge_len_B[rB];
    else if (rC != -1)
      imax = edge_len_C[rC];

    if (rA != -1 && sym_A[rA] != NS)
      imax = idx[idx_map_A[rA+1]]+1;
    if (rB != -1 && sym_B[rB] != NS)
      imax = std::min(imax,idx[idx_map_B[rB+1]]+1);
    if (rC != -1 && sym_C[rC] != NS)
      imax = std::min(imax,idx[idx_map_C[rC+1]]+1);

    int imin = 0;

    if (rA > 0 && sym_A[rA-1] != NS)
      imin = idx[idx_map_A[rA-1]];
    if (rB > 0 && sym_B[rB-1] != NS)
      imin = std::max(imin,idx[idx_map_B[rB-1]]);
    if (rC > 0 && sym_C[rC-1] != NS)
      imin = std::max(imin,idx[idx_map_C[rC-1]]);

/*  int tid, ntd;
    tid = omp_get_thread_num();
    ntd = omp_get_num_threads();
    printf("-> %d/%d %d %d %d\n",tid,ntd,func==NULL, alpha==NULL,beta==NULL);*/

    if (func == NULL){
      /*if (alpha == NULL && beta == NULL){
        for (int i=imin; i<imax; i++){
          sr_C->mul(A+offsets_A[0][i],
                    B+offsets_B[0][i],
                    C+offsets_C[0][i]);
        }
        CTF_FLOPS_ADD(imax-imin);
      } else*/
      if (alpha == NULL || sr_C->isequal(alpha,sr_C->mulid())){
        for (int i=imin; i<imax; i++){
          char tmp[sr_C->el_size];
          sr_C->mul(A+offsets_A[0][i],
                    B+offsets_B[0][i],
                    tmp);
          sr_C->add(tmp,
                    C+offsets_C[0][i],
                    C+offsets_C[0][i]);
        }
        CTF_FLOPS_ADD(2*(imax-imin));
      } else {
        for (int i=imin; i<imax; i++){
          char tmp[sr_C->el_size];
          sr_C->mul(A+offsets_A[0][i],
                    B+offsets_B[0][i],
                    tmp);
          sr_C->mul(tmp,
                    alpha,
                    tmp);
          sr_C->add(tmp,
                    C+offsets_C[0][i],
                    C+offsets_C[0][i]);
        }
        CTF_FLOPS_ADD(3*(imax-imin));
      }
    } else {
      /*if (alpha == NULL && beta == NULL){
        for (int i=imin; i<imax; i++){
          func->apply_f(A+offsets_A[0][i],
                        B+offsets_B[0][i],
                        C+offsets_C[0][i]);
        }
        CTF_FLOPS_ADD(imax-imin);
      } else*/
      if (alpha == NULL || sr_C->isequal(alpha,sr_C->mulid())){
        for (int i=imin; i<imax; i++){
          func->acc_f(A+offsets_A[0][i],
                      B+offsets_B[0][i],
                      C+offsets_C[0][i],
                      sr_C);
        }
        CTF_FLOPS_ADD(2*(imax-imin));
      } else {
        //ASSERT(0);
        //assert(0);
        //printf("HERTE alpha = %d\n",*(int*)alpha);
        for (int i=imin; i<imax; i++){
          char tmp[sr_C->el_size];
          func->apply_f(A+offsets_A[0][i],
                        B+offsets_B[0][i],
                        tmp);
          sr_C->mul(tmp,
                    alpha,
                    tmp);
          sr_C->add(tmp,
                    C+offsets_C[0][i],
                    C+offsets_C[0][i]);
        }
        CTF_FLOPS_ADD(3*(imax-imin));
      }
    }
  }

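  // Explicit instantiation at the maximal supported depth; SWITCH_ORD_CALL
  // below dispatches to sym_seq_ctr_loop<idx_max-1> for any idx_max <= MAX_ORD.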
  template
  void sym_seq_ctr_loop<MAX_ORD>(char const *     alpha,
                                 char const *     A,
                                 algstrct const * sr_A,
                                 int              order_A,
                                 int const *      edge_len_A,
                                 int const *      sym_A,
                                 int const *      idx_map_A,
                                 uint64_t *const* offsets_A,
                                 char const *     B,
                                 algstrct const * sr_B,
                                 int              order_B,
                                 int const *      edge_len_B,
                                 int const *      sym_B,
                                 int const *      idx_map_B,
                                 uint64_t *const* offsets_B,
                                 char const *     beta,
                                 char *           C,
                                 algstrct const * sr_C,
                                 int              order_C,
                                 int const *      edge_len_C,
                                 int const *      sym_C,
                                 int const *      idx_map_C,
                                 uint64_t *const* offsets_C,
                                 bivar_function const * func,
                                 int const *      idx,
                                 int const *      rev_idx_map,
                                 int              idx_max);


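  // Computes, for one loop index, the byte offset of each of its len possible
  // values within the packed tensor layout: zero if the tensor does not carry
  // this index (r == -1), a fixed stride when the preceding index is
  // nonsymmetric, and a per-value sy_packed_size evaluation otherwise.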
  void compute_syoff(int              r,
                     int              len,
                     algstrct const * sr,
                     int const *      edge_len,
                     int const *      sym,
                     uint64_t *       offsets){
    if (r == -1){
      std::fill(offsets, offsets+len, 0);
    } else if (r == 0){
      for (int i=0; i<len; i++){
        offsets[i] = i*sr->el_size;
      }
    } else if (sym[r-1] == NS){
      int64_t sz = sy_packed_size(r, edge_len, sym)*sr->el_size;
      for (int i=0; i<len; i++){
        offsets[i] = i*sz;
      }
    } else {
      int medge_len[r+1];
      memcpy(medge_len, edge_len, r*sizeof(int));
      int rr = r-1;
      while (rr>0 && sym[rr-1] != NS) rr--;
      for (int i=0; i<len; i++){
        std::fill(medge_len+rr,medge_len+r+1, i);
        int64_t sz = sy_packed_size(r+1, medge_len, sym)*sr->el_size;
        offsets[i] = sz;
      }
    }
  }


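  // Allocates and fills per-dimension offset tables for A, B, and C; each
  // offsets_X[idim][i] is the byte displacement of value i of global index
  // idim within tensor X.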
  void compute_syoffs(algstrct const * sr_A,
                      int              order_A,
                      int const *      edge_len_A,
                      int const *      sym_A,
                      int const *      idx_map_A,
                      algstrct const * sr_B,
                      int              order_B,
                      int const *      edge_len_B,
                      int const *      sym_B,
                      int const *      idx_map_B,
                      algstrct const * sr_C,
                      int              order_C,
                      int const *      edge_len_C,
                      int const *      sym_C,
                      int const *      idx_map_C,
                      int              tot_order,
                      int const *      rev_idx_map,
                      uint64_t **&     offsets_A,
                      uint64_t **&     offsets_B,
                      uint64_t **&     offsets_C){
    TAU_FSTART(compute_syoffs);
    offsets_A = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);
    offsets_B = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);
    offsets_C = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);

    for (int idim=0; idim<tot_order; idim++){
      int len=0;

      int rA = rev_idx_map[3*idim+0];
      int rB = rev_idx_map[3*idim+1];
      int rC = rev_idx_map[3*idim+2];

      if (rA != -1)
        len = edge_len_A[rA];
      else if (rB != -1)
        len = edge_len_B[rB];
      else if (rC != -1)
        len = edge_len_C[rC];

      offsets_A[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
      offsets_B[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
      offsets_C[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
      compute_syoff(rA, len, sr_A, edge_len_A, sym_A, offsets_A[idim]);
      compute_syoff(rB, len, sr_B, edge_len_B, sym_B, offsets_B[idim]);
      compute_syoff(rC, len, sr_C, edge_len_C, sym_C, offsets_C[idim]);
    }
    TAU_FSTOP(compute_syoffs);
  }

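  // Reference kernel: contracts A and B into C elementwise using the algstrct
  // mul/add. For loop depths up to MAX_ORD it runs the unrolled recursive
  // kernel with precomputed offsets; otherwise it falls back to an
  // odometer-style iteration over the packed index space.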
  int sym_seq_ctr_ref(char const *     alpha,
                      char const *     A,
                      algstrct const * sr_A,
                      int              order_A,
                      int const *      edge_len_A,
                      int const *      sym_A,
                      int const *      idx_map_A,
                      char const *     B,
                      algstrct const * sr_B,
                      int              order_B,
                      int const *      edge_len_B,
                      int const *      sym_B,
                      int const *      idx_map_B,
                      char const *     beta,
                      char *           C,
                      algstrct const * sr_C,
                      int              order_C,
                      int const *      edge_len_C,
                      int const *      sym_C,
                      int const *      idx_map_C){
    TAU_FSTART(sym_seq_ctr_ref);
    int idx, i, idx_max, imin, imax, iA, iB, iC, j, k;
    int64_t sz;
    int off_idx, sym_pass;
    int * rev_idx_map;
    int * dlen_A, * dlen_B, * dlen_C;
    int64_t idx_A, idx_B, idx_C, off_lda;

    inv_idx(order_A, idx_map_A,
            order_B, idx_map_B,
            order_C, idx_map_C,
            &idx_max, &rev_idx_map);

    if (idx_max == 0){
      if (alpha == NULL && beta == NULL){
        sr_C->mul(A, B, C);
        CTF_FLOPS_ADD(1);
      } else if (alpha == NULL){
        char tmp[sr_C->el_size];
        sr_C->mul(A, B, tmp);
        sr_C->mul(C, beta, C);
        sr_C->add(tmp, C, C);
        CTF_FLOPS_ADD(2);
      } else {
        char tmp[sr_C->el_size];
        sr_C->mul(A, B, tmp);
        sr_C->mul(tmp, alpha, tmp);
        sr_C->mul(C, beta, C);
        sr_C->add(tmp, C, C);
        CTF_FLOPS_ADD(3);
      }
      TAU_FSTOP(sym_seq_ctr_ref);
      return 0;
    }
    dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
    dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
    dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
    memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
    memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
    memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);

    /* Scale C immediately. FIXME: wrong for iterators over subset of C */
    if (!sr_C->isequal(beta, sr_C->mulid())){
      sz = sy_packed_size(order_C, edge_len_C, sym_C);
      if (sr_C->isequal(beta, sr_C->addid()) || sr_C->isequal(beta, NULL)){
        sr_C->set(C, sr_C->addid(), sz);
      } else {
        sr_C->scal(sz, beta, C, 1);
        /*for (i=0; i<sz; i++){
          sr_C->mul(C+i*sr_C->el_size, beta,
                    C+i*sr_C->el_size);
        }*/
      }
    }
    if (idx_max <= MAX_ORD){
      uint64_t ** offsets_A;
      uint64_t ** offsets_B;
      uint64_t ** offsets_C;
      compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A,
                     sr_B, order_B, edge_len_B, sym_B, idx_map_B,
                     sr_C, order_C, edge_len_C, sym_C, idx_map_C,
                     idx_max, rev_idx_map, offsets_A, offsets_B, offsets_C);

      //if we have something to parallelize without needing to replicate C
      if (order_C > 1 || (order_C > 0 && idx_map_C[0] != 0)){
#ifdef USE_OMP
        #pragma omp parallel
#endif
        {
          int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
          memset(idx_glb, 0, sizeof(int)*idx_max);

          SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1,
                          alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A,
                          B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B,
                          beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C,
                          NULL, idx_glb, rev_idx_map, idx_max);
          cdealloc(idx_glb);
        }
      } else {
        {
          int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
          memset(idx_glb, 0, sizeof(int)*idx_max);

          SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1,
                          alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A,
                          B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B,
                          beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C,
                          NULL, idx_glb, rev_idx_map, idx_max);
          cdealloc(idx_glb);
        }
      }
      for (int l=0; l<idx_max; l++){
        cdealloc(offsets_A[l]);
        cdealloc(offsets_B[l]);
        cdealloc(offsets_C[l]);
      }
      cdealloc(offsets_A);
      cdealloc(offsets_B);
      cdealloc(offsets_C);
    } else {
      int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
      memset(idx_glb, 0, sizeof(int)*idx_max);

      idx_A = 0, idx_B = 0, idx_C = 0;
      sym_pass = 1;
      for (;;){
        //printf("[%d] <- [%d]*[%d]\n",idx_C, idx_A, idx_B);
        if (sym_pass){
          /*if (alpha == NULL && beta == NULL){
            sr_C->mul(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size,
                      C+idx_C*sr_C->el_size);
            CTF_FLOPS_ADD(1);
          } else*/ if (alpha == NULL){
            char tmp[sr_C->el_size];
            sr_C->mul(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size,
                      tmp);
            sr_C->add(tmp, C+idx_C*sr_C->el_size, C+idx_C*sr_C->el_size);
            CTF_FLOPS_ADD(2);
          } else {
            char tmp[sr_C->el_size];
            sr_C->mul(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size,
                      tmp);
            sr_C->mul(tmp, alpha, tmp);
            sr_C->add(tmp, C+idx_C*sr_C->el_size, C+idx_C*sr_C->el_size);
            CTF_FLOPS_ADD(3);
          }
        }
        //printf("[%lf] <- [%lf]*[%lf]\n",C[idx_C],A[idx_A],B[idx_B]);

        for (idx=0; idx<idx_max; idx++){
          imin = 0, imax = INT_MAX;

          GET_MIN_MAX(A,0,3);
          GET_MIN_MAX(B,1,3);
          GET_MIN_MAX(C,2,3);

          ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);

          idx_glb[idx]++;

          if (idx_glb[idx] >= imax){
            idx_glb[idx] = imin;
          }
          if (idx_glb[idx] != imin) {
            break;
          }
        }
        if (idx == idx_max) break;

        CHECK_SYM(A);
        if (!sym_pass) continue;
        CHECK_SYM(B);
        if (!sym_pass) continue;
        CHECK_SYM(C);
        if (!sym_pass) continue;

        if (order_A > 0)
          RESET_IDX(A);
        if (order_B > 0)
          RESET_IDX(B);
        if (order_C > 0)
          RESET_IDX(C);
      }
      CTF_int::cdealloc(idx_glb);
    }
    CTF_int::cdealloc(dlen_A);
    CTF_int::cdealloc(dlen_B);
    CTF_int::cdealloc(dlen_C);
    CTF_int::cdealloc(rev_idx_map);
    TAU_FSTOP(sym_seq_ctr_ref);
    return 0;
  }

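  // Same driver as sym_seq_ctr_ref, but every elementwise multiply is replaced
  // by the user-supplied bivariate function func (via acc_f / apply_f).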
  int sym_seq_ctr_cust(char const *     alpha,
                       char const *     A,
                       algstrct const * sr_A,
                       int              order_A,
                       int const *      edge_len_A,
                       int const *      sym_A,
                       int const *      idx_map_A,
                       char const *     B,
                       algstrct const * sr_B,
                       int              order_B,
                       int const *      edge_len_B,
                       int const *      sym_B,
                       int const *      idx_map_B,
                       char const *     beta,
                       char *           C,
                       algstrct const * sr_C,
                       int              order_C,
                       int const *      edge_len_C,
                       int const *      sym_C,
                       int const *      idx_map_C,
                       bivar_function const * func){
    TAU_FSTART(sym_seq_ctr_cust);
    int idx, i, idx_max, imin, imax, iA, iB, iC, j, k;
    int off_idx, sym_pass;
    int * idx_glb, * rev_idx_map;
    int * dlen_A, * dlen_B, * dlen_C;
    //int64_t sz;
    int64_t idx_A, idx_B, idx_C, off_lda;
594 
595  inv_idx(order_A, idx_map_A,
596  order_B, idx_map_B,
597  order_C, idx_map_C,
598  &idx_max, &rev_idx_map);
599 
600  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
601  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
602  dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
603  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
604  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
605  memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);
606 
607  idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
608  memset(idx_glb, 0, sizeof(int)*idx_max);
609 
610  /* Scale C immediately. FIXME: wrong for iterators over subset of C */
611  /*if (beta != get_one<dtype>()) {
612  sz = sy_packed_size(order_C, edge_len_C, sym_C);
613  for (i=0; i<sz; i++){
614  C[i] = C[i]*beta;
615  }
616  }*/
617 /* if (beta != NULL && !sr_C->isequal(beta, sr_C->mulid())){
618  int64_t sz = sy_packed_size(order_C, edge_len_C, sym_C);
619  if (sr_C->isequal(beta, sr_C->addid())){
620  sr_C->set(C, sr_C->addid(), sz);
621  } else {
622  for (i=0; i<sz; i++){
623  sr_C->mul(C+i*sr_C->el_size, beta,
624  C+i*sr_C->el_size);
625  }
626  }
627  }*/
628  if (!sr_C->isequal(beta, sr_C->mulid())){
629  int64_t sz = sy_packed_size(order_C, edge_len_C, sym_C);
630  if (sr_C->isequal(beta, sr_C->addid()) || sr_C->isequal(beta, NULL)){
631  sr_C->set(C, sr_C->addid(), sz);
632  } else {
633  sr_C->scal(sz, beta, C, 1);
634  /*for (i=0; i<sz; i++){
635  sr_C->mul(C+i*sr_C->el_size, beta,
636  C+i*sr_C->el_size);
637  }*/
638  }
639  }

    if (idx_max <= MAX_ORD){
      uint64_t ** offsets_A;
      uint64_t ** offsets_B;
      uint64_t ** offsets_C;
      compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A,
                     sr_B, order_B, edge_len_B, sym_B, idx_map_B,
                     sr_C, order_C, edge_len_C, sym_C, idx_map_C,
                     idx_max, rev_idx_map, offsets_A, offsets_B, offsets_C);

      //if we have something to parallelize without needing to replicate C
      if (order_C > 1 || (order_C > 0 && idx_map_C[0] != 0)){
#ifdef USE_OMP
        #pragma omp parallel
#endif
        {
          int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
          memset(idx_glb, 0, sizeof(int)*idx_max);

          SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1,
                          alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A,
                          B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B,
                          beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C,
                          func, idx_glb, rev_idx_map, idx_max);
          cdealloc(idx_glb);
        }
      } else {
        {
          int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
          memset(idx_glb, 0, sizeof(int)*idx_max);

          SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1,
                          alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A,
                          B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B,
                          beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C,
                          func, idx_glb, rev_idx_map, idx_max);
          cdealloc(idx_glb);
        }
      }
      for (int l=0; l<idx_max; l++){
        cdealloc(offsets_A[l]);
        cdealloc(offsets_B[l]);
        cdealloc(offsets_C[l]);
      }
      cdealloc(offsets_A);
      cdealloc(offsets_B);
      cdealloc(offsets_C);
    } else {
      idx_A = 0, idx_B = 0, idx_C = 0;
      sym_pass = 1;
      for (;;){
        //printf("[%d] <- [%d]*[%d]\n",idx_C, idx_A, idx_B);
        if (sym_pass){
          /*if (alpha == NULL && beta == NULL){
            func->apply_f(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size,
                          C+idx_C*sr_C->el_size);
            CTF_FLOPS_ADD(1);
          } else */ if (alpha == NULL){
            func->acc_f(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size, C+idx_C*sr_C->el_size, sr_C);
            CTF_FLOPS_ADD(2);
          } else {
            char tmp[sr_C->el_size];
            sr_C->mul(A+idx_A*sr_A->el_size, alpha, tmp);
            func->acc_f(tmp, B+idx_B*sr_B->el_size, C+idx_C*sr_C->el_size, sr_C);
            CTF_FLOPS_ADD(3);
          }
        }

        for (idx=0; idx<idx_max; idx++){
          imin = 0, imax = INT_MAX;

          GET_MIN_MAX(A,0,3);
          GET_MIN_MAX(B,1,3);
          GET_MIN_MAX(C,2,3);

          ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);

          idx_glb[idx]++;

          if (idx_glb[idx] >= imax){
            idx_glb[idx] = imin;
          }
          if (idx_glb[idx] != imin) {
            break;
          }
        }
        if (idx == idx_max) break;

        CHECK_SYM(A);
        if (!sym_pass) continue;
        CHECK_SYM(B);
        if (!sym_pass) continue;
        CHECK_SYM(C);
        if (!sym_pass) continue;

        if (order_A > 0)
          RESET_IDX(A);
        if (order_B > 0)
          RESET_IDX(B);
        if (order_C > 0)
          RESET_IDX(C);
      }
    }
    CTF_int::cdealloc(dlen_A);
    CTF_int::cdealloc(dlen_B);
    CTF_int::cdealloc(dlen_C);
    CTF_int::cdealloc(idx_glb);
    CTF_int::cdealloc(rev_idx_map);
    TAU_FSTOP(sym_seq_ctr_cust);
    return 0;
  }

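  // Blocked variant: each "element" visited by the loop nest is a dense
  // sub-block, contracted with a (batched) gemm whose dimensions come from
  // the inner parameters prm (m, n, k, l), optionally offloaded to an
  // accelerator.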
  int sym_seq_ctr_inr(char const *     alpha,
                      char const *     A,
                      algstrct const * sr_A,
                      int              order_A,
                      int const *      edge_len_A,
                      int const *      sym_A,
                      int const *      idx_map_A,
                      char const *     B,
                      algstrct const * sr_B,
                      int              order_B,
                      int const *      edge_len_B,
                      int const *      sym_B,
                      int const *      idx_map_B,
                      char const *     beta,
                      char *           C,
                      algstrct const * sr_C,
                      int              order_C,
                      int const *      edge_len_C,
                      int const *      sym_C,
                      int const *      idx_map_C,
                      iparam const *   prm,
                      bivar_function const * func){
    TAU_FSTART(sym_seq_ctr_inner);
    int idx, i, idx_max, imin, imax, iA, iB, iC, j, k;
    int off_idx, sym_pass, stride_A, stride_B, stride_C;
    int * idx_glb, * rev_idx_map;
    int * dlen_A, * dlen_B, * dlen_C;
    int64_t idx_A, idx_B, idx_C, off_lda;

    stride_A = prm->m*prm->k*prm->l;
    stride_B = prm->k*prm->n*prm->l;
    stride_C = prm->m*prm->n*prm->l;

    inv_idx(order_A, idx_map_A,
            order_B, idx_map_B,
            order_C, idx_map_C,
            &idx_max, &rev_idx_map);

    dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
    dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
    dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
    memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
    memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
    memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);

    idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
    memset(idx_glb, 0, sizeof(int)*idx_max);

    /* Scale C immediately. WARNING: wrong for iterators over subset of C */
    if (!prm->offload){
      if (!sr_C->isequal(beta, sr_C->mulid())){
        CTF_FLOPS_ADD(prm->sz_C);
        /*for (i=0; i<prm->sz_C; i++){
          C[i] = C[i]*beta;
        }*/
        if (sr_C->isequal(beta, sr_C->addid())){
          sr_C->set(C, sr_C->addid(), prm->sz_C);
        } else {
          sr_C->scal(prm->sz_C, beta, C, 1);
        }
      }
    }
    idx_A = 0, idx_B = 0, idx_C = 0;
    sym_pass = 1;
    // int cntr=0;
    for (;;){
      if (sym_pass){
        TAU_FSTART(gemm);
        if (prm->tC == 'N'){
          if (prm->offload){
            //FIXME: Add GPU batched gemm support
            ASSERT(prm->l == 1);
            if (func == NULL){
              sr_C->offload_gemm(prm->tA, prm->tB, prm->m, prm->n, prm->k, alpha,
                                 A+idx_A*stride_A*sr_A->el_size,
                                 B+idx_B*stride_B*sr_B->el_size, sr_C->mulid(),
                                 C+idx_C*stride_C*sr_C->el_size);
            } else {
              ASSERT(sr_C->isequal(alpha,sr_C->mulid()));
              func->coffload_gemm(prm->tA, prm->tB, prm->m, prm->n, prm->k,
                                  A+idx_A*stride_A*sr_A->el_size,
                                  B+idx_B*stride_B*sr_B->el_size,
                                  C+idx_C*stride_C*sr_C->el_size);
            }
          } else {
            if (func == NULL){
              sr_C->gemm_batch(prm->tA, prm->tB, prm->l, prm->m, prm->n, prm->k, alpha,
                               A+idx_A*stride_A*sr_A->el_size,
                               B+idx_B*stride_B*sr_B->el_size, sr_C->mulid(),
                               C+idx_C*stride_C*sr_C->el_size);
            } else {
              ASSERT(prm->l == 1);
              ASSERT(sr_C->isequal(alpha,sr_C->mulid()));
              func->cgemm(prm->tA, prm->tB, prm->m, prm->n, prm->k,
                          A+idx_A*stride_A*sr_A->el_size,
                          B+idx_B*stride_B*sr_B->el_size,
                          C+idx_C*stride_C*sr_C->el_size);
            }
          }
        } else {
          if (prm->offload){
            ASSERT(prm->l == 1);
            if (func == NULL){
              sr_C->offload_gemm(prm->tB, prm->tA, prm->n, prm->m, prm->k, alpha,
                                 B+idx_B*stride_B*sr_B->el_size,
                                 A+idx_A*stride_A*sr_A->el_size, sr_C->mulid(),
                                 C+idx_C*stride_C*sr_C->el_size);
            } else {
              ASSERT(sr_C->isequal(alpha,sr_C->mulid()));
              func->coffload_gemm(prm->tB, prm->tA, prm->n, prm->m, prm->k,
                                  B+idx_B*stride_B*sr_B->el_size,
                                  A+idx_A*stride_A*sr_A->el_size,
                                  C+idx_C*stride_C*sr_C->el_size);
            }
          } else {
            if (func == NULL){
              sr_C->gemm_batch(prm->tB, prm->tA, prm->l, prm->n, prm->m, prm->k, alpha,
                               B+idx_B*stride_B*sr_B->el_size,
                               A+idx_A*stride_A*sr_A->el_size, sr_C->mulid(),
                               C+idx_C*stride_C*sr_C->el_size);
            } else {
              ASSERT(sr_C->isequal(alpha,sr_C->mulid()));
              ASSERT(prm->l == 1);
              func->cgemm(prm->tB, prm->tA, prm->n, prm->m, prm->k,
                          B+idx_B*stride_B*sr_B->el_size,
                          A+idx_A*stride_A*sr_A->el_size,
                          C+idx_C*stride_C*sr_C->el_size);
            }
          }
        }
        //printf("[%d] <- [%d]*[%d] (%d)\n",idx_C, idx_A, idx_B, cntr++);
        //printf("%c %c %c %d %d %d\n", prm->tC, prm->tA, prm->tB, prm->m, prm->n, prm->k);
        /*printf("multiplying %lf by %lf and got %lf\n",
               ((double*)(A+idx_A*stride_A*sr_A->el_size))[0],
               ((double*)(B+idx_B*stride_B*sr_B->el_size))[0],
               ((double*)(C+idx_C*stride_C*sr_C->el_size))[0]);*/
        TAU_FSTOP(gemm);
        // count n^2 FLOPS too
        CTF_FLOPS_ADD((2 * (int64_t)prm->l * (int64_t)prm->n * (int64_t)prm->m * (int64_t)(prm->k+1)));
      }
      //printf("[%ld] <- [%ld]*[%ld] (%d <- %d, %d)\n",idx_C,idx_A,idx_B,stride_C,stride_A,stride_B);

      for (idx=0; idx<idx_max; idx++){
        imin = 0, imax = INT_MAX;

        GET_MIN_MAX(A,0,3);
        GET_MIN_MAX(B,1,3);
        GET_MIN_MAX(C,2,3);

        ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);

        idx_glb[idx]++;

        if (idx_glb[idx] >= imax){
          idx_glb[idx] = imin;
        }
        if (idx_glb[idx] != imin) {
          break;
        }
      }
      if (idx == idx_max) break;

      CHECK_SYM(A);
      if (!sym_pass) continue;
      CHECK_SYM(B);
      if (!sym_pass) continue;
      CHECK_SYM(C);
      if (!sym_pass) continue;

      if (order_A > 0)
        RESET_IDX(A);
      if (order_B > 0)
        RESET_IDX(B);
      if (order_C > 0)
        RESET_IDX(C);
    }
    CTF_int::cdealloc(dlen_A);
    CTF_int::cdealloc(dlen_B);
    CTF_int::cdealloc(dlen_C);
    CTF_int::cdealloc(idx_glb);
    CTF_int::cdealloc(rev_idx_map);
    TAU_FSTOP(sym_seq_ctr_inner);
    return 0;
  }
}
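
All three drivers above fall back to the same odometer-style traversal of the packed-symmetric index space when the loop depth exceeds MAX_ORD. The snippet below is a minimal standalone sketch of that traversal; it is not part of CTF and uses none of its API, and the names (order, n, count) are purely illustrative. It enumerates non-decreasing index triples the way the fallback loops advance idx_glb between imin and imax.

// Standalone illustration (hypothetical, not CTF code): enumerate all
// non-decreasing index triples i <= j <= k with edge length n.
#include <cstdio>

int main(){
  const int order = 3;          // loop depth (idx_max in the kernels above)
  const int n     = 4;          // edge length of each dimension
  int idx[order]  = {0, 0, 0};
  int count = 0;
  for (;;){
    count++;                    // "visit": where mul/add or func->acc_f fires
    int d;
    for (d = 0; d < order; d++){
      // an SY symmetry bounds each index by its successor, analogous to
      // the imax tightening done by GET_MIN_MAX
      int imax = (d+1 < order) ? idx[d+1]+1 : n;
      idx[d]++;
      if (idx[d] >= imax) idx[d] = 0;   // carry into the next index
      if (idx[d] != 0) break;           // no carry: tuple advanced
    }
    if (d == order) break;              // odometer wrapped around: done
  }
  // prints 20 = binomial(n+order-1, order) packed tuples for n=4, order=3
  printf("visited %d packed-symmetric tuples\n", count);
  return 0;
}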