#include "../shared/iter_tsr.h"
#include "../shared/util.h"

template <int idim>
void sym_seq_sum_loop(char const *            alpha,
                      char const *            A,
                      algstrct const *        sr_A,
                      int                     order_A,
                      int const *             edge_len_A,
                      int const *             sym_A,
                      int const *             idx_map_A,
                      uint64_t * const *      offsets_A,
                      char *                  B,
                      algstrct const *        sr_B,
                      int                     order_B,
                      int const *             edge_len_B,
                      int const *             sym_B,
                      int const *             idx_map_B,
                      uint64_t * const *      offsets_B,
                      univar_function const * func,
                      int const *             idx,
                      int const *             rev_idx_map,
                      int                     idx_max){
  // positions at which global index idim appears in A and B (-1 if absent)
  int rA = rev_idx_map[2*idim+0];
  int rB = rev_idx_map[2*idim+1];

  int imax = 0;
  if (rA != -1)
    imax = edge_len_A[rA];
  else if (rB != -1)
    imax = edge_len_B[rB];

  // tighten the upper bound when idim sits inside a symmetric group of A
  if (rA != -1 && sym_A[rA] != NS){
    int rrA = rA;
    do {
      if (idx_map_A[rrA+1] > idim)
        imax = idx[idx_map_A[rrA+1]]+1;
      rrA++;
    } while (sym_A[rrA] != NS && idx_map_A[rrA] < idim);
  }

  // likewise for B, keeping the tighter of the two bounds
  if (rB != -1 && sym_B[rB] != NS){
    int rrB = rB;
    do {
      if (idx_map_B[rrB+1] > idim)
        imax = std::min(imax, idx[idx_map_B[rrB+1]]+1);
      rrB++;
    } while (sym_B[rrB] != NS && idx_map_B[rrB] < idim);
  }

  int imin = 0;

  // raise the lower bound when idim is preceded by a symmetric index
  if (rA > 0 && sym_A[rA-1] != NS){
    int rrA = rA;
    do {
      if (idx_map_A[rrA-1] > idim)
        imin = idx[idx_map_A[rrA-1]];
      rrA--;
    } while (rrA > 0 && sym_A[rrA-1] != NS && idx_map_A[rrA] < idim);
  }

  if (rB > 0 && sym_B[rB-1] != NS){
    int rrB = rB;
    do {
      if (idx_map_B[rrB-1] > idim)
        imin = std::max(imin, idx[idx_map_B[rrB-1]]);
      rrB--;
    } while (rrB > 0 && sym_B[rrB-1] != NS && idx_map_B[rrB] < idim);
  }

  // iterate index idim over [imin,imax) and recurse one dimension down,
  // advancing A and B by the precomputed packed byte offsets for this dimension
  for (int i=imin; i<imax; i++){
    int nidx[idx_max];
    memcpy(nidx, idx, idx_max*sizeof(int));
    nidx[idim] = i;
    sym_seq_sum_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, func, nidx, rev_idx_map, idx_max);
  }
}
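/* The kernel above builds the loop nest by compile-time recursion: the
   template recurses into sym_seq_sum_loop<idim-1> and an explicit
   specialization at 0 (below) terminates it, so every supported depth
   compiles to a fully unrolled nest. A minimal standalone analog of the
   pattern (illustrative sketch, not part of this file): */

#include <cstdio>

template <int d>
void nest(int const * lens, int * idx){
  for (idx[d] = 0; idx[d] < lens[d]; idx[d]++)
    nest<d-1>(lens, idx);                        // recurse one dimension down
}

template <>
void nest<0>(int const * lens, int * idx){
  for (idx[0] = 0; idx[0] < lens[0]; idx[0]++)   // innermost loop does the work
    printf("visit (%d, %d)\n", idx[0], idx[1]);
}

// e.g. int lens[2] = {2, 3}, idx[2]; nest<1>(lens, idx); visits all 6 tuples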
template <>
void sym_seq_sum_loop<0>(char const *            alpha,
                         char const *            A,
                         algstrct const *        sr_A,
                         int                     order_A,
                         int const *             edge_len_A,
                         int const *             sym_A,
                         int const *             idx_map_A,
                         uint64_t * const *      offsets_A,
                         char *                  B,
                         algstrct const *        sr_B,
                         int                     order_B,
                         int const *             edge_len_B,
                         int const *             sym_B,
                         int const *             idx_map_B,
                         uint64_t * const *      offsets_B,
                         univar_function const * func,
                         int const *             idx,
                         int const *             rev_idx_map,
                         int                     idx_max){
  // base case of the recursion: dimension 0 is the innermost loop
  int rA = rev_idx_map[0];
  int rB = rev_idx_map[1];

  int imax = 0;
  if (rA != -1)
    imax = edge_len_A[rA];
  else if (rB != -1)
    imax = edge_len_B[rB];

  if (rA != -1 && sym_A[rA] != NS)
    imax = idx[idx_map_A[rA+1]]+1;
  if (rB != -1 && sym_B[rB] != NS)
    imax = std::min(imax, idx[idx_map_B[rB+1]]+1);

  int imin = 0;
  if (rA > 0 && sym_A[rA-1] != NS)
    imin = idx[idx_map_A[rA-1]];
  if (rB > 0 && sym_B[rB-1] != NS)
    imin = std::max(imin, idx[idx_map_B[rB-1]]);

  if (alpha == NULL){
    // no scaling factor: B[i] = B[i] + A[i]
    for (int i=imin; i<imax; i++){
      sr_B->add(A+offsets_A[0][i],
                B+offsets_B[0][i],
                B+offsets_B[0][i]);
    }
  } else {
    // scale each element of A by alpha before accumulating into B
    for (int i=imin; i<imax; i++){
      char tmp[sr_A->el_size];
      sr_A->mul(A+offsets_A[0][i],
                alpha,
                tmp);
      sr_B->add(tmp,
                B+offsets_B[0][i],
                B+offsets_B[0][i]);
    }
  }
}
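/* For ordinary double arithmetic one iteration of the alpha != NULL loop
   above reduces to (illustrative sketch; the offsets are byte offsets into
   the packed arrays):

     double tmp = *(double const*)(A + offsets_A[0][i]) * *(double const*)alpha;
     *(double*)(B + offsets_B[0][i]) += tmp;

   i.e. B[i] = B[i] + alpha*A[i] along the innermost packed dimension. */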
template
void sym_seq_sum_loop< MAX_ORD >(char const * alpha, char const * A, algstrct const * sr_A, int order_A, int const * edge_len_A, int const * sym_A, int const * idx_map_A, uint64_t * const * offsets_A, char * B, algstrct const * sr_B, int order_B, int const * edge_len_B, int const * sym_B, int const * idx_map_B, uint64_t * const * offsets_B, univar_function const * func, int const * idx, int const * rev_idx_map, int idx_max);
void compute_syoffs(algstrct const * sr_A,
                    int              order_A,
                    int const *      edge_len_A,
                    int const *      sym_A,
                    int const *      idx_map_A,
                    algstrct const * sr_B,
                    int              order_B,
                    int const *      edge_len_B,
                    int const *      sym_B,
                    int const *      idx_map_B,
                    int              tot_order,
                    int const *      rev_idx_map,
                    uint64_t **&     offsets_A,
                    uint64_t **&     offsets_B){
  offsets_A = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);
  offsets_B = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);

  for (int idim=0; idim<tot_order; idim++){
    int len=0;
    int rA = rev_idx_map[2*idim+0];
    int rB = rev_idx_map[2*idim+1];
    if (rA != -1)
      len = edge_len_A[rA];
    else if (rB != -1)
      len = edge_len_B[rB];
    // precompute, for every value of this index, the byte offset it
    // contributes in the packed layouts of A and B
    offsets_A[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
    offsets_B[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
    compute_syoff(rA, len, sr_A, edge_len_A, sym_A, offsets_A[idim]);
    compute_syoff(rB, len, sr_B, edge_len_B, sym_B, offsets_B[idim]);
  }
}
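/* What the tables hold (semantics inferred from how sym_seq_sum_loop applies
   them): offsets_X[idim][i] is the byte offset contributed by giving global
   index idim the value i in the packed layout of X. For a plain dense
   order-2 tensor with edge lengths {m, n} and 8-byte elements this would be

     offsets[0][i] = (uint64_t)i * 8;         // innermost index: unit stride
     offsets[1][j] = (uint64_t)j * m * 8;     // skips a full column of m elements

   so that X + offsets[1][j] + offsets[0][i] addresses element (i, j). Inside
   a symmetric (SY-packed) group the per-value stride instead grows with i,
   following sy_packed_size. */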
// scale every packed entry of B by beta, unless beta is the multiplicative
// identity (in which case the scaling pass can be skipped entirely)
#define SCAL_B do {                                                       \
    if (!sr_B->isequal(beta, sr_B->mulid())){                             \
      memset(idx_glb, 0, sizeof(int)*idx_max);                            \
      idx_A = 0, idx_B = 0;                                               \
      sym_pass = 1;                                                       \
      for (;;){                                                           \
        if (sym_pass){                                                    \
          sr_B->mul(beta, B+idx_B*sr_B->el_size, B+idx_B*sr_B->el_size);  \
          CTF_FLOPS_ADD(1);                                               \
        }                                                                 \
        /* odometer step: advance the global index tuple of B */          \
        for (idx=0; idx<idx_max; idx++){                                  \
          imin = 0, imax = INT_MAX;                                       \
          GET_MIN_MAX(B,1,2);                                             \
          if (rev_idx_map[2*idx+1] == -1) imax = imin+1;                  \
          idx_glb[idx]++;                                                 \
          if (idx_glb[idx] >= imax){                                      \
            idx_glb[idx] = imin;                                          \
          }                                                               \
          if (idx_glb[idx] != imin) {                                     \
            break;                                                        \
          }                                                               \
        }                                                                 \
        if (idx == idx_max) break;                                        \
        CHECK_SYM(B);                                                     \
        if (!sym_pass) continue;                                          \
        /* ... (recompute the linear offset idx_B for the new tuple) */   \
      }                                                                   \
    }                                                                     \
  } while (0)

// blocked variant: each logical entry of B is a contiguous block of
// inr_stride elements, scaled with a single strided scal call
#define SCAL_B_inr do {                                                       \
    if (!sr_B->isequal(beta, sr_B->mulid())){                                 \
      memset(idx_glb, 0, sizeof(int)*idx_max);                                \
      idx_A = 0, idx_B = 0;                                                   \
      sym_pass = 1;                                                           \
      for (;;){                                                               \
        if (sym_pass){                                                        \
          sr_B->scal(inr_stride, beta, B+idx_B*inr_stride*sr_B->el_size, 1);  \
          CTF_FLOPS_ADD(inr_stride);                                          \
        }                                                                     \
        for (idx=0; idx<idx_max; idx++){                                      \
          imin = 0, imax = INT_MAX;                                           \
          GET_MIN_MAX(B,1,2);                                                 \
          if (rev_idx_map[2*idx+1] == -1) imax = imin+1;                      \
          idx_glb[idx]++;                                                     \
          if (idx_glb[idx] >= imax){                                          \
            idx_glb[idx] = imin;                                              \
          }                                                                   \
          if (idx_glb[idx] != imin) {                                         \
            break;                                                            \
          }                                                                   \
        }                                                                     \
        if (idx == idx_max) break;                                            \
        CHECK_SYM(B);                                                         \
        if (!sym_pass) continue;                                              \
        /* ... (recompute the linear offset idx_B for the new tuple) */       \
      }                                                                       \
    }                                                                         \
  } while (0)

int sym_seq_sum_ref(char const *     alpha,
                    char const *     A,
                    algstrct const * sr_A,
                    int              order_A,
                    int const *      edge_len_A,
                    int const *      sym_A,
                    int const *      idx_map_A,
                    char const *     beta,
                    char *           B,
                    algstrct const * sr_B,
                    int              order_B,
                    int const *      edge_len_B,
                    int const *      sym_B,
                    int const *      idx_map_B){
  int idx, i, idx_max, imin, imax, iA, iB, j, k;
  int off_idx, sym_pass;
  int * idx_glb, * rev_idx_map;
  int * dlen_A, * dlen_B;
  int64_t idx_A, idx_B, off_lda;

  inv_idx(order_A, idx_map_A,
          order_B, idx_map_B,
          &idx_max, &rev_idx_map);

  // detect an index repeated within the same tensor (e.g. a trace A["ii"]),
  // which rules out the precomputed-offset fast path below
  bool rep_idx = false;
  for (i=0; i<order_A; i++){
    for (j=0; j<order_A; j++){
      if (i!=j && idx_map_A[i] == idx_map_A[j]) rep_idx = true;
    }
  }
  for (i=0; i<order_B; i++){
    for (j=0; j<order_B; j++){
      if (i!=j && idx_map_B[i] == idx_map_B[j]) rep_idx = true;
    }
  }

  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);

  idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
  memset(idx_glb, 0, sizeof(int)*idx_max);

  // pre-scale B by beta with one strided scal over its packed size
  if (beta != NULL || sr_B->mulid() != NULL){
    if (!sr_B->isequal(beta, sr_B->mulid())){
      int64_t sz_B = sy_packed_size(order_B, edge_len_B, sym_B);
      sr_B->scal(sz_B, beta, B, 1);
    }
  }

  memset(idx_glb, 0, sizeof(int)*idx_max);
  // fast path: no repeated indices and few enough of them to dispatch to the
  // unrolled loop templates with precomputed offsets
  if (!rep_idx && idx_max>0 && idx_max <= MAX_ORD){
    uint64_t ** offsets_A;
    uint64_t ** offsets_B;
    compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A, sr_B, order_B, edge_len_B, sym_B, idx_map_B, idx_max, rev_idx_map, offsets_A, offsets_B);
    if (order_B > 1 || (order_B > 0 && idx_map_B[0] != 0)){
      int nidx_glb[idx_max];
      memset(nidx_glb, 0, sizeof(int)*idx_max);
      SWITCH_ORD_CALL(sym_seq_sum_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, NULL, nidx_glb, rev_idx_map, idx_max);
    } else {
      SWITCH_ORD_CALL(sym_seq_sum_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, NULL, idx_glb, rev_idx_map, idx_max);
    }
    for (int l=0; l<idx_max; l++){
      CTF_int::cdealloc(offsets_A[l]);
      CTF_int::cdealloc(offsets_B[l]);
    }
    CTF_int::cdealloc(offsets_A);
    CTF_int::cdealloc(offsets_B);
  } else {
    // general path: explicit iteration over all global index tuples
    idx_A = 0, idx_B = 0;
    sym_pass = 1;
    for (;;){
      if (sym_pass){
        if (alpha == NULL){
          sr_B->add(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size, B+idx_B*sr_B->el_size);
        } else {
          char tmp[sr_A->el_size];
          sr_A->mul(A+idx_A*sr_A->el_size, alpha, tmp);
          sr_B->add(tmp, B+idx_B*sr_B->el_size, B+idx_B*sr_B->el_size);
        }
      }
      // odometer step over the global index tuple, respecting symmetric bounds
      for (idx=0; idx<idx_max; idx++){
        imin = 0, imax = INT_MAX;
        GET_MIN_MAX(A,0,2);
        GET_MIN_MAX(B,1,2);
        ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
        idx_glb[idx]++;
        if (idx_glb[idx] >= imax){
          idx_glb[idx] = imin;
        }
        if (idx_glb[idx] != imin) {
          break;
        }
      }
      if (idx == idx_max) break;
      CHECK_SYM(A);
      if (!sym_pass) continue;
      CHECK_SYM(B);
      if (!sym_pass) continue;
      // ... (recompute idx_A and idx_B from idx_glb, dlen_A, dlen_B)
    }
  }
  CTF_int::cdealloc(dlen_A);
  CTF_int::cdealloc(dlen_B);
  CTF_int::cdealloc(idx_glb);
  CTF_int::cdealloc(rev_idx_map);
  return 0;
}
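/* The iteration scheme above is an odometer over index tuples: advance the
   first index that still has room, resetting the ones before it, and stop
   once every index wraps. A standalone analog with fixed bounds
   (illustrative sketch; in the real kernel GET_MIN_MAX recomputes imin/imax
   per position to honor symmetry): */

static bool next_tuple(int * idx_glb, int const * imax_of, int idx_max){
  int idx;
  for (idx = 0; idx < idx_max; idx++){
    idx_glb[idx]++;                     // advance this digit
    if (idx_glb[idx] >= imax_of[idx])
      idx_glb[idx] = 0;                 // wrap and carry into the next digit
    if (idx_glb[idx] != 0)
      break;                            // no carry needed: tuple advanced
  }
  return idx != idx_max;                // false once all digits have wrapped
}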
int sym_seq_sum_inr(char const *     alpha,
                    char const *     A,
                    algstrct const * sr_A,
                    int              order_A,
                    int const *      edge_len_A,
                    int const *      sym_A,
                    int const *      idx_map_A,
                    char const *     beta,
                    char *           B,
                    algstrct const * sr_B,
                    int              order_B,
                    int const *      edge_len_B,
                    int const *      sym_B,
                    int const *      idx_map_B,
                    int              inr_stride){
  int idx, i, idx_max, imin, imax, iA, iB, j, k;
  int off_idx, sym_pass;
  int * idx_glb, * rev_idx_map;
  int * dlen_A, * dlen_B;
  int64_t idx_A, idx_B, off_lda;

  inv_idx(order_A, idx_map_A,
          order_B, idx_map_B,
          &idx_max, &rev_idx_map);

  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);

  idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);

  // pre-scale B by beta, one contiguous block at a time
  SCAL_B_inr;

  memset(idx_glb, 0, sizeof(int)*idx_max);
  idx_A = 0, idx_B = 0;
  sym_pass = 1;
  for (;;){
    if (sym_pass){
      // accumulate one contiguous block of inr_stride elements of A into B
      sr_B->axpy(inr_stride, alpha,
                 A+idx_A*sr_A->el_size*inr_stride, 1,
                 B+idx_B*sr_B->el_size*inr_stride, 1);
      CTF_FLOPS_ADD(2*inr_stride);
    }
    for (idx=0; idx<idx_max; idx++){
      imin = 0, imax = INT_MAX;
      GET_MIN_MAX(A,0,2);
      GET_MIN_MAX(B,1,2);
      ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
      idx_glb[idx]++;
      if (idx_glb[idx] >= imax){
        idx_glb[idx] = imin;
      }
      if (idx_glb[idx] != imin) {
        break;
      }
    }
    if (idx == idx_max) break;
    CHECK_SYM(A);
    if (!sym_pass) continue;
    CHECK_SYM(B);
    if (!sym_pass) continue;
    // ... (recompute idx_A and idx_B from idx_glb, dlen_A, dlen_B)
  }
  CTF_int::cdealloc(dlen_A);
  CTF_int::cdealloc(dlen_B);
  CTF_int::cdealloc(idx_glb);
  CTF_int::cdealloc(rev_idx_map);
  return 0;
}
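/* For double data the blocked update above is one daxpy over a contiguous
   block of inr_stride elements, i.e. (illustrative sketch):

     double const a  = *(double const*)alpha;
     double const *x = (double const*)A + idx_A*inr_stride;
     double       *y = (double       *)B + idx_B*inr_stride;
     for (int t = 0; t < inr_stride; t++) y[t] += a * x[t];

   which is why this variant is described as symmetric summation with
   blocked daxpy. */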
int sym_seq_sum_cust(char const *            alpha,
                     char const *            A,
                     algstrct const *        sr_A,
                     int                     order_A,
                     int const *             edge_len_A,
                     int const *             sym_A,
                     int const *             idx_map_A,
                     char const *            beta,
                     char *                  B,
                     algstrct const *        sr_B,
                     int                     order_B,
                     int const *             edge_len_B,
                     int const *             sym_B,
                     int const *             idx_map_B,
                     univar_function const * func){
  int idx, i, idx_max, imin, imax, iA, iB, j, k;
  int off_idx, sym_pass;
  int * idx_glb, * rev_idx_map;
  int * dlen_A, * dlen_B;
  int64_t idx_A, idx_B, off_lda;

  inv_idx(order_A, idx_map_A,
          order_B, idx_map_B,
          &idx_max, &rev_idx_map);

  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);

  idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);

  // pre-scale B by beta entry by entry
  SCAL_B;

  memset(idx_glb, 0, sizeof(int)*idx_max);
  idx_A = 0, idx_B = 0;
  sym_pass = 1;
  for (;;){
    if (sym_pass){
      if (alpha != NULL){
        // scale the A element by alpha, then accumulate f(alpha*a) into B
        char tmp_A[sr_A->el_size];
        sr_A->mul(A+sr_A->el_size*idx_A, alpha, tmp_A);
        func->acc_f(tmp_A, B+sr_B->el_size*idx_B, sr_B);
      } else {
        func->acc_f(A+sr_A->el_size*idx_A, B+sr_B->el_size*idx_B, sr_B);
      }
    }
    for (idx=0; idx<idx_max; idx++){
      imin = 0, imax = INT_MAX;
      GET_MIN_MAX(A,0,2);
      GET_MIN_MAX(B,1,2);
      ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
      idx_glb[idx]++;
      if (idx_glb[idx] >= imax){
        idx_glb[idx] = imin;
      }
      if (idx_glb[idx] != imin) {
        break;
      }
    }
    if (idx == idx_max) break;
    CHECK_SYM(A);
    if (!sym_pass) continue;
    CHECK_SYM(B);
    if (!sym_pass) continue;
    // ... (recompute idx_A and idx_B from idx_glb, dlen_A, dlen_B)
  }
  CTF_int::cdealloc(dlen_A);
  CTF_int::cdealloc(dlen_B);
  CTF_int::cdealloc(idx_glb);
  CTF_int::cdealloc(rev_idx_map);
  return 0;
}
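/* The loop above defers the elementwise update to func->acc_f(a, b, sr_B),
   documented as computing b = b + f(a). A plain analog for f(x) = x*x over
   doubles (illustrative sketch, not a CTF interface): */

static void acc_square(char const * a, char * b){
  double const x = *(double const*)a;
  *(double*)b += x * x;               // b = b + f(a) with f(x) = x*x
}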
Referenced symbols (signatures and briefs from the CTF headers):

Kernels in this file:
  int sym_seq_sum_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B)
      performs symmetric summation with unblocked reference kernel
  int sym_seq_sum_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, int inr_stride)
      performs symmetric summation with blocked daxpy
  int sym_seq_sum_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, univar_function const *func)
      performs symmetric summation with custom elementwise function
  void sym_seq_sum_loop(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, uint64_t *const *offsets_A, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, univar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
      recursive loop kernel; specialized as sym_seq_sum_loop<0> and explicitly instantiated as sym_seq_sum_loop<MAX_ORD>

Helpers:
  void compute_syoff(int r, int len, algstrct const *sr, int const *edge_len, int const *sym, uint64_t *offsets)
  void compute_syoffs(algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, int tot_order, int const *rev_idx_map, uint64_t **&offsets_A, uint64_t **&offsets_B, uint64_t **&offsets_C)
  void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
      invert index map
  int64_t sy_packed_size(int order, const int *len, const int *sym)
      computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
  void * alloc(int64_t len)
      alloc abstraction
  int cdealloc(void *ptr)
      free abstraction

Macros:
  GET_MIN_MAX(__X, nr, wd)
  SWITCH_ORD_CALL(F, act_ord, ...)

algstrct interface (algstrct, the algebraic structure, defines the elementwise operations computed in each tensor contraction...):
  int el_size
      size of each element of algstrct in bytes
  virtual char const * addid() const
      identity element for addition i.e. 0
  virtual char const * mulid() const
      identity element for multiplication i.e. 1
  virtual bool isequal(char const *a, char const *b) const
      returns true if algstrct elements a and b are equal
  virtual void set(char *a, char const *b, int64_t n) const
      sets n elements of array a to value b
  virtual void add(char const *a, char const *b, char *c) const
      c = a + b
  virtual void mul(char const *a, char const *b, char *c) const
      c = a * b
  virtual void scal(int n, char const *alpha, char *X, int incX) const
      X["i"] = alpha*X["i"]
  virtual void axpy(int n, char const *alpha, char const *X, int incX, char *Y, int incY) const
      Y["i"] += alpha*X["i"]

univar_function (untyped internal class for doubly-typed univariate function):
  virtual void acc_f(char const *a, char *b, CTF_int::algstrct const *sr_B) const
      compute b = b + f(a)
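Usage sketch (illustrative, not from the original source): the reference
kernel can be driven directly for a dense, nonsymmetric summation. This
example assumes CTF::Ring<double> supplies the double-precision algstrct and
that NS marks a nonsymmetric index, per the CTF headers; it computes
B["ij"] = beta*B["ij"] + alpha*A["ij"] for m x n matrices.

#include "ctf.hpp"

void sum_matrices(int m, int n, double const * A, double * B){
  CTF::Ring<double> r;                 // double arithmetic as an algstrct
  double alpha = 2.0, beta = 1.0;      // B <- beta*B + alpha*A
  int lens[2] = {m, n};                // edge lengths shared by A and B
  int sym[2]  = {NS, NS};              // no symmetry on either index
  int imap[2] = {0, 1};                // both tensors indexed as "ij"
  CTF_int::sym_seq_sum_ref((char const*)&alpha, (char const*)A, &r,
                           2, lens, sym, imap,
                           (char const*)&beta, (char*)B, &r,
                           2, lens, sym, imap);
}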