3 #include "../shared/iter_tsr.h"     4 #include "../shared/util.h"    15                         int const *             edge_len_A,
    17                         int const *             idx_map_A,
    18                         uint64_t *
const*        offsets_A,
    22                         int const *             edge_len_B,
    24                         int const *             idx_map_B,
    25                         uint64_t *
const*        offsets_B,
    28                         int const *             rev_idx_map,
    31     int rA = rev_idx_map[2*idim+0];
    32     int rB = rev_idx_map[2*idim+1];
    35       imax = edge_len_A[rA];
    37       imax = edge_len_B[rB];
    39     if (rA != -1 && sym_A[rA] != 
NS){
    42         if (idx_map_A[rrA+1] > idim)
    43           imax = idx[idx_map_A[rrA+1]]+1;
    45       } 
while (sym_A[rrA] != 
NS && idx_map_A[rrA] < idim);
    48     if (rB != -1 && sym_B[rB] != 
NS){
    51         if (idx_map_B[rrB+1] > idim)
    52           imax = std::min(imax,idx[idx_map_B[rrB+1]]+1);
    54       } 
while (sym_B[rrB] != 
NS && idx_map_B[rrB] < idim);
    59     if (rA > 0 && sym_A[rA-1] != 
NS){
    62         if (idx_map_A[rrA-1] > idim)
    63           imin = idx[idx_map_A[rrA-1]];
    65       } 
while (rrA>0 && sym_A[rrA-1] != 
NS && idx_map_A[rrA] < idim);
    68     if (rB > 0 && sym_B[rB-1] != 
NS){
    71         if (idx_map_B[rrB-1] > idim)
    72           imin = std::max(imin,idx[idx_map_B[rrB-1]]);
    74       } 
while (rrB>0 && sym_B[rrB-1] != 
NS && idx_map_B[rrB] < idim);
    81       for (
int i=imin; i<imax; i++){
    87           memcpy(nidx, idx, idx_max*
sizeof(
int));
    89           sym_seq_sum_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, func, nidx, rev_idx_map, idx_max);
    93       for (
int i=imin; i<imax; i++){
    95         memcpy(nidx, idx, idx_max*
sizeof(
int));
    97         sym_seq_sum_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, func, nidx, rev_idx_map, idx_max);
   111                         int const *             edge_len_A,
   113                         int const *             idx_map_A,
   114                         uint64_t *
const*        offsets_A,
   118                         int const *             edge_len_B,
   120                         int const *             idx_map_B,
   121                         uint64_t *
const*        offsets_B,
   124                         int const *             rev_idx_map,
   127     int rA = rev_idx_map[0];
   128     int rB = rev_idx_map[1];
   131       imax = edge_len_A[rA];
   133       imax = edge_len_B[rB];
   135     if (rA != -1 && sym_A[rA] != 
NS)
   136       imax = idx[idx_map_A[rA+1]]+1;
   137     if (rB != -1 && sym_B[rB] != 
NS)
   138       imax = std::min(imax,idx[idx_map_B[rB+1]]+1);
   142     if (rA > 0 && sym_A[rA-1] != 
NS)
   143       imin = idx[idx_map_A[rA-1]];
   144     if (rB > 0 && sym_B[rB-1] != 
NS)
   145       imin = std::max(imin,idx[idx_map_B[rB-1]]);
   149         for (
int i=imin; i<imax; i++){
   150           sr_B->add(A+offsets_A[0][i],
   156         for (
int i=imin; i<imax; i++){
   157           char tmp[sr_A->el_size];
   158           sr_A->mul(A+offsets_A[0][i], 
   176                         int const *             edge_len_A,
   178                         int const *             idx_map_A,
   179                         uint64_t *
const*        offsets_A,
   183                         int const *             edge_len_B,
   185                         int const *             idx_map_B,
   186                         uint64_t *
const*        offsets_B,
   189                         int const *       rev_idx_map,
   195                       int const *      edge_len_A,
   197                       int const *      idx_map_A,
   200                       int const *      edge_len_B,
   202                       int const *      idx_map_B,
   204                       int const *      rev_idx_map,
   205                       uint64_t **&     offsets_A,
   206                       uint64_t **&     offsets_B){
   208     offsets_A = (uint64_t**)
CTF_int::alloc(
sizeof(uint64_t*)*tot_order);
   209     offsets_B = (uint64_t**)
CTF_int::alloc(
sizeof(uint64_t*)*tot_order);
   211     for (
int idim=0; idim<tot_order; idim++){
   214       int rA = rev_idx_map[2*idim+0];
   215       int rB = rev_idx_map[2*idim+1];
   218         len = edge_len_A[rA];
   220         len = edge_len_B[rB];
   222       offsets_A[idim] = (uint64_t*)
CTF_int::alloc(
sizeof(uint64_t)*len);
   223       offsets_B[idim] = (uint64_t*)
CTF_int::alloc(
sizeof(uint64_t)*len);
   224       compute_syoff(rA, len, sr_A, edge_len_A, sym_A, offsets_A[idim]);
   225       compute_syoff(rB, len, sr_B, edge_len_B, sym_B, offsets_B[idim]);
   231   #define SCAL_B do {                                                      \   232     if (!sr_B->isequal(beta, sr_B->mulid())){                                 \   233       memset(idx_glb, 0, sizeof(int)*idx_max);                             \   234       idx_A = 0, idx_B = 0;                                                \   238           sr_B->mul(beta, B+idx_B*sr_B->el_size, B+idx_B*sr_B->el_size);      \   241         for (idx=0; idx<idx_max; idx++){                                   \   242           imin = 0, imax = INT_MAX;                                        \   243           GET_MIN_MAX(B,1,2);                                              \   244           if (rev_idx_map[2*idx+1] == -1) imax = imin+1;                   \   246           if (idx_glb[idx] >= imax){                                       \   247              idx_glb[idx] = imin;                                          \   249           if (idx_glb[idx] != imin) {                                      \   253         if (idx == idx_max) break;                                         \   255         if (!sym_pass) continue;                                           \   262   #define SCAL_B_inr do {                                                  \   263     if (!sr_B->isequal(beta, sr_B->mulid())){                                 \   264       memset(idx_glb, 0, sizeof(int)*idx_max);                             \   265       idx_A = 0, idx_B = 0;                                                \   269           sr_B->scal(inr_stride, beta, B+idx_B*inr_stride*sr_B->el_size, 1); \   270           CTF_FLOPS_ADD(inr_stride);                                       \   272         for (idx=0; idx<idx_max; idx++){                                   \   273           imin = 0, imax = INT_MAX;                                        \   274           GET_MIN_MAX(B,1,2);                                              \   275           if (rev_idx_map[2*idx+1] == -1) imax = imin+1;                   \   277           if (idx_glb[idx] >= imax){                                       \   278              idx_glb[idx] = imin;                                          \   280           if (idx_glb[idx] != imin) {                                      \   284         if (idx == idx_max) break;                                         \   286         if (!sym_pass) continue;                                           \   296                        int const *      edge_len_A,
   298                        int const *      idx_map_A,
   303                        int const *      edge_len_B,
   305                        int const *      idx_map_B){
   307     int idx, i, idx_max, imin, imax, iA, iB, j, k;
   308     int off_idx, sym_pass;
   310     int * dlen_A, * dlen_B;
   311     int64_t idx_A, idx_B, off_lda;
   315             &idx_max,     &rev_idx_map);
   317     bool rep_idx = 
false;
   318     for (i=0; i<order_A; i++){
   319       for (j=0; j<order_A; j++){
   320         if (i!=j && idx_map_A[i] == idx_map_A[j]) rep_idx = 
true;
   323     for (i=0; i<order_B; i++){
   324       for (j=0; j<order_B; j++){
   325         if (i!=j && idx_map_B[i] == idx_map_B[j]) rep_idx = 
true;
   331     memcpy(dlen_A, edge_len_A, 
sizeof(
int)*order_A);
   332     memcpy(dlen_B, edge_len_B, 
sizeof(
int)*order_B);
   335     memset(idx_glb, 0, 
sizeof(
int)*idx_max);
   342       if (beta != NULL || sr_B->
mulid() != NULL){
   346           sr_B->
scal(sz_B, beta, B, 1);
   351     memset(idx_glb, 0, 
sizeof(
int)*idx_max);
   352     if (!rep_idx && idx_max>0 && idx_max <= 
MAX_ORD){
   353       uint64_t ** offsets_A;
   354       uint64_t ** offsets_B;
   355       compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A, sr_B, order_B, edge_len_B, sym_B, idx_map_B, idx_max, rev_idx_map, offsets_A, offsets_B);
   356       if (order_B > 1 || (order_B > 0 && idx_map_B[0] != 0)){
   362           memset(nidx_glb, 0, 
sizeof(
int)*idx_max);
   364           SWITCH_ORD_CALL(
sym_seq_sum_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, NULL, nidx_glb, rev_idx_map, idx_max);
   368         SWITCH_ORD_CALL(
sym_seq_sum_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, NULL, idx_glb, rev_idx_map, idx_max);
   370       for (
int l=0; l<idx_max; l++){
   377       idx_A = 0, idx_B = 0;
   396         for (idx=0; idx<idx_max; idx++){
   397           imin = 0, imax = INT_MAX;
   402           ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
   406           if (idx_glb[idx] >= imax){
   409           if (idx_glb[idx] != imin) {
   413         if (idx == idx_max) 
break;
   416         if (!sym_pass) 
continue;
   418         if (!sym_pass) 
continue;
   438                        int const *      edge_len_A,
   440                        int const *      idx_map_A,
   445                        int const *      edge_len_B,
   447                        int const *      idx_map_B,
   450     int idx, i, idx_max, imin, imax, iA, iB, j, k;
   451     int off_idx, sym_pass;
   452     int * idx_glb, * rev_idx_map;
   453     int * dlen_A, * dlen_B;
   454     int64_t idx_A, idx_B, off_lda;
   458             &idx_max,     &rev_idx_map);
   462     memcpy(dlen_A, edge_len_A, 
sizeof(
int)*order_A);
   463     memcpy(dlen_B, edge_len_B, 
sizeof(
int)*order_B);
   469     memset(idx_glb, 0, 
sizeof(
int)*idx_max);
   471     idx_A = 0, idx_B = 0;
   483         sr_B->
axpy(inr_stride, alpha, A+idx_A*sr_A->
el_size*inr_stride, 1, B+idx_B*sr_B->
el_size*inr_stride, 1); 
   487       for (idx=0; idx<idx_max; idx++){
   488         imin = 0, imax = INT_MAX;
   493         ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
   497         if (idx_glb[idx] >= imax){
   500         if (idx_glb[idx] != imin) {
   504       if (idx == idx_max) 
break;
   507       if (!sym_pass) 
continue;
   509       if (!sym_pass) 
continue;
   528                        int const *             edge_len_A,
   530                        int const *             idx_map_A,
   535                        int const *             edge_len_B,
   537                        int const *             idx_map_B,
   540     int idx, i, idx_max, imin, imax, iA, iB, j, k;
   541     int off_idx, sym_pass;
   542     int * idx_glb, * rev_idx_map;
   543     int * dlen_A, * dlen_B;
   544     int64_t idx_A, idx_B, off_lda;
   548             &idx_max,     &rev_idx_map);
   552     memcpy(dlen_A, edge_len_A, 
sizeof(
int)*order_A);
   553     memcpy(dlen_B, edge_len_B, 
sizeof(
int)*order_B);
   556     memset(idx_glb, 0, 
sizeof(
int)*idx_max);
   560     idx_A = 0, idx_B = 0;
   566           sr_A->
mul(A+sr_A->
el_size*idx_A, alpha, tmp_A);
   579       for (idx=0; idx<idx_max; idx++){
   580         imin = 0, imax = INT_MAX;
   585         ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
   589         if (idx_glb[idx] >= imax){
   592         if (idx_glb[idx] != imin) {
   596       if (idx == idx_max) 
break;
   599       if (!sym_pass) 
continue;
   601       if (!sym_pass) 
continue;
 
void compute_syoffs(algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, int tot_order, int const *rev_idx_map, uint64_t **&offsets_A, uint64_t **&offsets_B, uint64_t **&offsets_C)
virtual bool isequal(char const *a, char const *b) const 
returns true if algstrct elements a and b are equal 
int sym_seq_sum_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B)
performs symmetric contraction with unblocked reference kernel 
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
invert index map 
void * alloc(int64_t len)
alloc abstraction 
untyped internal class for doubly-typed univariate function 
virtual char const * addid() const 
MPI datatype for pairs. 
#define GET_MIN_MAX(__X, nr, wd)                                                                                                
virtual void set(char *a, char const *b, int64_t n) const 
sets n elements of array a to value b 
virtual void acc_f(char const *a, char *b, CTF_int::algstrct const *sr_B) const 
compute b = b+f(a) 
#define SWITCH_ORD_CALL(F, act_ord,...)
virtual void scal(int n, char const *alpha, char *X, int incX) const 
X["i"]=alpha*X["i"];. 
template void sym_seq_sum_loop< MAX_ORD >(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, uint64_t *const *offsets_A, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, univar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
virtual void axpy(int n, char const *alpha, char const *X, int incX, char *Y, int incY) const 
Y["i"]+=alpha*X["i"];. 
void sym_seq_sum_loop< 0 >(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, uint64_t *const *offsets_A, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, univar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
virtual void add(char const *a, char const *b, char *c) const 
c = a+b 
int sym_seq_sum_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, univar_function const *func)
performs symmetric summation with custom elementwise function 
int el_size
size of each element of algstrct in bytes 
int cdealloc(void *ptr)
free abstraction 
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
int sym_seq_sum_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, int inr_stride)
performs symmetric summation with blocked daxpy 
void compute_syoff(int r, int len, algstrct const *sr, int const *edge_len, int const *sym, uint64_t *offsets)
void sym_seq_sum_loop(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, uint64_t *const *offsets_A, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, univar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
virtual void mul(char const *a, char const *b, char *c) const 
c = a*b 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout