#include "../shared/iter_tsr.h"
#include "../shared/offload.h"
#include "../shared/util.h"

template <int idim>
void sym_seq_ctr_loop(char const *           alpha,
                      char const *           A,
                      algstrct const *       sr_A,
                      int                    order_A,
                      int const *            edge_len_A,
                      int const *            sym_A,
                      int const *            idx_map_A,
                      uint64_t *const *      offsets_A,
                      char const *           B,
                      algstrct const *       sr_B,
                      int                    order_B,
                      int const *            edge_len_B,
                      int const *            sym_B,
                      int const *            idx_map_B,
                      uint64_t *const *      offsets_B,
                      char const *           beta,
                      char *                 C,
                      algstrct const *       sr_C,
                      int                    order_C,
                      int const *            edge_len_C,
                      int const *            sym_C,
                      int const *            idx_map_C,
                      uint64_t *const *      offsets_C,
                      bivar_function const * func,
                      int const *            idx,
                      int const *            rev_idx_map,
                      int                    idx_max){
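  /* rev_idx_map stores three entries per global contraction index: the
   * position of that index within A, B, and C respectively, or -1 if the
   * tensor does not carry it. Each template level idim handles one global
   * index and recurses on idim-1, so the depth of the loop nest is fixed at
   * compile time and the innermost level can be fully specialized. */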
  int imax=0;
  int rA = rev_idx_map[3*idim+0];
  int rB = rev_idx_map[3*idim+1];
  int rC = rev_idx_map[3*idim+2];

  if (rA != -1)
    imax = edge_len_A[rA];
  else if (rB != -1)
    imax = edge_len_B[rB];
  else if (rC != -1)
    imax = edge_len_C[rC];
  if (rA != -1 && sym_A[rA] != NS){
    int rrA = rA;
    do {
      if (idx_map_A[rrA+1] > idim)
        imax = idx[idx_map_A[rrA+1]]+1;
      rrA++;
    } while (sym_A[rrA] != NS && idx_map_A[rrA] < idim);
  }

  if (rB != -1 && sym_B[rB] != NS){
    int rrB = rB;
    do {
      if (idx_map_B[rrB+1] > idim)
        imax = std::min(imax,idx[idx_map_B[rrB+1]]+1);
      rrB++;
    } while (sym_B[rrB] != NS && idx_map_B[rrB] < idim);
  }

  if (rC != -1 && sym_C[rC] != NS){
    int rrC = rC;
    do {
      if (idx_map_C[rrC+1] > idim)
        imax = std::min(imax,idx[idx_map_C[rrC+1]]+1);
      rrC++;
    } while (sym_C[rrC] != NS && idx_map_C[rrC] < idim);
  }
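  /* Upper-bound tightening for symmetric-packed indices: only the wedge
   * i_0 <= i_1 <= ... of an SY group is stored, so when this index is
   * symmetric with a later index that an outer recursion level has already
   * fixed (idx_map[rr+1] > idim), the bound is clipped to that value + 1.
   * E.g. for A["ij"] with i <= j and j fixed, i only runs over [0, j]. */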
  int imin = 0;

  if (rA > 0 && sym_A[rA-1] != NS){
    int rrA = rA;
    do {
      if (idx_map_A[rrA-1] > idim)
        imin = idx[idx_map_A[rrA-1]];
      rrA--;
    } while (rrA>0 && sym_A[rrA-1] != NS && idx_map_A[rrA] < idim);
  }

  if (rB > 0 && sym_B[rB-1] != NS){
    int rrB = rB;
    do {
      if (idx_map_B[rrB-1] > idim)
        imin = std::max(imin,idx[idx_map_B[rrB-1]]);
      rrB--;
    } while (rrB>0 && sym_B[rrB-1] != NS && idx_map_B[rrB] < idim);
  }

  if (rC > 0 && sym_C[rC-1] != NS){
    int rrC = rC;
    do {
      if (idx_map_C[rrC-1] > idim)
        imin = std::max(imin,idx[idx_map_C[rrC-1]]);
      rrC--;
    } while (rrC>0 && sym_C[rrC-1] != NS && idx_map_C[rrC] < idim);
  }
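  /* Mirror of the upper-bound logic: if the preceding index of a symmetric
   * group was fixed by an outer level, this index may not drop below it,
   * keeping the iteration inside the packed wedge from below. */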
  if (rC != -1){
    /* when this index is mapped into C, threaded iterations write disjoint
       slices of the output (worksharing guard reconstructed) */
#ifdef USE_OMP
    #pragma omp for
#endif
    for (int i=imin; i<imax; i++){
      int nidx[idx_max];
      memcpy(nidx, idx, idx_max*sizeof(int));
      nidx[idim] = i;
      sym_seq_ctr_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C+offsets_C[idim][nidx[idim]], sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, func, nidx, rev_idx_map, idx_max);
    }
  } else {
    for (int i=imin; i<imax; i++){
      int nidx[idx_max];
      memcpy(nidx, idx, idx_max*sizeof(int));
      nidx[idim] = i;
      sym_seq_ctr_loop<idim-1>(alpha, A+offsets_A[idim][nidx[idim]], sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B+offsets_B[idim][nidx[idim]], sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C+offsets_C[idim][nidx[idim]], sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, func, nidx, rev_idx_map, idx_max);
    }
  }
}
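/* The recursion bottoms out in the explicit sym_seq_ctr_loop<0> specialization
 * below, which performs the actual multiply-accumulate over the innermost
 * fiber; offsets_X[idim][i] is the precomputed byte offset of slice i of
 * dimension idim (see compute_syoff), so packed symmetric strides are never
 * recomputed inside the loop nest. */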
template <>
void sym_seq_ctr_loop<0>(char const *           alpha,
                         char const *           A,
                         algstrct const *       sr_A,
                         int                    order_A,
                         int const *            edge_len_A,
                         int const *            sym_A,
                         int const *            idx_map_A,
                         uint64_t *const *      offsets_A,
                         char const *           B,
                         algstrct const *       sr_B,
                         int                    order_B,
                         int const *            edge_len_B,
                         int const *            sym_B,
                         int const *            idx_map_B,
                         uint64_t *const *      offsets_B,
                         char const *           beta,
                         char *                 C,
                         algstrct const *       sr_C,
                         int                    order_C,
                         int const *            edge_len_C,
                         int const *            sym_C,
                         int const *            idx_map_C,
                         uint64_t *const *      offsets_C,
                         bivar_function const * func,
                         int const *            idx,
                         int const *            rev_idx_map,
                         int                    idx_max){
  int imax=0;
  int rA = rev_idx_map[0];
  int rB = rev_idx_map[1];
  int rC = rev_idx_map[2];

  if (rA != -1)
    imax = edge_len_A[rA];
  else if (rB != -1)
    imax = edge_len_B[rB];
  else if (rC != -1)
    imax = edge_len_C[rC];

  if (rA != -1 && sym_A[rA] != NS)
    imax = idx[idx_map_A[rA+1]]+1;
  if (rB != -1 && sym_B[rB] != NS)
    imax = std::min(imax,idx[idx_map_B[rB+1]]+1);
  if (rC != -1 && sym_C[rC] != NS)
    imax = std::min(imax,idx[idx_map_C[rC+1]]+1);

  int imin = 0;

  if (rA > 0 && sym_A[rA-1] != NS)
    imin = idx[idx_map_A[rA-1]];
  if (rB > 0 && sym_B[rB-1] != NS)
    imin = std::max(imin,idx[idx_map_B[rB-1]]);
  if (rC > 0 && sym_C[rC-1] != NS)
    imin = std::max(imin,idx[idx_map_C[rC-1]]);
  if (func == NULL){
    if (alpha == NULL || sr_C->isequal(alpha,sr_C->mulid())){
      for (int i=imin; i<imax; i++){
        char tmp[sr_C->el_size];
        sr_C->mul(A+offsets_A[0][i],
                  B+offsets_B[0][i],
                  tmp);
        sr_C->add(tmp,
                  C+offsets_C[0][i],
                  C+offsets_C[0][i]);
      }
    } else {
      for (int i=imin; i<imax; i++){
        char tmp[sr_C->el_size];
        sr_C->mul(A+offsets_A[0][i],
                  B+offsets_B[0][i],
                  tmp);
        sr_C->mul(tmp, alpha, tmp);
        sr_C->add(tmp,
                  C+offsets_C[0][i],
                  C+offsets_C[0][i]);
      }
    }
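    /* The algstrct semiring operates on untyped char buffers: mul(a,b,c)
     * computes c = a*b and add(a,b,c) computes c = a+b on elements of
     * el_size bytes. When alpha is NULL or equal to the multiplicative
     * identity (mulid), the extra scaling multiply is skipped entirely. */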
  } else {
    if (alpha == NULL || sr_C->isequal(alpha,sr_C->mulid())){
      for (int i=imin; i<imax; i++){
        func->acc_f(A+offsets_A[0][i],
                    B+offsets_B[0][i],
                    C+offsets_C[0][i],
                    sr_C);
      }
    } else {
      for (int i=imin; i<imax; i++){
        char tmp[sr_C->el_size];
        func->apply_f(A+offsets_A[0][i],
                      B+offsets_B[0][i],
                      tmp);
        sr_C->mul(tmp, alpha, tmp);
        sr_C->add(tmp,
                  C+offsets_C[0][i],
                  C+offsets_C[0][i]);
      }
    }
  }
}
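/* With a user-supplied bivar_function, acc_f(a,b,c,sr_C) fuses the update
 * c = c + f(a,b) in one call; when alpha still has to be applied, apply_f
 * first materializes f(a,b) in a scratch element so it can be scaled before
 * accumulation. */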
template
void sym_seq_ctr_loop< MAX_ORD >
                     (char const *           alpha,
                      char const *           A,
                      algstrct const *       sr_A,
                      int                    order_A,
                      int const *            edge_len_A,
                      int const *            sym_A,
                      int const *            idx_map_A,
                      uint64_t *const *      offsets_A,
                      char const *           B,
                      algstrct const *       sr_B,
                      int                    order_B,
                      int const *            edge_len_B,
                      int const *            sym_B,
                      int const *            idx_map_B,
                      uint64_t *const *      offsets_B,
                      char const *           beta,
                      char *                 C,
                      algstrct const *       sr_C,
                      int                    order_C,
                      int const *            edge_len_C,
                      int const *            sym_C,
                      int const *            idx_map_C,
                      uint64_t *const *      offsets_C,
                      bivar_function const * func,
                      int const *            idx,
                      int const *            rev_idx_map,
                      int                    idx_max);
void compute_syoff(int              r,
                   int              len,
                   algstrct const * sr,
                   int const *      edge_len,
                   int const *      sym,
                   uint64_t *       offsets){
  if (r == -1){
    std::fill(offsets, offsets+len, 0);
  } else if (r == 0){
    for (int i=0; i<len; i++){
      offsets[i] = i*sr->el_size;
    }
  } else if (sym[r-1] == NS){
    int64_t sz = sy_packed_size(r, edge_len, sym)*sr->el_size;
    for (int i=0; i<len; i++){
      offsets[i] = i*sz;
    }
  } else {
    int medge_len[r+1];
    memcpy(medge_len, edge_len, r*sizeof(int));
    int rr = r;
    while (rr>0 && sym[rr-1] != NS) rr--;
    for (int i=0; i<len; i++){
      std::fill(medge_len+rr,medge_len+r+1, i);
      offsets[i] = sy_packed_size(r+1, medge_len, sym)*sr->el_size;
    }
  }
}
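/* A small standalone sketch (hypothetical values, not part of the build) of
 * what compute_syoff produces for the second index of an SY pair i <= j:
 * column j of the packed wedge is preceded by j*(j+1)/2 elements, matching
 * sy_packed_size over the group with both edge lengths set to j. */
#if 0
#include <cstdint>
#include <cstdio>
int main(){
  int      n       = 4;   // hypothetical edge length
  uint64_t el_size = 8;   // hypothetical element size in bytes
  for (int j = 0; j < n; j++){
    // byte offset of the fiber with the second (symmetric) index fixed to j
    uint64_t off = (uint64_t)j*(j+1)/2*el_size;
    printf("offsets[%d] = %llu\n", j, (unsigned long long)off);
  }
  return 0;
}
#endif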
void compute_syoffs(algstrct const * sr_A,
                    int              order_A,
                    int const *      edge_len_A,
                    int const *      sym_A,
                    int const *      idx_map_A,
                    algstrct const * sr_B,
                    int              order_B,
                    int const *      edge_len_B,
                    int const *      sym_B,
                    int const *      idx_map_B,
                    algstrct const * sr_C,
                    int              order_C,
                    int const *      edge_len_C,
                    int const *      sym_C,
                    int const *      idx_map_C,
                    int              tot_order,
                    int const *      rev_idx_map,
                    uint64_t **&     offsets_A,
                    uint64_t **&     offsets_B,
                    uint64_t **&     offsets_C){
  offsets_A = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);
  offsets_B = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);
  offsets_C = (uint64_t**)CTF_int::alloc(sizeof(uint64_t*)*tot_order);

  for (int idim=0; idim<tot_order; idim++){
    int len=0;

    int rA = rev_idx_map[3*idim+0];
    int rB = rev_idx_map[3*idim+1];
    int rC = rev_idx_map[3*idim+2];

    if (rA != -1)
      len = edge_len_A[rA];
    else if (rB != -1)
      len = edge_len_B[rB];
    else if (rC != -1)
      len = edge_len_C[rC];

    offsets_A[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
    offsets_B[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
    offsets_C[idim] = (uint64_t*)CTF_int::alloc(sizeof(uint64_t)*len);
    compute_syoff(rA, len, sr_A, edge_len_A, sym_A, offsets_A[idim]);
    compute_syoff(rB, len, sr_B, edge_len_B, sym_B, offsets_B[idim]);
    compute_syoff(rC, len, sr_C, edge_len_C, sym_C, offsets_C[idim]);
  }
}
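/* compute_syoffs builds one offset table per global index and per tensor:
 * tot_order tables of len entries each, where len is the edge length of the
 * dimension the index maps to. The callers free the tables once the loop
 * nest has run. */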
int sym_seq_ctr_ref(char const *     alpha,
                    char const *     A,
                    algstrct const * sr_A,
                    int              order_A,
                    int const *      edge_len_A,
                    int const *      sym_A,
                    int const *      idx_map_A,
                    char const *     B,
                    algstrct const * sr_B,
                    int              order_B,
                    int const *      edge_len_B,
                    int const *      sym_B,
                    int const *      idx_map_B,
                    char const *     beta,
                    char *           C,
                    algstrct const * sr_C,
                    int              order_C,
                    int const *      edge_len_C,
                    int const *      sym_C,
                    int const *      idx_map_C){
  int idx, i, idx_max, imin, imax, iA, iB, iC, j, k;
  int off_idx, sym_pass;
  int * idx_glb, * rev_idx_map;
  int * dlen_A, * dlen_B, * dlen_C;
  int64_t idx_A, idx_B, idx_C, off_lda;

  inv_idx(order_A, idx_map_A,
          order_B, idx_map_B,
          order_C, idx_map_C,
          &idx_max, &rev_idx_map);
  if (idx_max == 0){
    /* scalar contraction: no indices to iterate over */
    if (alpha == NULL && beta == NULL){
      sr_C->mul(A, B, C);
    } else if (alpha == NULL){
      char tmp[sr_C->el_size];
      sr_C->mul(A, B, tmp);
      sr_C->mul(C, beta, C);
      sr_C->add(tmp, C, C);
    } else {
      char tmp[sr_C->el_size];
      sr_C->mul(A, B, tmp);
      sr_C->mul(tmp, alpha, tmp);
      sr_C->mul(C, beta, C);
      sr_C->add(tmp, C, C);
    }
    return 0;
  }
  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
  dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
  memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);
  if (!sr_C->isequal(beta, sr_C->mulid())){
    int64_t sz = sy_packed_size(order_C, edge_len_C, sym_C);
    if (beta == NULL || sr_C->isequal(beta, sr_C->addid())){
      sr_C->set(C, sr_C->addid(), sz);
    } else {
      sr_C->scal(sz, beta, C, 1);
    }
  }
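  /* Note that the element count used here is the SY packed size, not the
   * product of edge lengths: for an order-2 SY group of edge n it is
   * n*(n+1)/2, since only the wedge i <= j is stored. */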
  uint64_t ** offsets_A;
  uint64_t ** offsets_B;
  uint64_t ** offsets_C;
  if (idx_max-1 <= MAX_ORD){ /* loop-nest depth fits the largest instantiated template */
    compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A, sr_B, order_B, edge_len_B, sym_B, idx_map_B, sr_C, order_C, edge_len_C, sym_C, idx_map_C, idx_max, rev_idx_map, offsets_A, offsets_B, offsets_C);

    if (order_C > 1 || (order_C > 0 && idx_map_C[0] != 0)){
#ifdef USE_OMP
      #pragma omp parallel
#endif
      {
        int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
        memset(idx_glb, 0, sizeof(int)*idx_max);

        SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, NULL, idx_glb, rev_idx_map, idx_max);
        cdealloc(idx_glb);
      }
    } else {
      idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
      memset(idx_glb, 0, sizeof(int)*idx_max);

      SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, NULL, idx_glb, rev_idx_map, idx_max);
      cdealloc(idx_glb);
    }
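    /* SWITCH_ORD_CALL maps the runtime depth idx_max-1 onto a compile-time
     * instantiation of sym_seq_ctr_loop (one case per order up to MAX_ORD),
     * so the compiler sees a fixed-depth loop nest it can optimize. The
     * parallel region is opened only for shapes of C where concurrent
     * iterations avoid writing the same output elements. */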
    for (int l=0; l<idx_max; l++){
      cdealloc(offsets_A[l]);
      cdealloc(offsets_B[l]);
      cdealloc(offsets_C[l]);
    }
    cdealloc(offsets_A);
    cdealloc(offsets_B);
    cdealloc(offsets_C);
  } else {
    /* general fallback: odometer iteration over all global indices */
    idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
    memset(idx_glb, 0, sizeof(int)*idx_max);
    idx_A = 0, idx_B = 0, idx_C = 0;
    sym_pass = 1;
    for (;;){
      if (sym_pass){
        char tmp[sr_C->el_size];
        sr_C->mul(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size, tmp);
        sr_C->mul(tmp, alpha, tmp);
        sr_C->add(tmp, C+idx_C*sr_C->el_size, C+idx_C*sr_C->el_size);
      }
      for (idx=0; idx<idx_max; idx++){
        imin = 0, imax = INT_MAX;
        GET_MIN_MAX(A,0,3);
        GET_MIN_MAX(B,1,3);
        GET_MIN_MAX(C,2,3);
        ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
        idx_glb[idx]++;
        if (idx_glb[idx] >= imax){
          idx_glb[idx] = imin;
        }
        if (idx_glb[idx] != imin) {
          break;
        }
      }
      if (idx == idx_max)
        break;
      CHECK_SYM(A);
      if (!sym_pass) continue;
      CHECK_SYM(B);
      if (!sym_pass) continue;
      CHECK_SYM(C);
      if (!sym_pass) continue;
      if (order_A > 0) RESET_IDX(A);
      if (order_B > 0) RESET_IDX(B);
      if (order_C > 0) RESET_IDX(C);
    }
    cdealloc(idx_glb);
  }
  cdealloc(dlen_A); cdealloc(dlen_B); cdealloc(dlen_C);
  return 0;
}
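/* A minimal sketch (standalone, illustrative only) of the odometer increment
 * the fallback loop performs, with fixed per-digit bounds in place of the
 * GET_MIN_MAX recomputation done above: */
#if 0
static bool next_index(int * idx_glb, int const * imin, int const * imax, int idx_max){
  for (int idx = 0; idx < idx_max; idx++){
    idx_glb[idx]++;
    if (idx_glb[idx] >= imax[idx])
      idx_glb[idx] = imin[idx];   // wrap this digit, carry to the next
    if (idx_glb[idx] != imin[idx])
      return true;                // no carry: a fresh index tuple is ready
  }
  return false;                   // carried past the last digit: iteration done
}
#endif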
int sym_seq_ctr_cust(char const *           alpha,
                     char const *           A,
                     algstrct const *       sr_A,
                     int                    order_A,
                     int const *            edge_len_A,
                     int const *            sym_A,
                     int const *            idx_map_A,
                     char const *           B,
                     algstrct const *       sr_B,
                     int                    order_B,
                     int const *            edge_len_B,
                     int const *            sym_B,
                     int const *            idx_map_B,
                     char const *           beta,
                     char *                 C,
                     algstrct const *       sr_C,
                     int                    order_C,
                     int const *            edge_len_C,
                     int const *            sym_C,
                     int const *            idx_map_C,
                     bivar_function const * func){
  int idx, i, idx_max, imin, imax, iA, iB, iC, j, k;
  int off_idx, sym_pass;
  int * idx_glb, * rev_idx_map;
  int * dlen_A, * dlen_B, * dlen_C;
  int64_t idx_A, idx_B, idx_C, off_lda;

  inv_idx(order_A, idx_map_A,
          order_B, idx_map_B,
          order_C, idx_map_C,
          &idx_max, &rev_idx_map);
  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
  dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
  memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);

  idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
  memset(idx_glb, 0, sizeof(int)*idx_max);
  if (!sr_C->isequal(beta, sr_C->mulid())){
    int64_t sz = sy_packed_size(order_C, edge_len_C, sym_C);
    if (beta == NULL || sr_C->isequal(beta, sr_C->addid())){
      sr_C->set(C, sr_C->addid(), sz);
    } else {
      sr_C->scal(sz, beta, C, 1);
    }
  }
  uint64_t ** offsets_A;
  uint64_t ** offsets_B;
  uint64_t ** offsets_C;
  if (idx_max-1 <= MAX_ORD){ /* loop-nest depth fits the largest instantiated template */
    compute_syoffs(sr_A, order_A, edge_len_A, sym_A, idx_map_A, sr_B, order_B, edge_len_B, sym_B, idx_map_B, sr_C, order_C, edge_len_C, sym_C, idx_map_C, idx_max, rev_idx_map, offsets_A, offsets_B, offsets_C);

    if (order_C > 1 || (order_C > 0 && idx_map_C[0] != 0)){
#ifdef USE_OMP
      #pragma omp parallel
#endif
      {
        int * idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
        memset(idx_glb, 0, sizeof(int)*idx_max);

        SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, func, idx_glb, rev_idx_map, idx_max);
        cdealloc(idx_glb);
      }
    } else {
      memset(idx_glb, 0, sizeof(int)*idx_max);

      SWITCH_ORD_CALL(sym_seq_ctr_loop, idx_max-1, alpha, A, sr_A, order_A, edge_len_A, sym_A, idx_map_A, offsets_A, B, sr_B, order_B, edge_len_B, sym_B, idx_map_B, offsets_B, beta, C, sr_C, order_C, edge_len_C, sym_C, idx_map_C, offsets_C, func, idx_glb, rev_idx_map, idx_max);
    }
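    /* Same dispatch as in sym_seq_ctr_ref, except func is passed through, so
     * the innermost specialization applies the user-defined bivariate
     * function (acc_f/apply_f) instead of the semiring multiply. */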
    for (int l=0; l<idx_max; l++){
      cdealloc(offsets_A[l]);
      cdealloc(offsets_B[l]);
      cdealloc(offsets_C[l]);
    }
    cdealloc(offsets_A);
    cdealloc(offsets_B);
    cdealloc(offsets_C);
  } else {
    idx_A = 0, idx_B = 0, idx_C = 0;
    sym_pass = 1;
    for (;;){
      if (sym_pass){
        if (alpha == NULL || sr_C->isequal(alpha, sr_C->mulid())){
          func->acc_f(A+idx_A*sr_A->el_size,
                      B+idx_B*sr_B->el_size,
                      C+idx_C*sr_C->el_size,
                      sr_C);
        } else {
          char tmp[sr_C->el_size];
          func->apply_f(A+idx_A*sr_A->el_size, B+idx_B*sr_B->el_size, tmp);
          sr_C->mul(tmp, alpha, tmp);
          sr_C->add(tmp, C+idx_C*sr_C->el_size, C+idx_C*sr_C->el_size);
        }
      }
      for (idx=0; idx<idx_max; idx++){
        imin = 0, imax = INT_MAX;
        GET_MIN_MAX(A,0,3);
        GET_MIN_MAX(B,1,3);
        GET_MIN_MAX(C,2,3);
        ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
        idx_glb[idx]++;
        if (idx_glb[idx] >= imax){
          idx_glb[idx] = imin;
        }
        if (idx_glb[idx] != imin) {
          break;
        }
      }
      if (idx == idx_max)
        break;
      CHECK_SYM(A);
      if (!sym_pass) continue;
      CHECK_SYM(B);
      if (!sym_pass) continue;
      CHECK_SYM(C);
      if (!sym_pass) continue;
      if (order_A > 0) RESET_IDX(A);
      if (order_B > 0) RESET_IDX(B);
      if (order_C > 0) RESET_IDX(C);
    }
  }
  cdealloc(dlen_A); cdealloc(dlen_B); cdealloc(dlen_C);
  cdealloc(idx_glb);
  return 0;
}
int sym_seq_ctr_inr(char const *           alpha,
                    char const *           A,
                    algstrct const *       sr_A,
                    int                    order_A,
                    int const *            edge_len_A,
                    int const *            sym_A,
                    int const *            idx_map_A,
                    char const *           B,
                    algstrct const *       sr_B,
                    int                    order_B,
                    int const *            edge_len_B,
                    int const *            sym_B,
                    int const *            idx_map_B,
                    char const *           beta,
                    char *                 C,
                    algstrct const *       sr_C,
                    int                    order_C,
                    int const *            edge_len_C,
                    int const *            sym_C,
                    int const *            idx_map_C,
                    iparam const *         prm,
                    bivar_function const * func){
  int idx, i, idx_max, imin, imax, iA, iB, iC, j, k;
  int off_idx, sym_pass, stride_A, stride_B, stride_C;
  int * idx_glb, * rev_idx_map;
  int * dlen_A, * dlen_B, * dlen_C;
  int64_t idx_A, idx_B, idx_C, off_lda;
  stride_A = prm->m*prm->k*prm->l;
  stride_B = prm->k*prm->n*prm->l;
  stride_C = prm->m*prm->n*prm->l;

  inv_idx(order_A, idx_map_A,
          order_B, idx_map_B,
          order_C, idx_map_C,
          &idx_max, &rev_idx_map);
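  /* Each mapped index advances an operand pointer by one whole inner block:
   * stride_A = m*k*l elements, stride_B = k*n*l, stride_C = m*n*l, i.e. the
   * volume of one l-fold batched gemm operand described by iparam. */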
  dlen_A = (int*)CTF_int::alloc(sizeof(int)*order_A);
  dlen_B = (int*)CTF_int::alloc(sizeof(int)*order_B);
  dlen_C = (int*)CTF_int::alloc(sizeof(int)*order_C);
  memcpy(dlen_A, edge_len_A, sizeof(int)*order_A);
  memcpy(dlen_B, edge_len_B, sizeof(int)*order_B);
  memcpy(dlen_C, edge_len_C, sizeof(int)*order_C);

  idx_glb = (int*)CTF_int::alloc(sizeof(int)*idx_max);
  memset(idx_glb, 0, sizeof(int)*idx_max);

  idx_A = 0, idx_B = 0, idx_C = 0;
  sym_pass = 1;
  for (;;){
    if (sym_pass){
      /* dispatch one blocked inner kernel per surviving index tuple; when C
         is stored transposed (prm->tC != 'n'), operands are swapped using
         C^T = (A*B)^T = B^T * A^T. Guards reconstructed around the preserved
         call arguments; iparam's offload flag assumed to select the device
         path. */
      if (prm->tC == 'n'){
        if (prm->offload){
          if (func == NULL){
            sr_C->offload_gemm(prm->tA, prm->tB, prm->m, prm->n, prm->k, alpha,
                               A+idx_A*stride_A*sr_A->el_size,
                               B+idx_B*stride_B*sr_B->el_size, beta,
                               C+idx_C*stride_C*sr_C->el_size);
          } else {
            func->coffload_gemm(prm->tA, prm->tB, prm->m, prm->n, prm->k,
                                A+idx_A*stride_A*sr_A->el_size,
                                B+idx_B*stride_B*sr_B->el_size,
                                C+idx_C*stride_C*sr_C->el_size);
          }
        } else {
          if (func == NULL){
            sr_C->gemm_batch(prm->tA, prm->tB, prm->l, prm->m, prm->n, prm->k, alpha,
                             A+idx_A*stride_A*sr_A->el_size,
                             B+idx_B*stride_B*sr_B->el_size, beta,
                             C+idx_C*stride_C*sr_C->el_size);
          } else {
            func->cgemm(prm->tA, prm->tB, prm->m, prm->n, prm->k,
                        A+idx_A*stride_A*sr_A->el_size,
                        B+idx_B*stride_B*sr_B->el_size,
                        C+idx_C*stride_C*sr_C->el_size);
          }
        }
      } else {
        if (prm->offload){
          if (func == NULL){
            sr_C->offload_gemm(prm->tB, prm->tA, prm->n, prm->m, prm->k, alpha,
                               B+idx_B*stride_B*sr_B->el_size,
                               A+idx_A*stride_A*sr_A->el_size, beta,
                               C+idx_C*stride_C*sr_C->el_size);
          } else {
            func->coffload_gemm(prm->tB, prm->tA, prm->n, prm->m, prm->k,
                                B+idx_B*stride_B*sr_B->el_size,
                                A+idx_A*stride_A*sr_A->el_size,
                                C+idx_C*stride_C*sr_C->el_size);
          }
        } else {
          if (func == NULL){
            sr_C->gemm_batch(prm->tB, prm->tA, prm->l, prm->n, prm->m, prm->k, alpha,
                             B+idx_B*stride_B*sr_B->el_size,
                             A+idx_A*stride_A*sr_A->el_size, beta,
                             C+idx_C*stride_C*sr_C->el_size);
          } else {
            func->cgemm(prm->tB, prm->tA, prm->n, prm->m, prm->k,
                        B+idx_B*stride_B*sr_B->el_size,
                        A+idx_A*stride_A*sr_A->el_size,
                        C+idx_C*stride_C*sr_C->el_size);
          }
        }
      }
      CTF_FLOPS_ADD((2 * (int64_t)prm->l * (int64_t)prm->n * (int64_t)prm->m * (int64_t)(prm->k+1)));
    }
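    /* The count above, 2*l*m*n*(k+1), is 2*l*m*n*k multiply-adds for the
     * batched gemm plus 2*l*m*n operations for accumulating into C. */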
    for (idx=0; idx<idx_max; idx++){
      imin = 0, imax = INT_MAX;
      GET_MIN_MAX(A,0,3);
      GET_MIN_MAX(B,1,3);
      GET_MIN_MAX(C,2,3);
      ASSERT(idx_glb[idx] >= imin && idx_glb[idx] < imax);
      idx_glb[idx]++;
      if (idx_glb[idx] >= imax){
        idx_glb[idx] = imin;
      }
      if (idx_glb[idx] != imin) {
        break;
      }
    }
    if (idx == idx_max)
      break;
    CHECK_SYM(A);
    if (!sym_pass) continue;
    CHECK_SYM(B);
    if (!sym_pass) continue;
    CHECK_SYM(C);
    if (!sym_pass) continue;
    if (order_A > 0) RESET_IDX(A);
    if (order_B > 0) RESET_IDX(B);
    if (order_C > 0) RESET_IDX(C);
  }
  cdealloc(dlen_A); cdealloc(dlen_B); cdealloc(dlen_C);
  cdealloc(idx_glb);
  return 0;
}
// ---- Reference: CTF_int symbols used above ----
//
// void compute_syoffs(algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, int tot_order, int const *rev_idx_map, uint64_t **&offsets_A, uint64_t **&offsets_B, uint64_t **&offsets_C)
//
// virtual bool isequal(char const *a, char const *b) const
//   returns true if algstrct elements a and b are equal
//
// void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
//   invert index map
//
// void * alloc(int64_t len)
//   alloc abstraction
//
// virtual char const * addid() const
//   identity element for addition, i.e. 0
//
// void gemm(char tA, char tB, int m, int n, int k, dtype alpha, dtype const *A, dtype const *B, dtype beta, dtype *C)
//
// class bivar_function
//   untyped internal class for triply-typed bivariate function
//
// #define GET_MIN_MAX(__X, nr, wd)
//
// virtual void gemm_batch(char tA, char tB, int l, int m, int n, int k, char const *alpha, char const *A, char const *B, char const *beta, char *C) const
//   beta*C["ijl"]=alpha*A^tA["ikl"]*B^tB["kjl"];
//
// virtual void set(char *a, char const *b, int64_t n) const
//   sets n elements of array a to value b
//
// virtual void coffload_gemm(char tA, char tB, int m, int n, int k, char const *A, char const *B, char *C) const
//
// #define SWITCH_ORD_CALL(F, act_ord, ...)
//
// int sym_seq_ctr_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, bivar_function const *func)
//   performs symmetric contraction with custom elementwise function
//
// int sym_seq_ctr_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C)
//   performs symmetric contraction with reference (unblocked) kernel
//
// virtual void scal(int n, char const *alpha, char *X, int incX) const
//   X["i"]=alpha*X["i"];
//
// void sym_seq_ctr_loop(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, uint64_t *const *offsets_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, uint64_t *const *offsets_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, uint64_t *const *offsets_C, bivar_function const *func, int const *idx, int const *rev_idx_map, int idx_max)
//
// int sym_seq_ctr_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, iparam const *prm, bivar_function const *func)
//   performs symmetric contraction with blocked gemm
//
// virtual void add(char const *a, char const *b, char *c) const
//   c = a+b
//
// int el_size
//   size of each element of algstrct in bytes
//
// int cdealloc(void *ptr)
//   free abstraction
//
// class algstrct
//   algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction
//
// void sym_seq_ctr_loop<0>(...)
//   explicit specialization of the loop template; same parameters as sym_seq_ctr_loop above
//
// void compute_syoff(int r, int len, algstrct const *sr, int const *edge_len, int const *sym, uint64_t *offsets)
//
// virtual void mul(char const *a, char const *b, char *c) const
//   c = a*b
//
// template void sym_seq_ctr_loop<MAX_ORD>(...)
//   explicit instantiation; same parameters as sym_seq_ctr_loop above
//
// virtual char const * mulid() const
//   identity element for multiplication, i.e. 1
//
// virtual void offload_gemm(char tA, char tB, int m, int n, int k, char const *alpha, char const *A, char const *B, char const *beta, char *C) const
//
// int64_t sy_packed_size(int order, const int *len, const int *sym)
//   computes the size of a tensor in SY (not hollow) packed symmetric layout
//
// virtual void cgemm(char tA, char tB, int m, int n, int k, char const *A, char const *B, char *C) const
//
// virtual void acc_f(char const *a, char const *b, char *c, CTF_int::algstrct const *sr_C) const = 0
//   compute c = c+f(a,b)