// excerpts from the implementation of spctr_2d_general
#include "../tensor/untyped_tensor.h"
#include "../mapping/mapping.h"
#include "../shared/util.h"

  // from the copying constructor spctr_2d_general(spctr * other):
    alloc_host_buf = o->alloc_host_buf;
  // from spctr_2d_general::print():
    printf("spctr_2d_general: edge_len = %d\n", edge_len);
    printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n",
           move_A, ctr_lda_A, ctr_sub_lda_A);
    printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n",
           move_B, ctr_lda_B, ctr_sub_lda_B);
    printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n",
           move_C, ctr_lda_C, ctr_sub_lda_C);
    if (alloc_host_buf)
      printf("alloc_host_buf is true\n");
    else
      printf("alloc_host_buf is false\n");
  // from spctr_2d_general::find_bsizes():
    b_A = 0, b_B = 0, b_C = 0;
  // from spctr_2d_general::est_time_fp():
    int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
    find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
    double est_bcast_time = 0.0;
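A minimal sketch, not part of the CTF source, of how a per-step broadcast-cost estimate can be accumulated from a message-size cost model like the documented estimate_bcast_time(int64_t msg_sz); the constants, byte counts, and weighting below are illustrative assumptions, not the formula actually used by est_time_fp.

#include <cstdint>

// Hypothetical alpha-beta (latency + bandwidth) cost model for one broadcast.
static double estimate_bcast_time_sketch(int64_t msg_sz){
  const double latency = 1.0e-6, byte_time = 1.0e-10;   // illustrative constants
  return latency + byte_time * (double)msg_sz;
}

// Accumulate the estimate over edge_len outer-loop steps, assuming one A panel and
// one B panel broadcast per step (panel sizes given in bytes).
static double estimate_panel_bcasts(int64_t bytes_A, int64_t bytes_B, int edge_len){
  double est_bcast_time = 0.0;
  est_bcast_time += estimate_bcast_time_sketch(bytes_A);
  est_bcast_time += estimate_bcast_time_sketch(bytes_B);
  return est_bcast_time * edge_len;
}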
  // from spctr_2d_general::spmem_fp():
    int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
    find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
    int64_t mem_usage = 0;
  char * bcast_step(int edge_len, char * A, bool is_sparse_A, bool move_A,
                    algstrct const * sr_A, int64_t b_A, int64_t s_A, char * buf_A,
                    CommData * cdt_A, int64_t ctr_sub_lda_A, int64_t ctr_lda_A,
                    int nblk_A, int64_t const * size_blk_A,
                    int & new_nblk_A, int64_t *& new_size_blk_A, int64_t * offsets_A,
                    int ib){
    // non-contiguous excerpt from the body of bcast_step:
    new_size_blk_A = (int64_t*)size_blk_A;
      new_nblk_A  = nblk_A/b_A;
      int owner_A = ib % cdt_A->np;
      if (cdt_A->rank == owner_A){
            int64_t * new_offsets_A;
            socopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*b_A, ctr_sub_lda_A,
                   size_blk_A+(ib/cdt_A->np)*ctr_sub_lda_A,
                   new_size_blk_A, new_offsets_A);
            int64_t bc_size_A = 0;
            for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
            spcopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*b_A, ctr_sub_lda_A,
                   size_blk_A+(ib/cdt_A->np)*ctr_sub_lda_A,
                   offsets_A+(ib/cdt_A->np)*ctr_sub_lda_A, A,
                   new_size_blk_A, new_offsets_A, op_A);
            sr_A->copy(ctr_sub_lda_A, ctr_lda_A,
                       A+sr_A->el_size*(ib/cdt_A->np)*ctr_sub_lda_A, ctr_sub_lda_A*b_A,
                       op_A, ctr_sub_lda_A);
        cdt_A->bcast(new_size_blk_A, new_nblk_A, MPI_INT64_T, owner_A);
        int64_t bc_size_A = 0;
        for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
        if (cdt_A->rank != owner_A){
        cdt_A->bcast(op_A, bc_size_A, MPI_CHAR, owner_A);
      if (ctr_sub_lda_A == 0)
            memcpy(new_size_blk_A, size_blk_A+ib*ctr_sub_lda_A, sizeof(int64_t)*new_nblk_A);
            int64_t * new_offsets_A;
            socopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*edge_len, ctr_sub_lda_A,
                   size_blk_A+ib*ctr_sub_lda_A,
                   new_size_blk_A, new_offsets_A);
            int64_t bc_size_A = 0;
            for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
            spcopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*edge_len, ctr_sub_lda_A,
                   size_blk_A+ib*ctr_sub_lda_A, offsets_A+ib*ctr_sub_lda_A, A,
                   new_size_blk_A, new_offsets_A, op_A);
            sr_A->copy(ctr_sub_lda_A, ctr_lda_A,
                       A+sr_A->el_size*ib*ctr_sub_lda_A, ctr_sub_lda_A*edge_len,
                       buf_A, ctr_sub_lda_A);
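A standalone sketch, written against raw MPI rather than CTF's CommData, of the two-phase broadcast pattern above for a sparse operand: the owner's per-block byte counts go out first, then the packed payload. All names here (bcast_packed_panel, blk_sizes, payload) are illustrative, not CTF API.

#include <mpi.h>
#include <vector>
#include <cstdint>
#include <numeric>

// Broadcast a packed, variable-size panel from `owner` to everyone in `comm`:
// phase 1 broadcasts the per-block byte counts, phase 2 broadcasts the bytes.
std::vector<char> bcast_packed_panel(std::vector<int64_t> & blk_sizes,  // valid on owner
                                     std::vector<char>      payload,    // valid on owner
                                     int owner, MPI_Comm comm){
  int rank;
  MPI_Comm_rank(comm, &rank);
  MPI_Bcast(blk_sizes.data(), (int)blk_sizes.size(), MPI_INT64_T, owner, comm);
  int64_t total = std::accumulate(blk_sizes.begin(), blk_sizes.end(), (int64_t)0);
  if (rank != owner) payload.assign(total, 0);                   // receivers size their buffer
  MPI_Bcast(payload.data(), (int)total, MPI_CHAR, owner, comm);  // assumes total fits in int
  return payload;
}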
  char * reduce_step_pre(int edge_len, char * C, bool is_sparse_C, bool move_C,
                         algstrct const * sr_C, int64_t b_C, int64_t s_C, char * buf_C,
                         CommData * cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C,
                         int nblk_C, int64_t const * size_blk_C,
                         int & new_nblk_C, int64_t *& new_size_blk_C, int64_t * offsets_C,
                         int ib, char const *& rec_beta){
    // non-contiguous excerpt from the body of reduce_step_pre:
    new_size_blk_C = (int64_t*)size_blk_C;
      rec_beta = sr_C->addid();
      new_nblk_C = nblk_C/b_C;
        memset(new_size_blk_C, 0, sizeof(int64_t)*new_nblk_C);
      if (ctr_sub_lda_C == 0){
            memcpy(new_size_blk_C, size_blk_C+ib*ctr_sub_lda_C, sizeof(int64_t)*new_nblk_C);
          rec_beta = sr_C->addid();
          memset(new_size_blk_C, 0, sizeof(int64_t)*new_nblk_C);
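A scalar analogue, not CTF code, of the beta handling above: when the C panel is staged in a temporary buffer, rec_beta is set to the additive identity (sr_C->addid()), so the recursive kernel accumulates into zeroed storage and the caller's beta is applied exactly once when reduce_step_post folds the reduced panel back into C.

// Illustrative scalar version of the two-stage beta handling.
double fold_back_sketch(double c_old, double contribution, double beta){
  double tmp = 0.0;           // rec_beta == addid(): recursion starts from the additive identity
  tmp += contribution;        // the recursive contraction adds its partial result
  return beta * c_old + tmp;  // beta is applied once, at the final fold-back into C
}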
  void reduce_step_post(int edge_len, char * C, bool is_sparse_C, bool move_C,
                        algstrct const * sr_C, int64_t b_C, int64_t s_C, char * buf_C,
                        CommData * cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C,
                        int nblk_C, int64_t * size_blk_C,
                        int & new_nblk_C, int64_t *& new_size_blk_C, int64_t * offsets_C,
                        int ib, char const *& rec_beta, char const * beta,
                        char *& up_C, char *& new_C,
                        int n_new_C_grps, int & i_new_C_grp, char ** new_C_grps){
    // non-contiguous excerpt from the body of reduce_step_post:
      MPI_Barrier(cdt_C->cm);
      int owner_C   = ib % cdt_C->np;
        int64_t csr_sz_acc = 0;
        int64_t new_csr_sz_acc = 0;
        char * new_Cs[new_nblk_C];
        for (int blk=0; blk<new_nblk_C; blk++){
          new_Cs[blk] = sr_C->csr_reduce(up_C+csr_sz_acc, owner_C, cdt_C->cm);
          csr_sz_acc += new_size_blk_C[blk];
          new_size_blk_C[blk] = cdt_C->rank == owner_C ? ((CSR_Matrix)(new_Cs[blk])).size() : 0;
          new_csr_sz_acc += new_size_blk_C[blk];
        if (cdt_C->rank == owner_C){
          if (n_new_C_grps == 1){
            alloc_ptr(new_csr_sz_acc, (void**)&up_C);
            ASSERT(nblk_C == new_nblk_C);
            for (int blk=0; blk<nblk_C; blk++){
              memcpy(up_C+new_csr_sz_acc, new_Cs[blk], new_size_blk_C[blk]);
              new_csr_sz_acc += new_size_blk_C[blk];
                size_blk_C[ctr_sub_lda_C*(k*n_new_C_grps+i_new_C_grp)+j] = new_size_blk_C[ctr_sub_lda_C*k+j];
            new_C_grps[i_new_C_grp] = new_Cs[0];
        if (cdt_C->rank == owner_C)
          cdt_C->red(MPI_IN_PLACE, up_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
        if (cdt_C->rank == owner_C){
          sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
                     up_C, ctr_sub_lda_C, sr_C->mulid(),
                     C+sr_C->el_size*(ib/cdt_C->np)*ctr_sub_lda_C,
                     ctr_sub_lda_C*b_C, beta);
      if (ctr_sub_lda_C != 0){
          new_C_grps[i_new_C_grp] = up_C;
              size_blk_C[ctr_sub_lda_C*(k*n_new_C_grps+i_new_C_grp)+j] = new_size_blk_C[ctr_sub_lda_C*k+j];
        } else if (ctr_lda_C != 1){
          sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
                     buf_C, ctr_sub_lda_C, sr_C->mulid(),
        rec_beta = sr_C->mulid();
          size_blk_C[0] = new_size_blk_C[0];
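A raw-MPI sketch, not CTF code, of the dense reduction path above: each step's partial C panel is reduced onto its owner rank, which then folds it into its local C using the caller's beta. The function and variable names are illustrative.

#include <mpi.h>
#include <vector>

// Reduce a dense partial panel onto `owner`, then fold it into C there with beta.
void reduce_panel_to_owner(std::vector<double> & panel, std::vector<double> & C,
                           double beta, int owner, MPI_Comm comm){
  int rank;
  MPI_Comm_rank(comm, &rank);
  if (rank == owner)
    MPI_Reduce(MPI_IN_PLACE, panel.data(), (int)panel.size(), MPI_DOUBLE, MPI_SUM, owner, comm);
  else
    MPI_Reduce(panel.data(), nullptr, (int)panel.size(), MPI_DOUBLE, MPI_SUM, owner, comm);
  if (rank == owner)
    for (size_t i = 0; i < panel.size(); i++)
      C[i] = beta * C[i] + panel[i];   // analogue of sr_C->copy(..., mulid(), ..., beta)
}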
  void spctr_2d_general::run(char * A, int nblk_A, int64_t const * size_blk_A,
                             char * B, int nblk_B, int64_t const * size_blk_B,
                             char * C, int nblk_C, int64_t * size_blk_C,
                             char *& new_C){
    // non-contiguous excerpt from the body of run():
    int ret, n_new_C_grps;
    char * buf_A, * buf_B, * buf_C, * buf_aux, * up_C;
    int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
    if (n_new_C_grps > 1)
      alloc_ptr(n_new_C_grps*sizeof(char*), (void**)&new_C_grps);
    int iidx_lyr, inum_lyr;
    find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
      for (int i=0; i<nblk_A; i++){
        if (i==0) offsets_A[0] = 0;
        else offsets_A[i] = offsets_A[i-1]+size_blk_A[i-1];
      for (int i=0; i<nblk_B; i++){
        if (i==0) offsets_B[0] = 0;
        else offsets_B[i] = offsets_B[i-1]+size_blk_B[i-1];
      for (int i=0; i<nblk_C; i++){
        if (i==0) offsets_C[0] = 0;
        else offsets_C[i] = offsets_C[i-1]+size_blk_C[i-1];
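The three loops above are exclusive prefix sums over the block byte counts; an equivalent sketch (illustrative, C++17 standard library, not CTF code):

#include <cstdint>
#include <numeric>
#include <vector>

// offsets[i] = sum of size_blk[0..i-1], i.e. the byte offset of block i.
std::vector<int64_t> block_offsets(std::vector<int64_t> const & size_blk){
  std::vector<int64_t> offsets(size_blk.size());
  std::exclusive_scan(size_blk.begin(), size_blk.end(), offsets.begin(), (int64_t)0);
  return offsets;
}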
    int64_t * new_size_blk_A;
    int new_nblk_A = nblk_A;
    int64_t * new_size_blk_B;
    int new_nblk_B = nblk_B;
    int64_t * new_size_blk_C;
    int new_nblk_C = nblk_C;
    for (ib=iidx_lyr; ib<edge_len; ib+=inum_lyr){
      op_A = bcast_step(edge_len, A, is_sparse_A, move_A, sr_A, b_A, s_A, buf_A, cdt_A,
                        ctr_sub_lda_A, ctr_lda_A, nblk_A, size_blk_A,
                        new_nblk_A, new_size_blk_A, offsets_A, ib);
      op_B = bcast_step(edge_len, B, is_sparse_B, move_B, sr_B, b_B, s_B, buf_B, cdt_B,
                        ctr_sub_lda_B, ctr_lda_B, nblk_B, size_blk_B,
                        new_nblk_B, new_size_blk_B, offsets_B, ib);
      op_C = reduce_step_pre(edge_len, new_C, is_sparse_C, move_C, sr_C, b_C, s_C, buf_C, cdt_C,
                             ctr_sub_lda_C, ctr_lda_C, nblk_C, size_blk_C,
                             new_nblk_C, new_size_blk_C, offsets_C, ib, rec_ctr->beta);
      rec_ctr->run(op_A, new_nblk_A, new_size_blk_A,
                   op_B, new_nblk_B, new_size_blk_B,
                   op_C, new_nblk_C, new_size_blk_C,
      reduce_step_post(edge_len, C, is_sparse_C, move_C, sr_C, b_C, s_C, buf_C, cdt_C,
                       ctr_sub_lda_C, ctr_lda_C, nblk_C, size_blk_C,
                       new_nblk_C, new_size_blk_C, offsets_C, ib,
                       rec_ctr->beta, this->beta, up_C, new_C,
                       n_new_C_grps, i_new_C_grp, new_C_grps);
      if (new_size_blk_A != size_blk_A)
      if (new_size_blk_B != size_blk_B)
      if (new_size_blk_C != size_blk_C)
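For orientation, a minimal dense sketch, not CTF code, of the SUMMA-style step this loop generalizes (compare the run() description in the member list below: panel broadcasts followed by a local rank-b update). The process-grid setup and names are illustrative assumptions.

#include <mpi.h>
#include <vector>

// One SUMMA step on an np x np process grid: the owners of step k's panels broadcast
// them along the row and column communicators, then each rank does a local rank-b update.
void summa_step_sketch(int k, int np,
                       std::vector<double> & A_panel,   // m x b, filled by the owner in row_comm
                       std::vector<double> & B_panel,   // b x n, filled by the owner in col_comm
                       std::vector<double> & C_local,   // m x n, accumulated locally
                       int m, int n, int b,
                       MPI_Comm row_comm, MPI_Comm col_comm){
  int owner = k % np;
  MPI_Bcast(A_panel.data(), m*b, MPI_DOUBLE, owner, row_comm);
  MPI_Bcast(B_panel.data(), b*n, MPI_DOUBLE, owner, col_comm);
  for (int i = 0; i < m; i++)
    for (int j = 0; j < n; j++)
      for (int l = 0; l < b; l++)
        C_local[i*n + j] += A_panel[i*b + l] * B_panel[l*n + j];
}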
    if (n_new_C_grps > 1){
      ASSERT(i_new_C_grp == n_new_C_grps);
      int64_t new_sz_C = 0;
      int64_t * new_offsets_C;
      int64_t * grp_offsets_C;
      int64_t * grp_sizes_C;
      for (int i=0; i<nblk_C; i++){
        new_offsets_C[i] = new_sz_C;
        new_sz_C += size_blk_C[i];
      for (int i=0; i<n_new_C_grps; i++){
        int64_t last_grp_offset = 0;
            grp_offsets_C[ctr_sub_lda_C*k+j] = last_grp_offset;
            grp_sizes_C[ctr_sub_lda_C*k+j] = size_blk_C[ctr_sub_lda_C*(i+n_new_C_grps*k)+j];
            last_grp_offset += grp_sizes_C[ctr_sub_lda_C*k+j];
        spcopy(ctr_sub_lda_C, ctr_lda_C, ctr_sub_lda_C, ctr_sub_lda_C*n_new_C_grps,
               grp_sizes_C, grp_offsets_C, new_C_grps[i],
               size_blk_C+i*ctr_sub_lda_C, new_offsets_C+i*ctr_sub_lda_C, new_C);
      char * new_Cs[nblk_C];
      int64_t org_offset = 0;
      int64_t cmp_offset = 0;
      int64_t new_offset = 0;
      for (int i=0; i<nblk_C; i++){
        new_Cs[i] = sr_C->csr_add(C+org_offset, new_C+cmp_offset);
        org_offset += ((CSR_Matrix)(C+org_offset)).size();
        cmp_offset += ((CSR_Matrix)(new_C+cmp_offset)).size();
      new_C = (char*)alloc(new_offset);
      for (int i=0; i<nblk_C; i++){
        size_blk_C[i] = ((CSR_Matrix)new_Cs[i]).size();
        memcpy(new_C+new_offset, new_Cs[i], size_blk_C[i]);
        new_offset += size_blk_C[i];
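The csr_add calls above merge each original block of C with the newly contracted block. A generic sketch, not the CTF csr_add implementation, of the core operation: merging two sorted sparse rows and summing entries that share a column index.

#include <utility>
#include <vector>

// Add two sparse rows given as sorted (column, value) pairs.
std::vector<std::pair<int,double>>
add_sparse_rows(std::vector<std::pair<int,double>> const & a,
                std::vector<std::pair<int,double>> const & b){
  std::vector<std::pair<int,double>> c;
  size_t i = 0, j = 0;
  while (i < a.size() || j < b.size()){
    if (j == b.size() || (i < a.size() && a[i].first < b[j].first))
      c.push_back(a[i++]);                                    // entry only in a
    else if (i == a.size() || b[j].first < a[i].first)
      c.push_back(b[j++]);                                    // entry only in b
    else {
      c.push_back({a[i].first, a[i].second + b[j].second});   // shared column: sum values
      i++; j++;
    }
  }
  return c;
}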
Referenced types and members:

virtual char * csr_add(char *cA, char *cB) const 
adds CSR matrices A (stored in cA) and B (stored in cB) to create matrix C (pointer to all_data retur...
virtual int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes needed by each processor in this kernel and its recursive calls ...
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the time this kernel will take, not including calls to rec_ctr 
virtual int pair_size() const 
gets pair size el_size plus the key size 
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
reduce, same interface as MPI_Reduce, but excluding the comm 
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
virtual void copy(char *a, char const *b) const 
copies element b to element a 
void host_pinned_alloc(void **ptr, int64_t size)
allocate a pinned host buffer 
void * alloc(int64_t len)
alloc abstraction 
double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the time this kernel will take including calls to rec_ctr 
virtual char const * addid() const 
identity element for addition, i.e. 0 
double estimate_red_time(int64_t msg_sz, MPI_Op op)
double estimate_csr_red_time(int64_t msg_sz, CommData const *cdt) const 
char * bcast_step(int edge_len, char *A, bool is_sparse_A, bool move_A, algstrct const *sr_A, int64_t b_A, int64_t s_A, char *buf_A, CommData *cdt_A, int64_t ctr_sub_lda_A, int64_t ctr_lda_A, int nblk_A, int64_t const *size_blk_A, int &new_nblk_A, int64_t *&new_size_blk_A, int64_t *offsets_A, int ib)
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction 
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need 
void host_pinned_free(void *ptr)
free a pinned host buffer 
int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need recursively 
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
CSR_Matrix
abstraction for a serialized sparse matrix stored in compressed-sparse-row (CSR) layout ...
virtual char * csr_reduce(char *cA, int root, MPI_Comm cm) const 
reduces CSR matrices stored in cA on each processor in cm and returns result on processor root ...
void run(char *A, int nblk_A, int64_t const *size_blk_A, char *B, int nblk_B, int64_t const *size_blk_B, char *C, int nblk_C, int64_t *size_blk_C, char *&new_C)
Basically doing SUMMA, except assumes equal block size on each processor. Performs rank-b updates whe...
double estimate_bcast_time(int64_t msg_sz)
virtual MPI_Op addmop() const 
MPI addition operation for reductions. 
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm 
void reduce_step_post(int edge_len, char *C, bool is_sparse_C, bool move_C, algstrct const *sr_C, int64_t b_C, int64_t s_C, char *buf_C, CommData *cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t *size_blk_C, int &new_nblk_C, int64_t *&new_size_blk_C, int64_t *offsets_C, int ib, char const *&rec_beta, char const *beta, char *&up_C, char *&new_C, int n_new_C_grps, int &i_new_C_grp, char **new_C_grps)
spctr_2d_general(spctr *other)
copies spctr object 
void socopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t *&sizes_b, int64_t *&offsets_b)
int el_size
size of each element of algstrct in bytes 
int cdealloc(void *ptr)
free abstraction 
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
void find_bsizes(int64_t &b_A, int64_t &b_B, int64_t &b_C, int64_t &s_A, int64_t &s_B, int64_t &s_C, int64_t &aux_size)
determines buffer and block sizes needed for spctr_2d_general 
char * reduce_step_pre(int edge_len, char *C, bool is_sparse_C, bool move_C, algstrct const *sr_C, int64_t b_C, int64_t s_C, char *buf_C, CommData *cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t const *size_blk_C, int &new_nblk_C, int64_t *&new_size_blk_C, int64_t *offsets_C, int ib, char const *&rec_beta)
void print()
print ctr object 
~spctr_2d_general()
deallocs spctr_2d_general object 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
void run(char *A, char *B, char *C)
virtual MPI_Datatype mdtype() const 
MPI datatype. 
void spcopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t const *offsets_a, char const *a, int64_t const *sizes_b, int64_t const *offsets_b, char *b)