#include "../shared/util.h"
#include "../tensor/untyped_tensor.h"
...
                               int const *         phys_mapped,
...
    /* count the processor-grid dimensions along which each tensor is unmapped
       (phys_mapped[3*i+k] == 0 means tensor k has no index on physical dim i) */
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[3*i+0] == 0 &&
          phys_mapped[3*i+1] == 0 &&
          phys_mapped[3*i+2] == 0){
        ...
        if (phys_mapped[3*i+0] == 0){ ... }
        if (phys_mapped[3*i+1] == 0){ ... }
        if (phys_mapped[3*i+2] == 0){ ... }
    ...
    /* collect the communicators over which each tensor must be replicated */
    for (i=0; i<nphys_dim; i++){
      if (!(phys_mapped[3*i+0] == 0 &&
            phys_mapped[3*i+1] == 0 &&
            phys_mapped[3*i+2] == 0)){
        if (phys_mapped[3*i+0] == 0){ ... }
        if (phys_mapped[3*i+1] == 0){ ... }
        if (phys_mapped[3*i+2] == 0){ ... }
    ...
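The phys_mapped test above is easier to see in isolation. The sketch below is a standalone illustration, not the CTF routine; the function name count_replication_comms and the example grid are assumptions. It counts, for each tensor, the processor-grid dimensions the tensor is not mapped along, which is the number of replication communicators the constructor sets up.

    #include <cstdio>

    void count_replication_comms(int const * phys_mapped, int nphys_dim,
                                 int * ncdt_A, int * ncdt_B, int * ncdt_C){
      *ncdt_A = 0; *ncdt_B = 0; *ncdt_C = 0;
      for (int i=0; i<nphys_dim; i++){
        bool a = phys_mapped[3*i+0] == 0;   // A has no index on physical dim i
        bool b = phys_mapped[3*i+1] == 0;
        bool c = phys_mapped[3*i+2] == 0;
        if (a && b && c) continue;          // dimension unused by all three tensors
        if (a) ++*ncdt_A;                   // A must be broadcast along dim i
        if (b) ++*ncdt_B;
        if (c) ++*ncdt_C;
      }
    }

    int main(){
      // 2-D grid example: A unmapped along dim 1, C unmapped along dim 0
      int phys_mapped[6] = {1,1,0,  0,1,1};
      int nA, nB, nC;
      count_replication_comms(phys_mapped, 2, &nA, &nB, &nC);
      std::printf("ncdt_A=%d ncdt_B=%d ncdt_C=%d\n", nA, nB, nC);  // prints 1 0 1
      return 0;
    }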
    printf("spctr_replicate: \n");
    printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
           cdt_A, size_A, ncdt_A);
      printf("cdt_A[%d] length = %d\n", i, cdt_A[i]->np);
    printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
           cdt_B, size_B, ncdt_B);
      printf("cdt_B[%d] length = %d\n", i, cdt_B[i]->np);
    printf("cdt_C = %p, size_C = %ld, ncdt_C = %d\n",
           cdt_C, size_C, ncdt_C);
      printf("cdt_C[%d] length = %d\n", i, cdt_C[i]->np);
    int64_t mem_usage = 0;
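mem_usage above is the accumulator of the memory-footprint routine; the member list further down documents spmem_fp() as returning the bytes of buffer space needed. The following is only a hedged sketch of that kind of estimate (spmem_fp_sketch, its parameters, and the exact scaling are assumptions; the real CTF formula may differ): replicated sparse operands need a receive buffer roughly proportional to their nonzero fraction.

    #include <cstdint>

    int64_t spmem_fp_sketch(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C,
                            int64_t size_A, int64_t size_B, int64_t size_C,
                            int pair_size_A, int pair_size_B, int pair_size_C,
                            int ncdt_A, int ncdt_B, int ncdt_C){
      int64_t mem_usage = 0;
      if (ncdt_A > 0) mem_usage += (int64_t)(nnz_frac_A*size_A)*pair_size_A; // buf_A
      if (ncdt_B > 0) mem_usage += (int64_t)(nnz_frac_B*size_B)*pair_size_B; // buf_B
      if (ncdt_C > 0) mem_usage += (int64_t)(nnz_frac_C*size_C)*pair_size_C; // reduced C
      return mem_usage;
    }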
  void spctr_replicate::run(char * A, int nblk_A, int64_t const * size_blk_A,
                            char * B, int nblk_B, int64_t const * size_blk_B,
                            char * C, int nblk_C, int64_t *       size_blk_C,
                            char *&  new_C){
    int arank, brank, crank, i;
    ...
    arank = 0, brank = 0, crank = 0;
    int64_t new_size_blk_A[nblk_A];
    int64_t new_size_blk_B[nblk_B];
    int64_t new_size_blk_C[nblk_C];
    ...
      /* broadcast the block sizes of A along each replication communicator,
         then allocate a buffer for the full payload and broadcast it too */
      memcpy(new_size_blk_A, size_blk_A, nblk_A*sizeof(int64_t));
      ...
        cdt_A[i]->bcast(new_size_blk_A, nblk_A, MPI_INT64_T, 0);
      ...
      int64_t new_size_A = 0;
      for (i=0; i<nblk_A; i++){
        new_size_A += new_size_blk_A[i];
      ...
        buf_A = (char*)alloc(new_size_A);
      ...
        cdt_A[i]->bcast(buf_A, new_size_A, MPI_CHAR, 0);
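The A-broadcast above (and the analogous B-broadcast that follows) implements a simple pattern: broadcast the block sizes from the owner, sum them to size a receive buffer, then broadcast the serialized blocks. The sketch below restates that pattern with plain MPI instead of CTF's CommData wrappers; replicate_blocks and its single-communicator form are assumptions (the real code loops over each of the ncdt_A communicators).

    #include <mpi.h>
    #include <cstdint>
    #include <cstdlib>

    // Broadcast nblk serialized blocks from rank 0 of comm to all ranks.
    // Returns a pointer to the replicated data (the original pointer on rank 0).
    char * replicate_blocks(char * blks, int nblk, int64_t * size_blk,
                            MPI_Comm comm){
      int rank;
      MPI_Comm_rank(comm, &rank);
      // 1) every rank learns the block sizes held by the owner
      MPI_Bcast(size_blk, nblk, MPI_INT64_T, 0, comm);
      // 2) the total size determines the receive buffer
      int64_t tot = 0;
      for (int i=0; i<nblk; i++) tot += size_blk[i];
      char * buf = (rank == 0) ? blks : (char*)malloc(tot);
      // 3) broadcast the serialized payload itself
      MPI_Bcast(buf, (int)tot, MPI_CHAR, 0, comm);
      return buf;
    }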
      /* same replication pattern for B */
      memcpy(new_size_blk_B, size_blk_B, nblk_B*sizeof(int64_t));
      ...
        cdt_B[i]->bcast(new_size_blk_B, nblk_B, MPI_INT64_T, 0);
      ...
      int64_t new_size_B = 0;
      for (i=0; i<nblk_B; i++){
        new_size_B += new_size_blk_B[i];
      ...
        buf_B = (char*)alloc(new_size_B);
      ...
        cdt_B[i]->bcast(buf_B, new_size_B, MPI_CHAR, 0);
      ...
      memset(new_size_blk_C, 0, sizeof(int64_t)*nblk_C);
      /* remaining operands of the recursive contraction call */
                 buf_B, nblk_B, new_size_blk_B,
                     C, nblk_C,     size_blk_C,
      ...
        /* accumulate the sizes of the reduced CSR blocks of C */
        int64_t csr_sz_acc = 0;
        int64_t new_csr_sz_acc = 0;
        char * new_Cs[nblk_C];
        for (int blk=0; blk<nblk_C; blk++){
          ...
          csr_sz_acc += size_blk_C[blk];
          ...
          new_csr_sz_acc += size_blk_C[blk];
        ...
        alloc_ptr(new_csr_sz_acc, (void**)&new_C);
        ...
        /* pack the reduced blocks contiguously into new_C */
        new_csr_sz_acc = 0;
        for (int blk=0; blk<nblk_C; blk++){
          memcpy(new_C+new_csr_sz_acc, new_Cs[blk], size_blk_C[blk]);
          ...
          new_csr_sz_acc += size_blk_C[blk];
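After the recursive contraction, each reduced CSR block of C is copied back to back into one contiguous buffer, as the loop above does with new_C. A minimal standalone sketch of that packing step follows (pack_blocks is an assumed name, not a CTF function):

    #include <cstring>
    #include <cstdlib>
    #include <cstdint>

    char * pack_blocks(char * const * blocks, int64_t const * size_blk, int nblk){
      int64_t tot = 0;
      for (int b=0; b<nblk; b++) tot += size_blk[b];   // total packed size
      char * packed = (char*)malloc(tot);
      int64_t off = 0;
      for (int b=0; b<nblk; b++){
        memcpy(packed + off, blocks[b], size_blk[b]);  // append block b
        off += size_blk[b];
      }
      return packed;
    }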
 
virtual int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes needed by each processor in this kernel and its recursive calls ...
virtual int pair_size() const 
gets the pair size: el_size plus the key size 
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
reduce, same interface as MPI_Reduce, but excluding the comm 
virtual bool isequal(char const *a, char const *b) const 
returns true if algstrct elements a and b are equal 
double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
void * alloc(int64_t len)
alloc abstraction 
virtual char const * addid() const 
identity element for addition i.e. 0 
void run(char *A, int nblk_A, int64_t const *size_blk_A, char *B, int nblk_B, int64_t const *size_blk_B, char *C, int nblk_C, int64_t *size_blk_C, char *&new_C)
spctr_replicate(spctr *other)
double estimate_red_time(int64_t msg_sz, MPI_Op op)
double estimate_csr_red_time(int64_t msg_sz, CommData const *cdt) const 
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need 
virtual void set(char *a, char const *b, int64_t n) const 
sets n elements of array a to value b 
class for executing a distributed contraction of tensors 
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time the local part of this kernel is estimated to take 
abstraction for a serialized sparse matrix stored in compressed-sparse-row (CSR) layout ...
virtual char * csr_reduce(char *cA, int root, MPI_Comm cm) const 
reduces CSR matrices stored in cA on each processor in cm and returns the result on processor root ...
double estimate_bcast_time(int64_t msg_sz)
virtual MPI_Op addmop() const 
MPI addition operation for reductions. 
virtual void scal(int n, char const *alpha, char *X, int incX) const 
X["i"]=alpha*X["i"];. 
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm 
int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes needed by each processor in this kernel and its recursive calls ...
int el_size
size of each element of algstrct in bytes 
int cdealloc(void *ptr)
free abstraction 
topology * topo
topology to which the tensor is mapped 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
void run(char *A, char *B, char *C)
virtual MPI_Datatype mdtype() const 
MPI datatype.
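Several of the members listed above (bcast, red, np) belong to the communicator object that the broadcast calls in run() go through. Below is a hedged sketch of such a wrapper, assuming only what the descriptions state: bcast and red mirror MPI_Bcast and MPI_Reduce with the communicator stored in the object rather than passed in. CommDataSketch is an illustrative name, not CTF's CommData class.

    #include <mpi.h>
    #include <cstdint>

    struct CommDataSketch {
      MPI_Comm cm;
      int      np;    // number of processes in cm (the "length" in the prints above)
      int      rank;

      explicit CommDataSketch(MPI_Comm c) : cm(c) {
        MPI_Comm_size(cm, &np);
        MPI_Comm_rank(cm, &rank);
      }
      // broadcast, same interface as MPI_Bcast, but excluding the comm
      void bcast(void * buf, int64_t count, MPI_Datatype mdtype, int root){
        MPI_Bcast(buf, (int)count, mdtype, root, cm);
      }
      // reduce, same interface as MPI_Reduce, but excluding the comm
      void red(void * inbuf, void * outbuf, int64_t count,
               MPI_Datatype mdtype, MPI_Op op, int root){
        MPI_Reduce(inbuf, outbuf, (int)count, mdtype, op, root, cm);
      }
    };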