#include "../shared/util.h"
#include "../tensor/untyped_tensor.h"
  ...
  // spctr_replicate constructor (fragment): phys_mapped carries three flags per
  // physical topology dimension, one each for operands A, B and C
                   int const * phys_mapped,
  ...
  // first pass: count, per operand, the grid dimensions it is replicated over (bodies elided)
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[3*i+0] == 0 &&
        phys_mapped[3*i+1] == 0 &&
        phys_mapped[3*i+2] == 0){
      ...
    } else {
      if (phys_mapped[3*i+0] == 0){ ... }
      if (phys_mapped[3*i+1] == 0){ ... }
      if (phys_mapped[3*i+2] == 0){ ... }
    }
  }
  // second pass over the dimensions on which at least one operand is mapped (bodies elided)
  for (i=0; i<nphys_dim; i++){
    if (!(phys_mapped[3*i+0] == 0 &&
          phys_mapped[3*i+1] == 0 &&
          phys_mapped[3*i+2] == 0)){
      if (phys_mapped[3*i+0] == 0){ ... }
      if (phys_mapped[3*i+1] == 0){ ... }
      if (phys_mapped[3*i+2] == 0){ ... }
    }
  }
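For reference, a minimal standalone sketch of the counting pass above, under the assumption that the elided branch bodies simply tally how many processor-grid dimensions each operand is replicated over; the helper name count_replication_comms and the main driver are illustrative, not part of the original source.

  // Hypothetical standalone version of the counting pass; the increments are an
  // assumption about the elided bodies, not a copy of the original source.
  #include <cstdio>

  static void count_replication_comms(int nphys_dim, int const * phys_mapped,
                                      int & ncdt_A, int & ncdt_B, int & ncdt_C){
    ncdt_A = ncdt_B = ncdt_C = 0;
    for (int i = 0; i < nphys_dim; i++){
      bool a = phys_mapped[3*i+0] == 0;   // A has no index on dimension i
      bool b = phys_mapped[3*i+1] == 0;   // B has no index on dimension i
      bool c = phys_mapped[3*i+2] == 0;   // C has no index on dimension i
      if (a && b && c) continue;          // nothing mapped to this dimension at all
      if (a) ncdt_A++;                    // A must be broadcast along dimension i
      if (b) ncdt_B++;
      if (c) ncdt_C++;                    // C must be reduced along dimension i
    }
  }

  int main(){
    // 2-D processor grid: A unmapped on dim 0, B unmapped on dim 1, C mapped on both
    int phys_mapped[6] = {0, 1, 1,   1, 0, 1};
    int nA, nB, nC;
    count_replication_comms(2, phys_mapped, nA, nB, nC);
    std::printf("ncdt_A=%d ncdt_B=%d ncdt_C=%d\n", nA, nB, nC);  // prints 1 1 0
    return 0;
  }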
  // print routine: report each operand's replication communicators and their sizes
  printf("spctr_replicate: \n");
  printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
         cdt_A, size_A, ncdt_A);
  for (i=0; i<ncdt_A; i++){
    printf("cdt_A[%d] length = %d\n", i, cdt_A[i]->np);
  }
  printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
         cdt_B, size_B, ncdt_B);
  for (i=0; i<ncdt_B; i++){
    printf("cdt_B[%d] length = %d\n", i, cdt_B[i]->np);
  }
  printf("cdt_C = %p, size_C = %ld, ncdt_C = %d\n",
         cdt_C, size_C, ncdt_C);
  for (i=0; i<ncdt_C; i++){
    printf("cdt_C[%d] length = %d\n", i, cdt_C[i]->np);
  }
  int64_t mem_usage = 0;
  // spctr_replicate::run (signature as documented below): broadcast the
  // replicated operands, invoke the recursive kernel, then combine C
  void spctr_replicate::run(char * A, int nblk_A, int64_t const * size_blk_A,
                            char * B, int nblk_B, int64_t const * size_blk_B,
                            char * C, int nblk_C, int64_t * size_blk_C,
                            char *& new_C){
    int arank, brank, crank, i;
    ...
    arank = 0, brank = 0, crank = 0;
    ...
    // per-block size arrays for the replicated operands
    int64_t new_size_blk_A[nblk_A];
    int64_t new_size_blk_B[nblk_B];
    int64_t new_size_blk_C[nblk_C];
    ...
    // A: broadcast the block sizes over each communicator A is replicated on,
    // size and allocate the receive buffer, then broadcast the packed data
    memcpy(new_size_blk_A, size_blk_A, nblk_A*sizeof(int64_t));
    ...
    cdt_A[i]->bcast(new_size_blk_A, nblk_A, MPI_INT64_T, 0);
    ...
    int64_t new_size_A = 0;
    for (i=0; i<nblk_A; i++){
      new_size_A += new_size_blk_A[i];
    }
    ...
    buf_A = (char*)alloc(new_size_A);
    ...
    cdt_A[i]->bcast(buf_A, new_size_A, MPI_CHAR, 0);
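The fragment above follows a two-step broadcast: the root's per-block sizes are broadcast first, receivers sum them to size the packed payload, and only then is the payload itself broadcast. A rough plain-MPI equivalent over a single communicator; the helper name bcast_packed_blocks is an illustration, while the real kernel repeats the same steps over every communicator in cdt_A through the CommData bcast wrapper.

  // Hedged sketch: broadcast per-block sizes, then the packed data, over one
  // communicator (counts assumed to fit in int for this sketch).
  #include <mpi.h>
  #include <cstdint>
  #include <vector>

  std::vector<char> bcast_packed_blocks(std::vector<int64_t> & size_blk, // in/out block sizes
                                        std::vector<char>      local,    // root's packed data
                                        int root, MPI_Comm comm){
    // step 1: everyone learns the root's block sizes
    MPI_Bcast(size_blk.data(), (int)size_blk.size(), MPI_INT64_T, root, comm);

    // step 2: receivers size their buffer from the summed block sizes
    int64_t total = 0;
    for (int64_t s : size_blk) total += s;
    std::vector<char> buf(total);

    int rank; MPI_Comm_rank(comm, &rank);
    if (rank == root) buf = std::move(local);

    // step 3: broadcast the packed payload itself
    MPI_Bcast(buf.data(), (int)total, MPI_CHAR, root, comm);
    return buf;
  }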
    // B: same broadcast pattern as for A
    memcpy(new_size_blk_B, size_blk_B, nblk_B*sizeof(int64_t));
    ...
    cdt_B[i]->bcast(new_size_blk_B, nblk_B, MPI_INT64_T, 0);
    ...
    int64_t new_size_B = 0;
    for (i=0; i<nblk_B; i++){
      new_size_B += new_size_blk_B[i];
    }
    ...
    buf_B = (char*)alloc(new_size_B);
    ...
    cdt_B[i]->bcast(buf_B, new_size_B, MPI_CHAR, 0);
    // zero the array of new C block sizes
    memset(new_size_blk_C, 0, sizeof(int64_t)*nblk_C);
    ...
    // middle of the recursive contraction call's argument list: B via its broadcast buffer, C in place
        buf_B, nblk_B, new_size_blk_B,
        C,     nblk_C, size_blk_C,
    ...
    // combine the per-block C contributions: total the block sizes, allocate one
    // contiguous buffer, and pack the blocks into it
    int64_t csr_sz_acc = 0;
    int64_t new_csr_sz_acc = 0;
    char * new_Cs[nblk_C];
    for (int blk=0; blk<nblk_C; blk++){
      // elided body: produces the combined block new_Cs[blk] (presumably via
      // csr_reduce, documented below) and updates size_blk_C[blk] between the
      // two accumulations
      ...
      csr_sz_acc += size_blk_C[blk];
      ...
      new_csr_sz_acc += size_blk_C[blk];
    }
    ...
    alloc_ptr(new_csr_sz_acc, (void**)&new_C);
    ...
    // the running offset is evidently reset before this packing loop (that line
    // is elided in the listing)
    for (int blk=0; blk<nblk_C; blk++){
      memcpy(new_C+new_csr_sz_acc, new_Cs[blk], size_blk_C[blk]);
      ...
      new_csr_sz_acc += size_blk_C[blk];
    }
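The tail of run() above amounts to packing nblk_C separately produced blocks into one contiguous buffer: total the sizes, allocate once, then copy each block at a running offset. A compact, self-contained illustration of that pattern; the helper name pack_blocks and the use of malloc are assumptions, not the original routine, which goes through alloc_ptr.

  // Hypothetical packing helper mirroring the accumulate-then-copy pattern above.
  #include <cstdint>
  #include <cstring>
  #include <cstdlib>

  char * pack_blocks(char ** blocks, int64_t const * size_blk, int nblk){
    // first pass: total size of all blocks
    int64_t total = 0;
    for (int blk = 0; blk < nblk; blk++) total += size_blk[blk];

    char * packed = (char*)std::malloc(total);

    // second pass: copy each block at its running offset (the offset starts at
    // zero here, matching the reset implied between the two loops above)
    int64_t off = 0;
    for (int blk = 0; blk < nblk; blk++){
      std::memcpy(packed + off, blocks[blk], size_blk[blk]);
      off += size_blk[blk];
    }
    return packed;  // caller frees
  }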
virtual int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes needed by each processor in this kernel and its recursive calls ...
virtual int pair_size() const
gets the pair size: el_size plus the key size
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
reduce, same interface as MPI_Reduce, but excluding the comm
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time this kernel and its recursive calls are estimated to take ...
void * alloc(int64_t len)
alloc abstraction
virtual char const * addid() const
identity element for addition i.e. 0
void run(char *A, int nblk_A, int64_t const *size_blk_A, char *B, int nblk_B, int64_t const *size_blk_B, char *C, int nblk_C, int64_t *size_blk_C, char *&new_C)
spctr_replicate(spctr *other)
double estimate_red_time(int64_t msg_sz, MPI_Op op)
double estimate_csr_red_time(int64_t msg_sz, CommData const *cdt) const
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes of buffer space we need
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
class for executing a distributed contraction of tensors
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the execution time the local part of this kernel is estimated to take
abstraction for a serialized sparse matrix stored in compressed-sparse-row (CSR) layout ...
virtual char * csr_reduce(char *cA, int root, MPI_Comm cm) const
reduces CSR matrices stored in cA on each processor in cm and returns result on processor root ...
double estimate_bcast_time(int64_t msg_sz)
virtual MPI_Op addmop() const
MPI addition operation for reductions.
virtual void scal(int n, char const *alpha, char *X, int incX) const
X["i"]=alpha*X["i"];.
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm (a sketch of such a wrapper follows this list)
int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
returns the number of bytes needed by each processor in this kernel and its recursive calls ...
int el_size
size of each element of algstrct in bytes
int cdealloc(void *ptr)
free abstraction
topology * topo
topology to which the tensor is mapped
virtual char const * mulid() const
identity element for multiplication i.e. 1
void run(char *A, char *B, char *C)
virtual MPI_Datatype mdtype() const
MPI datatype.
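As a small illustration of the communicator wrappers referenced above, here is a hedged sketch of an object exposing the documented bcast and red signatures; only those two signatures and the np field come from this page, while the class name and everything else are illustrative assumptions.

  // Sketch of a communicator wrapper with the documented bcast/red signatures.
  #include <mpi.h>
  #include <cstdint>

  struct CommDataSketch {
    MPI_Comm cm;   // communicator held by the wrapper
    int      np;   // number of processes in cm (the "length" printed earlier)

    // broadcast, same interface as MPI_Bcast, but excluding the comm
    void bcast(void * buf, int64_t count, MPI_Datatype mdtype, int root){
      MPI_Bcast(buf, (int)count, mdtype, root, cm);
    }

    // reduce, same interface as MPI_Reduce, but excluding the comm
    void red(void * inbuf, void * outbuf, int64_t count,
             MPI_Datatype mdtype, MPI_Op op, int root){
      MPI_Reduce(inbuf, outbuf, (int)count, mdtype, op, root, cm);
    }
  };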