4 #include "../tensor/untyped_tensor.h" 5 #include "../mapping/mapping.h" 6 #include "../shared/util.h" 40 alloc_host_buf = o->alloc_host_buf;
  // spctr_2d_general::print(): print ctr object
  printf("spctr_2d_general: edge_len = %d\n", edge_len);
  printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n",
         move_A, ctr_lda_A, ctr_sub_lda_A);
  printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n",
         move_B, ctr_lda_B, ctr_sub_lda_B);
  printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n",
         move_C, ctr_lda_C, ctr_sub_lda_C);
  if (alloc_host_buf)
    printf("alloc_host_buf is true\n");
  else
    printf("alloc_host_buf is false\n");
  // from find_bsizes(): block counts start at zero
  b_A = 0, b_B = 0, b_C = 0;
  // from est_time_fp(): compute block sizes, then estimate the broadcast cost
  int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
  find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
  double est_bcast_time = 0.0;
  // from spmem_fp(): compute block sizes, then accumulate buffer memory usage
  int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
  find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
  int64_t mem_usage = 0;
char * bcast_step(int              edge_len,
                  char *           A,
                  bool             is_sparse_A,
                  bool             move_A,
                  algstrct const * sr_A,
                  int64_t          b_A,
                  int64_t          s_A,
                  char *           buf_A,
                  CommData *       cdt_A,
                  int64_t          ctr_sub_lda_A,
                  int64_t          ctr_lda_A,
                  int              nblk_A,
                  int64_t const *  size_blk_A,
                  int &            new_nblk_A,
                  int64_t *&       new_size_blk_A,
                  int64_t *        offsets_A,
                  int              ib){
  new_size_blk_A = (int64_t*)size_blk_A;
  // ...
  // when A must be communicated (move_A), iteration ib is owned by rank ib % np
  new_nblk_A = nblk_A/b_A;
  int owner_A = ib % cdt_A->np;
  if (cdt_A->rank == owner_A){
    // ... sparse case: pack the owner's sub-panel of block sizes/offsets and data
    int64_t * new_offsets_A;
    socopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*b_A, ctr_sub_lda_A,
           size_blk_A+(ib/cdt_A->np)*ctr_sub_lda_A,
           new_size_blk_A, new_offsets_A);
    // ...
    int64_t bc_size_A = 0;
    for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
    // ...
    spcopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*b_A, ctr_sub_lda_A,
           size_blk_A+(ib/cdt_A->np)*ctr_sub_lda_A,
           offsets_A+(ib/cdt_A->np)*ctr_sub_lda_A, A,
           new_size_blk_A, new_offsets_A, op_A);
    // ... dense case: strided copy of the owner's sub-panel into the send buffer
    sr_A->copy(ctr_sub_lda_A, ctr_lda_A,
               A+sr_A->el_size*(ib/cdt_A->np)*ctr_sub_lda_A, ctr_sub_lda_A*b_A,
               op_A, ctr_sub_lda_A);
    // ...
  }
  // ... broadcast the packed block sizes first, then the packed block data
  cdt_A->bcast(new_size_blk_A, new_nblk_A, MPI_INT64_T, owner_A);
  int64_t bc_size_A = 0;
  for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
  // ...
  if (cdt_A->rank != owner_A){
    // ...
  }
  // ...
  cdt_A->bcast(op_A, bc_size_A, MPI_CHAR, owner_A);
  // ... when A is not communicated, select or repack the local sub-panel
  if (ctr_sub_lda_A == 0)
    /* ... */;
  // ...
  memcpy(new_size_blk_A, size_blk_A+ib*ctr_sub_lda_A,
         sizeof(int64_t)*new_nblk_A);
  // ...
  int64_t * new_offsets_A;
  socopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*edge_len, ctr_sub_lda_A,
         size_blk_A+ib*ctr_sub_lda_A,
         new_size_blk_A, new_offsets_A);
  int64_t bc_size_A = 0;
  for (int z=0; z<new_nblk_A; z++) bc_size_A += new_size_blk_A[z];
  // ...
  spcopy(ctr_sub_lda_A, ctr_lda_A, ctr_sub_lda_A*edge_len, ctr_sub_lda_A,
         size_blk_A+ib*ctr_sub_lda_A, offsets_A+ib*ctr_sub_lda_A, A,
         new_size_blk_A, new_offsets_A, op_A);
  // ...
  sr_A->copy(ctr_sub_lda_A, ctr_lda_A,
             A+sr_A->el_size*ib*ctr_sub_lda_A, ctr_sub_lda_A*edge_len,
             buf_A, ctr_sub_lda_A);
  // ...
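/* Illustrative sketch (not part of the original file): the communication
 * pattern bcast_step uses when move_A is set, reduced to plain MPI. The owner
 * of iteration ib is rank ib % np on the grid communicator; it broadcasts the
 * per-block sizes first, then the packed panel bytes. All names here
 * (panel_bcast, my_blocks, ...) are hypothetical. */
#include <mpi.h>
#include <vector>
#include <cstdint>
#include <numeric>

// Broadcast the packed panel owned by rank (ib % np) to all ranks of comm.
// Non-owners size their receive buffer from the broadcast block sizes.
std::vector<char> panel_bcast(int ib, MPI_Comm comm,
                              std::vector<int64_t> & blk_sizes,
                              std::vector<char> const & my_blocks){
  int rank, np;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &np);
  int owner = ib % np;
  // step 1: broadcast block sizes (analogous to the bcast of new_size_blk_A)
  MPI_Bcast(blk_sizes.data(), (int)blk_sizes.size(), MPI_INT64_T, owner, comm);
  int64_t total = std::accumulate(blk_sizes.begin(), blk_sizes.end(), (int64_t)0);
  // step 2: broadcast the packed bytes (analogous to the bcast of op_A)
  std::vector<char> panel = (rank == owner) ? my_blocks : std::vector<char>(total);
  MPI_Bcast(panel.data(), (int)total, MPI_CHAR, owner, comm);
  return panel;
}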
char * reduce_step_pre(int              edge_len,
                       char *           C,
                       bool             is_sparse_C,
                       bool             move_C,
                       algstrct const * sr_C,
                       int64_t          b_C,
                       int64_t          s_C,
                       char *           buf_C,
                       CommData *       cdt_C,
                       int64_t          ctr_sub_lda_C,
                       int64_t          ctr_lda_C,
                       int              nblk_C,
                       int64_t const *  size_blk_C,
                       int &            new_nblk_C,
                       int64_t *&       new_size_blk_C,
                       int64_t *        offsets_C,
                       int              ib,
                       char const *&    rec_beta){
  new_size_blk_C = (int64_t*)size_blk_C;
  // ...
  // when the panel of C will be reduced afterwards, the recursive contraction
  // writes into a fresh buffer, so its beta is the additive identity
  rec_beta = sr_C->addid();
  new_nblk_C = nblk_C/b_C;
  // ...
  memset(new_size_blk_C, 0, sizeof(int64_t)*new_nblk_C);
  // ...
  if (ctr_sub_lda_C == 0){
    // ...
  // ...
  memcpy(new_size_blk_C, size_blk_C+ib*ctr_sub_lda_C,
         sizeof(int64_t)*new_nblk_C);
  // ...
  rec_beta = sr_C->addid();
  // ...
  memset(new_size_blk_C, 0, sizeof(int64_t)*new_nblk_C);
  // ...
void reduce_step_post(int              edge_len,
                      char *           C,
                      bool             is_sparse_C,
                      bool             move_C,
                      algstrct const * sr_C,
                      int64_t          b_C,
                      int64_t          s_C,
                      char *           buf_C,
                      CommData *       cdt_C,
                      int64_t          ctr_sub_lda_C,
                      int64_t          ctr_lda_C,
                      int              nblk_C,
                      int64_t *        size_blk_C,
                      int &            new_nblk_C,
                      int64_t *&       new_size_blk_C,
                      int64_t *        offsets_C,
                      int              ib,
                      char const *&    rec_beta,
                      char const *     beta,
                      char *&          up_C,
                      char *&          new_C,
                      int              n_new_C_grps,
                      int &            i_new_C_grp,
                      char **          new_C_grps){
  // ...
  MPI_Barrier(cdt_C->cm);
  // ...
  int owner_C = ib % cdt_C->np;
  // ...
  // sparse case: reduce each serialized CSR block onto the owner of iteration ib
  int64_t csr_sz_acc = 0;
  int64_t new_csr_sz_acc = 0;
  char * new_Cs[new_nblk_C];
  for (int blk=0; blk<new_nblk_C; blk++){
    new_Cs[blk] = sr_C->csr_reduce(up_C+csr_sz_acc, owner_C, cdt_C->cm);
    // ...
    csr_sz_acc += new_size_blk_C[blk];
    new_size_blk_C[blk] = cdt_C->rank == owner_C ? ((CSR_Matrix)(new_Cs[blk])).size() : 0;
    new_csr_sz_acc += new_size_blk_C[blk];
  }
  // ...
  if (cdt_C->rank == owner_C){
    if (n_new_C_grps == 1){
      // pack the reduced CSR blocks contiguously into up_C
      alloc_ptr(new_csr_sz_acc, (void**)&up_C);
      // ...
      ASSERT(nblk_C == new_nblk_C);
      for (int blk=0; blk<nblk_C; blk++){
        memcpy(up_C+new_csr_sz_acc, new_Cs[blk], new_size_blk_C[blk]);
        // ...
        new_csr_sz_acc += new_size_blk_C[blk];
      }
      // ...
    } else {
      // record the reduced group and its block sizes for later regrouping
      // ... (loops over k and j elided)
      size_blk_C[ctr_sub_lda_C*(k*n_new_C_grps+i_new_C_grp)+j] = new_size_blk_C[ctr_sub_lda_C*k+j];
      // ...
      new_C_grps[i_new_C_grp] = new_Cs[0];
      // ...
    }
  }
  // ...
  // dense case: reduce the panel onto the owner, then accumulate it into C with beta
  if (cdt_C->rank == owner_C)
    cdt_C->red(MPI_IN_PLACE, up_C, s_C, sr_C->mdtype(), sr_C->addmop(), owner_C);
  // ...
  if (cdt_C->rank == owner_C){
    sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
               up_C, ctr_sub_lda_C, sr_C->mulid(),
               C+sr_C->el_size*(ib/cdt_C->np)*ctr_sub_lda_C,
               ctr_sub_lda_C*b_C, beta);
  }
  // ...
  if (ctr_sub_lda_C != 0){
    // ...
    new_C_grps[i_new_C_grp] = up_C;
    // ... (loops over k and j elided)
    size_blk_C[ctr_sub_lda_C*(k*n_new_C_grps+i_new_C_grp)+j] = new_size_blk_C[ctr_sub_lda_C*k+j];
    // ...
  } else if (ctr_lda_C != 1){
    sr_C->copy(ctr_sub_lda_C, ctr_lda_C,
               buf_C, ctr_sub_lda_C, sr_C->mulid(),
               /* ... */);
    // ...
    rec_beta = sr_C->mulid();
    // ...
    size_blk_C[0] = new_size_blk_C[0];
  }
  // ...
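/* Illustrative sketch (not part of the original file): the dense reduction
 * pattern reduce_step_post uses, in plain MPI on doubles. Every rank holds a
 * partial panel; it is summed onto the owner (ib % np), which then accumulates
 * it into its slice of C as C = beta*C + panel. All names are hypothetical. */
#include <mpi.h>
#include <vector>

void panel_reduce_accumulate(int ib, MPI_Comm comm,
                             std::vector<double> & panel,   // partial result, same length on all ranks
                             std::vector<double> & C_slice, // owner's destination slice
                             double beta){
  int rank, np;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &np);
  int owner = ib % np;
  // root reduces in place (analogous to cdt_C->red(MPI_IN_PLACE, up_C, ...))
  if (rank == owner)
    MPI_Reduce(MPI_IN_PLACE, panel.data(), (int)panel.size(), MPI_DOUBLE, MPI_SUM, owner, comm);
  else
    MPI_Reduce(panel.data(), nullptr, (int)panel.size(), MPI_DOUBLE, MPI_SUM, owner, comm);
  // owner scales its existing slice by beta and adds the reduced panel
  // (analogous to sr_C->copy(..., mulid(), C + offset, ..., beta))
  if (rank == owner)
    for (size_t i = 0; i < C_slice.size() && i < panel.size(); i++)
      C_slice[i] = beta * C_slice[i] + panel[i];
}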
// the spctr_2d_general contraction kernel: SUMMA-style over a processor grid
void spctr_2d_general::run(char *          A,
                           int             nblk_A,
                           int64_t const * size_blk_A,
                           char *          B,
                           int             nblk_B,
                           int64_t const * size_blk_B,
                           char *          C,
                           int             nblk_C,
                           int64_t *       size_blk_C,
                           char *&         new_C){
  int ret, n_new_C_grps;
  // ...
  char * buf_A, * buf_B, * buf_C, * buf_aux, * up_C;
  // ...
  int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
  // ...
  if (n_new_C_grps > 1)
    alloc_ptr(n_new_C_grps*sizeof(char*), (void**)&new_C_grps);
  // ...
  int iidx_lyr, inum_lyr;
  // ...
  find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
  // compute byte offsets of the blocks of A, B, and C by prefix-summing block sizes
  for (int i=0; i<nblk_A; i++){
    if (i==0) offsets_A[0] = 0;
    else      offsets_A[i] = offsets_A[i-1]+size_blk_A[i-1];
  }
  // ...
  for (int i=0; i<nblk_B; i++){
    if (i==0) offsets_B[0] = 0;
    else      offsets_B[i] = offsets_B[i-1]+size_blk_B[i-1];
  }
  // ...
  for (int i=0; i<nblk_C; i++){
    if (i==0) offsets_C[0] = 0;
    else      offsets_C[i] = offsets_C[i-1]+size_blk_C[i-1];
  }
  // ...
  int64_t * new_size_blk_A;
  int       new_nblk_A = nblk_A;
  int64_t * new_size_blk_B;
  int       new_nblk_B = nblk_B;
  int64_t * new_size_blk_C;
  int       new_nblk_C = nblk_C;
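/* Illustrative sketch (not part of the original file): the offsets_A/B/C
 * arrays computed above are exclusive prefix sums of the serialized block
 * sizes, i.e. offsets[i] is the byte position where block i starts. */
#include <vector>
#include <cstdint>

std::vector<int64_t> block_offsets(std::vector<int64_t> const & size_blk){
  std::vector<int64_t> offsets(size_blk.size());
  int64_t acc = 0;
  for (size_t i = 0; i < size_blk.size(); i++){
    offsets[i] = acc;        // start of block i
    acc       += size_blk[i];
  }
  return offsets;
}
// e.g. sizes {40, 0, 16} yield offsets {0, 40, 40}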
  // main loop over the contracted dimension: broadcast a panel of A and of B,
  // contract them recursively, then reduce the resulting panel of C
  for (ib=iidx_lyr; ib<edge_len; ib+=inum_lyr){
    op_A = bcast_step(edge_len, A, is_sparse_A, move_A, sr_A, b_A, s_A, buf_A,
                      cdt_A, ctr_sub_lda_A, ctr_lda_A, nblk_A, size_blk_A,
                      new_nblk_A, new_size_blk_A, offsets_A, ib);
    op_B = bcast_step(edge_len, B, is_sparse_B, move_B, sr_B, b_B, s_B, buf_B,
                      cdt_B, ctr_sub_lda_B, ctr_lda_B, nblk_B, size_blk_B,
                      new_nblk_B, new_size_blk_B, offsets_B, ib);
    op_C = reduce_step_pre(edge_len, new_C, is_sparse_C, move_C, sr_C, b_C, s_C,
                           buf_C, cdt_C, ctr_sub_lda_C, ctr_lda_C, nblk_C,
                           size_blk_C, new_nblk_C, new_size_blk_C, offsets_C, ib,
                           rec_ctr->beta);
    // ...
    rec_ctr->run(op_A, new_nblk_A, new_size_blk_A,
                 op_B, new_nblk_B, new_size_blk_B,
                 op_C, new_nblk_C, new_size_blk_C,
                 /* ... */);
    // ...
    reduce_step_post(edge_len, C, is_sparse_C, move_C, sr_C, b_C, s_C, buf_C,
                     cdt_C, ctr_sub_lda_C, ctr_lda_C, nblk_C, size_blk_C,
                     new_nblk_C, new_size_blk_C, offsets_C, ib,
                     rec_ctr->beta, this->beta, up_C, new_C,
                     n_new_C_grps, i_new_C_grp, new_C_grps);
    // ...
    // release per-iteration size arrays if the steps above allocated new ones
    if (new_size_blk_A != size_blk_A)
      /* ... */;
    if (new_size_blk_B != size_blk_B)
      /* ... */;
    // ...
    if (new_size_blk_C != size_blk_C)
      /* ... */;
    // ...
  }
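/* Illustrative sketch (not part of the original file): the loop above realizes
 * the SUMMA-style sequence of rank-b updates described for run() in the
 * documentation below. Serially, contracting over a dimension of length
 * edge_len in panels of width b looks like this (dense, column-major; the
 * helper name rank_b_updates is hypothetical). */
#include <vector>
#include <cstdint>
#include <algorithm>

// C (m x n) += sum over panels of A_panel (m x b) * B_panel (b x n)
void rank_b_updates(int m, int n, int edge_len, int b,
                    std::vector<double> const & A,   // m x edge_len, column-major
                    std::vector<double> const & B,   // edge_len x n, column-major
                    std::vector<double> & C){        // m x n, column-major
  for (int ib = 0; ib < edge_len; ib += b){
    int w = std::min(b, edge_len - ib);              // last panel may be narrower
    for (int j = 0; j < n; j++)
      for (int k = 0; k < w; k++)
        for (int i = 0; i < m; i++)
          C[i + (int64_t)j*m] += A[i + (int64_t)(ib+k)*m] * B[(ib+k) + (int64_t)j*edge_len];
  }
}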
  // if the contracted panels of C were produced as multiple groups, interleave
  // them back into a single buffer in the original block order
  if (n_new_C_grps > 1){
    ASSERT(i_new_C_grp == n_new_C_grps);
    int64_t new_sz_C = 0;
    int64_t * new_offsets_C;
    int64_t * grp_offsets_C;
    int64_t * grp_sizes_C;
    // ...
    for (int i=0; i<nblk_C; i++){
      new_offsets_C[i] = new_sz_C;
      new_sz_C += size_blk_C[i];
    }
    // ...
    for (int i=0; i<n_new_C_grps; i++){
      int64_t last_grp_offset = 0;
      // ... (inner loops over k and j elided)
      grp_offsets_C[ctr_sub_lda_C*k+j] = last_grp_offset;
      grp_sizes_C[ctr_sub_lda_C*k+j]   = size_blk_C[ctr_sub_lda_C*(i+n_new_C_grps*k)+j];
      last_grp_offset += grp_sizes_C[ctr_sub_lda_C*k+j];
      // ...
      spcopy(ctr_sub_lda_C, ctr_lda_C, ctr_sub_lda_C, ctr_sub_lda_C*n_new_C_grps,
             grp_sizes_C, grp_offsets_C, new_C_grps[i],
             size_blk_C+i*ctr_sub_lda_C, new_offsets_C+i*ctr_sub_lda_C, new_C);
      // ...
    }
  // ...
  // sparse final accumulation: csr_add each block of the newly computed C into
  // the corresponding existing block of C, then repack the result contiguously
  char * new_Cs[nblk_C];
  int64_t org_offset = 0;
  int64_t cmp_offset = 0;
  int64_t new_offset = 0;
  for (int i=0; i<nblk_C; i++){
    new_Cs[i] = sr_C->csr_add(C+org_offset, new_C+cmp_offset);
    // ...
    org_offset += ((CSR_Matrix)(C+org_offset)).size();
    cmp_offset += ((CSR_Matrix)(new_C+cmp_offset)).size();
    // ...
  }
  // ...
  new_C = (char*)alloc(new_offset);
  // ...
  for (int i=0; i<nblk_C; i++){
    size_blk_C[i] = ((CSR_Matrix)new_Cs[i]).size();
    memcpy(new_C+new_offset, new_Cs[i], size_blk_C[i]);
    new_offset += size_blk_C[i];
  }
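/* Illustrative sketch (not part of the original file): the final loop above
 * repacks per-block serialized results contiguously, recording each block's
 * size and advancing a running byte offset. Generic version with hypothetical
 * names (blocks here are arbitrary serialized byte buffers, not CTF CSR_Matrix
 * objects). */
#include <vector>
#include <cstring>
#include <cstdint>

std::vector<char> pack_blocks(std::vector<std::vector<char>> const & blocks,
                              std::vector<int64_t> & size_blk){
  int64_t total = 0;
  for (auto const & blk : blocks) total += (int64_t)blk.size();
  std::vector<char> packed(total);
  int64_t off = 0;
  size_blk.resize(blocks.size());
  for (size_t i = 0; i < blocks.size(); i++){
    size_blk[i] = (int64_t)blocks[i].size();                 // record block size
    if (size_blk[i] > 0)
      std::memcpy(packed.data() + off, blocks[i].data(), blocks[i].size());
    off += size_blk[i];                                      // advance running offset
  }
  return packed;
}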
Referenced symbols:

virtual char * csr_add(char *cA, char *cB) const
  adds CSR matrices A (stored in cA) and B (stored in cB) to create matrix C (pointer to all_data retur...
virtual int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
  returns the number of bytes needed by each processor in this kernel and its recursive calls
double est_time_fp(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
  returns the time this kernel will take including calls to rec_ctr
virtual int pair_size() const
  gets pair size: el_size plus the key size
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
  reduce, same interface as MPI_Reduce, but excluding the comm
virtual double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
  returns the execution time this kernel and its recursive calls are estimated to take
virtual void copy(char *a, char const *b) const
  copies element b to element a
void host_pinned_alloc(void **ptr, int64_t size)
  allocate a pinned host buffer
void * alloc(int64_t len)
  alloc abstraction
double est_time_rec(int nlyr, double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
  returns the time this kernel will take including calls to rec_ctr
virtual char const * addid() const
  identity element for addition i.e. 0
double estimate_red_time(int64_t msg_sz, MPI_Op op)
double estimate_csr_red_time(int64_t msg_sz, CommData const *cdt) const
char * bcast_step(int edge_len, char *A, bool is_sparse_A, bool move_A, algstrct const *sr_A, int64_t b_A, int64_t s_A, char *buf_A, CommData *cdt_A, int64_t ctr_sub_lda_A, int64_t ctr_lda_A, int nblk_A, int64_t const *size_blk_A, int &new_nblk_A, int64_t *&new_size_blk_A, int64_t *offsets_A, int ib)
int mst_alloc_ptr(int64_t len, void **const ptr)
  mst_alloc abstraction
int64_t spmem_fp(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
  returns the number of bytes of buffer space we need
void host_pinned_free(void *ptr)
  free a pinned host buffer
int64_t spmem_rec(double nnz_frac_A, double nnz_frac_B, double nnz_frac_C)
  returns the number of bytes of buffer space we need recursively
int alloc_ptr(int64_t len, void **const ptr)
  alloc abstraction
CSR_Matrix
  abstraction for a serialized sparse matrix stored in column-sparse-row (CSR) layout
virtual char * csr_reduce(char *cA, int root, MPI_Comm cm) const
  reduces CSR matrices stored in cA on each processor in cm and returns the result on processor root
void run(char *A, int nblk_A, int64_t const *size_blk_A, char *B, int nblk_B, int64_t const *size_blk_B, char *C, int nblk_C, int64_t *size_blk_C, char *&new_C)
  Basically doing SUMMA, except assumes equal block size on each processor. Performs rank-b updates whe...
double estimate_bcast_time(int64_t msg_sz)
virtual MPI_Op addmop() const
  MPI addition operation for reductions
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
  broadcast, same interface as MPI_Bcast, but excluding the comm
void reduce_step_post(int edge_len, char *C, bool is_sparse_C, bool move_C, algstrct const *sr_C, int64_t b_C, int64_t s_C, char *buf_C, CommData *cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t *size_blk_C, int &new_nblk_C, int64_t *&new_size_blk_C, int64_t *offsets_C, int ib, char const *&rec_beta, char const *beta, char *&up_C, char *&new_C, int n_new_C_grps, int &i_new_C_grp, char **new_C_grps)
spctr_2d_general(spctr *other)
  copies spctr object
void socopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t *&sizes_b, int64_t *&offsets_b)
int el_size
  size of each element of algstrct in bytes
int cdealloc(void *ptr)
  free abstraction
algstrct (algebraic structure)
  defines the elementwise operations computed in each tensor contraction...
void find_bsizes(int64_t &b_A, int64_t &b_B, int64_t &b_C, int64_t &s_A, int64_t &s_B, int64_t &s_C, int64_t &aux_size)
  determines buffer and block sizes needed for spctr_2d_general
char * reduce_step_pre(int edge_len, char *C, bool is_sparse_C, bool move_C, algstrct const *sr_C, int64_t b_C, int64_t s_C, char *buf_C, CommData *cdt_C, int64_t ctr_sub_lda_C, int64_t ctr_lda_C, int nblk_C, int64_t const *size_blk_C, int &new_nblk_C, int64_t *&new_size_blk_C, int64_t *offsets_C, int ib, char const *&rec_beta)
void print()
  print ctr object
~spctr_2d_general()
  deallocs spctr_2d_general object
virtual char const * mulid() const
  identity element for multiplication i.e. 1
void run(char *A, char *B, char *C)
virtual MPI_Datatype mdtype() const
  MPI datatype
void spcopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t const *offsets_a, char const *a, int64_t const *sizes_b, int64_t const *offsets_b, char *b)
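The socopy/spcopy helpers listed above move strided sub-panels of serialized sparse blocks. Below is a minimal sketch of what the socopy signature suggests: gather an m-by-n sub-panel of per-block sizes out of a larger array with leading dimension lda_a into a compact copy with leading dimension lda_b, building matching byte offsets by prefix summation. The name socopy_sketch and the std::vector interface are illustrative, not the library's implementation.

#include <vector>
#include <cstdint>

void socopy_sketch(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b,
                   std::vector<int64_t> const & sizes_a,
                   std::vector<int64_t> & sizes_b,
                   std::vector<int64_t> & offsets_b){
  sizes_b.assign(lda_b * n, 0);
  offsets_b.assign(lda_b * n, 0);
  int64_t acc = 0;
  for (int64_t j = 0; j < n; j++){        // columns of the sub-panel
    for (int64_t i = 0; i < m; i++){      // rows within a column
      sizes_b[j*lda_b + i]   = sizes_a[j*lda_a + i];
      offsets_b[j*lda_b + i] = acc;       // running byte offset in the packed copy
      acc += sizes_b[j*lda_b + i];
    }
  }
}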