#include "../tensor/untyped_tensor.h"
#include "../mapping/mapping.h"
#include "../shared/util.h"
#include "../shared/offload.h"

/* excerpt from the ctr_2d_gen_build parameter list: leading-dimension counts
   (ctr_lda), sub-block strides (ctr_sub_lda), and virtual block lengths for
   each of A, B, and C */
                      int64_t &   cg_ctr_lda_A,
                      int64_t &   cg_ctr_sub_lda_A,
                      int const * virt_blk_len_A,
                      int64_t &   cg_ctr_lda_B,
                      int64_t &   cg_ctr_sub_lda_B,
                      int const * virt_blk_len_B,
                      int64_t &   cg_ctr_lda_C,
                      int64_t &   cg_ctr_sub_lda_C,
                      int const * virt_blk_len_C,

/* the virtualization factor of this dimension is read off the mapping */
      virt_dim[i] = map->np;
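To make the mapping terminology concrete, here is a minimal, illustrative sketch; map_sketch and its fields are hypothetical stand-ins, not CTF's mapping class. The phase of a dimension is the product of np over its mapping chain, and the virtualization factor is what remains after dividing out the physical process count.

#include <cstdio>

struct map_sketch {          // hypothetical stand-in for a mapping level
  int np;                    // processes (or virtual copies) on this level
  map_sketch * child;        // next level of the mapping chain, if any
  int calc_phase() const {
    return np * (child ? child->calc_phase() : 1);
  }
};

int main(){
  map_sketch virt{2, nullptr};     // virtual factor of 2
  map_sketch phys{4, &virt};       // mapped onto 4 physical processes
  printf("phase = %d, virt_dim = %d\n",
         phys.calc_phase(), phys.calc_phase() / phys.np);  // prints 8 and 2
  return 0;
}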
/* determine the leading-dimension count (ctr_lda) and sub-block stride
   (ctr_sub_lda) of B along this 2D SUMMA dimension, accounting for the
   virtualization of the trailing dimensions */
if (B->edge_map[i_B].type == PHYSICAL_MAP)
  cg_ctr_sub_lda_B = blk_sz_B*B->edge_map[i_B].np/cg_edge_len;
else
  cg_ctr_sub_lda_B = blk_sz_B/cg_edge_len;
for (j=i_B+1; j<B->order; j++) {
  cg_ctr_sub_lda_B = (cg_ctr_sub_lda_B * virt_blk_len_B[j]) / blk_len_B[j];
  cg_ctr_lda_B     = (cg_ctr_lda_B * blk_len_B[j]) / virt_blk_len_B[j];
}

/* the same computation for C */
if (C->edge_map[i_C].type == PHYSICAL_MAP)
  cg_ctr_sub_lda_C = blk_sz_C*C->edge_map[i_C].np/cg_edge_len;
else
  cg_ctr_sub_lda_C = blk_sz_C/cg_edge_len;
for (j=i_C+1; j<C->order; j++) {
  cg_ctr_sub_lda_C = (cg_ctr_sub_lda_C * virt_blk_len_C[j]) / blk_len_C[j];
  cg_ctr_lda_C     = (cg_ctr_lda_C * blk_len_C[j]) / virt_blk_len_C[j];
}
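As a rough illustration of what this loop produces, the following standalone sketch runs the same arithmetic on made-up block lengths (the numbers and the main() wrapper are assumptions, not CTF data): the sub-block stride counts contiguous elements per step along dimension i, while the leading-dimension count tracks how many such slices the virtualized trailing dimensions interleave.

#include <cstdint>
#include <cstdio>

int main(){
  int order = 3, i = 1, cg_edge_len = 2;       // contracted dimension and its phase
  int blk_len[3]      = {4, 6, 8};             // assumed block lengths
  int virt_blk_len[3] = {4, 3, 4};             // assumed virtualized block lengths
  int64_t blk_sz  = 4*6*8;                     // elements in the full block
  int64_t sub_lda = blk_sz / cg_edge_len;      // virtual-only case from above
  int64_t lda     = 1;
  for (int j = i+1; j < order; j++){
    sub_lda = (sub_lda * virt_blk_len[j]) / blk_len[j];
    lda     = (lda * blk_len[j]) / virt_blk_len[j];
  }
  printf("sub_lda = %lld, lda = %lld\n",
         (long long)sub_lda, (long long)lda);  // prints 48 and 2
  return 0;
}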
/* rescale the block size and the block length of B along the contracted
   dimension by the number of sub-steps nstep */
if (B->edge_map[i_B].type != PHYSICAL_MAP){
  if (blk_sz_B / nstep == 0)
    printf("blk_len_B[%d] = %d, nstep = %ld blk_sz_B = %ld\n",
           i_B, blk_len_B[i_B], nstep, blk_sz_B);
  blk_sz_B       = blk_sz_B / nstep;
  blk_len_B[i_B] = blk_len_B[i_B] / nstep;
} else {
  if (blk_sz_B * B->edge_map[i_B].np / nstep == 0)
    printf("blk_len_B[%d] = %d B->edge_map[%d].np = %d, nstep = %ld blk_sz_B = %ld\n",
           i_B, blk_len_B[i_B], i_B, B->edge_map[i_B].np, nstep, blk_sz_B);
  blk_sz_B       = blk_sz_B * B->edge_map[i_B].np / nstep;
  blk_len_B[i_B] = blk_len_B[i_B] * B->edge_map[i_B].np / nstep;
}
/* the same rescaling for C */
if (C->edge_map[i_C].type != PHYSICAL_MAP){
  if (blk_sz_C / nstep == 0)
    printf("blk_len_C[%d] = %d, nstep = %ld blk_sz_C = %ld\n",
           i_C, blk_len_C[i_C], nstep, blk_sz_C);
  blk_sz_C       = blk_sz_C / nstep;
  blk_len_C[i_C] = blk_len_C[i_C] / nstep;
} else {
  if (blk_sz_C * C->edge_map[i_C].np / nstep == 0)
    printf("blk_len_C[%d] = %d C->edge_map[%d].np = %d, nstep = %ld blk_sz_C = %ld\n",
           i_C, blk_len_C[i_C], i_C, C->edge_map[i_C].np, nstep, blk_sz_C);
  blk_sz_C       = blk_sz_C * C->edge_map[i_C].np / nstep;
  blk_len_C[i_C] = blk_len_C[i_C] * C->edge_map[i_C].np / nstep;
}
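For a feel of the arithmetic, the following self-contained sketch plugs in assumed numbers (none of the values come from CTF): a virtual-only dimension simply cuts its block nstep ways, whereas a physically mapped dimension is first scaled by the number of processes np that hold it.

#include <cstdint>
#include <cstdio>

int main(){
  int64_t nstep = 3;                        // sub-steps per 2D SUMMA level

  /* virtual-only dimension: the local block is simply cut nstep ways */
  int64_t blk_sz_B = 48; int blk_len_B = 6;
  blk_sz_B  /= nstep;                       // 16 elements per sub-step
  blk_len_B /= nstep;                       // length 2 along the contracted dim

  /* physically mapped dimension: np processes each hold a piece, so the
     size is scaled by np before dividing by nstep */
  int np = 3;
  int64_t blk_sz_C = 48; int blk_len_C = 6;
  blk_sz_C  = blk_sz_C  * np / nstep;       // stays 48 when np == nstep
  blk_len_C = blk_len_C * np / nstep;       // stays 6  when np == nstep
  printf("B: %lld x %d, C: %lld x %d\n",
         (long long)blk_sz_B, blk_len_B, (long long)blk_sz_C, blk_len_C);
  return 0;
}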
if (cg_ctr_sub_lda_A == 0)
  load_phase_A *= nstep;
if (cg_ctr_sub_lda_B == 0)
  load_phase_B *= nstep;
if (cg_ctr_sub_lda_C == 0)
  load_phase_C *= nstep;
/* from the copy constructor ctr_2d_general(ctr * other) */
alloc_host_buf = o->alloc_host_buf;

/* from ctr_2d_general::print() */
printf("ctr_2d_general: edge_len = %d\n", edge_len);
printf("move_A = %d, ctr_lda_A = %ld, ctr_sub_lda_A = %ld\n",
       move_A, ctr_lda_A, ctr_sub_lda_A);
printf("move_B = %d, ctr_lda_B = %ld, ctr_sub_lda_B = %ld\n",
       move_B, ctr_lda_B, ctr_sub_lda_B);
printf("move_C = %d, ctr_lda_C = %ld, ctr_sub_lda_C = %ld\n",
       move_C, ctr_lda_C, ctr_sub_lda_C);
if (alloc_host_buf)
  printf("alloc_host_buf is true\n");
else
  printf("alloc_host_buf is false\n");
/* ctr_2d_general::find_bsizes starts with empty broadcast block counts */
b_A = 0, b_B = 0, b_C = 0;

/* ctr_2d_general::est_time_fp derives its estimate from the buffer sizes */
int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
double est_comm_time = 0.0;
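The excerpt above only initializes the estimate; the standalone sketch below shows one way such an estimate could be assembled. The latency/bandwidth constants, the helper names (estimate_bcast_time_sketch, est_comm_time_sketch), and the scaling by edge_len are assumptions for illustration, not CTF's calibrated model.

#include <cstdint>
#include <algorithm>

double estimate_bcast_time_sketch(int64_t msg_sz_bytes){
  const double alpha = 1e-6;    // assumed per-message latency (s)
  const double beta  = 1e-10;   // assumed per-byte cost (s/byte)
  return alpha + beta * (double)msg_sz_bytes;
}

double est_comm_time_sketch(bool move_A, bool move_B, bool move_C,
                            int64_t bytes_A, int64_t bytes_B, int64_t bytes_C,
                            int edge_len, int nlyr){
  double t = 0.0;
  if (move_A) t += estimate_bcast_time_sketch(bytes_A);
  if (move_B) t += estimate_bcast_time_sketch(bytes_B);
  if (move_C) t += estimate_bcast_time_sketch(bytes_C);   // C is reduced rather than broadcast
  return t * (double)edge_len / std::min(nlyr, edge_len); // one exchange per step, split over layers
}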
/* ctr_2d_general::mem_fp computes its buffer footprint from the same sizes */
int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);
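How those sizes could turn into a byte count is sketched below under stated assumptions: the formula (three panel buffers plus one auxiliary packing buffer sized as the largest of them) and the name estimate_mem_fp are illustrative, not the exact accounting in ctr_2d_general::mem_fp.

#include <cstdint>
#include <algorithm>

int64_t estimate_mem_fp(int64_t s_A, int64_t s_B, int64_t s_C,
                        int el_size_A, int el_size_B, int el_size_C){
  int64_t bytes_A  = (int64_t)el_size_A * s_A;
  int64_t bytes_B  = (int64_t)el_size_B * s_B;
  int64_t bytes_C  = (int64_t)el_size_C * s_C;
  int64_t aux_size = std::max(bytes_A, std::max(bytes_B, bytes_C));  // scratch for packing
  return bytes_A + bytes_B + bytes_C + aux_size;
}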
/* excerpts from ctr_2d_general::run(char * A, char * B, char * C) */
int owner_A, owner_B, owner_C, ret;
char * buf_A, * buf_B, * buf_C;
char * op_A, * op_B, * op_C;
int rank_A, rank_B, rank_C;
int64_t b_A, b_B, b_C, s_A, s_B, s_C, aux_size;
int iidx_lyr, inum_lyr;

find_bsizes(b_A, b_B, b_C, s_A, s_B, s_C, aux_size);

/* outer loop over the contracted edge length, strided across processor layers */
for (ib=iidx_lyr; ib<edge_len; ib+=inum_lyr)

/* the rank owning the current panel of A packs it before communication */
if (rank_A == owner_A){

/* trailing arguments of the strided copy that packs a panel of A into buf_A */
    A+sr_A->el_size*ib*ctr_sub_lda_A, ctr_sub_lda_A*edge_len,
    buf_A, ctr_sub_lda_A);

/* analogous ownership checks and packing for B and C */
if (rank_B == owner_B){
    buf_B, ctr_sub_lda_B);
if (rank_C == owner_C){
if (ctr_sub_lda_C == 0)
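Since only fragments of run() survive above, here is a compact, self-contained sketch of the communication pattern it follows, under assumptions that are not CTF's: plain double elements, panels stored contiguously on their owner, and separate row/column communicators of a 2D process grid. The helper name summa_like and the panel layout are hypothetical; the point is only "the owner packs a panel, the panel is broadcast along one grid dimension, and every process applies a rank-b update", as the member documentation below describes.

#include <mpi.h>
#include <vector>
#include <algorithm>
#include <cstddef>

void summa_like(const double * A_loc, const double * B_loc, double * C_loc,
                int m, int n, int b, int edge_len,
                MPI_Comm row_comm, MPI_Comm col_comm){
  int row_rank, col_rank, row_np, col_np;
  MPI_Comm_rank(row_comm, &row_rank);  MPI_Comm_size(row_comm, &row_np);
  MPI_Comm_rank(col_comm, &col_rank);  MPI_Comm_size(col_comm, &col_np);
  std::vector<double> buf_A((size_t)m*b), buf_B((size_t)b*n);
  for (int ib = 0; ib < edge_len; ib++){
    int owner_A = ib % row_np;               // rank that owns this panel of A
    int owner_B = ib % col_np;               // rank that owns this panel of B
    if (row_rank == owner_A)                 // owner packs its local panel
      std::copy(A_loc + (size_t)(ib/row_np)*m*b,
                A_loc + (size_t)(ib/row_np + 1)*m*b, buf_A.begin());
    MPI_Bcast(buf_A.data(), m*b, MPI_DOUBLE, owner_A, row_comm);
    if (col_rank == owner_B)
      std::copy(B_loc + (size_t)(ib/col_np)*b*n,
                B_loc + (size_t)(ib/col_np + 1)*b*n, buf_B.begin());
    MPI_Bcast(buf_B.data(), b*n, MPI_DOUBLE, owner_B, col_comm);
    for (int i = 0; i < m; i++)              // rank-b update of the local C block
      for (int k = 0; k < b; k++)
        for (int j = 0; j < n; j++)
          C_loc[(size_t)i*n + j] += buf_A[(size_t)i*b + k] * buf_B[(size_t)k*n + j];
  }
}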
~ctr_2d_general()
deallocates the ctr_2d_general object
int calc_phase() const
compute the phase of a mapping
void red(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op, int root)
reduce, same interface as MPI_Reduce, but excluding the comm
void print()
print ctr object
virtual int64_t mem_rec()
virtual void copy(char *a, char const *b) const
copies element b to element a
void host_pinned_alloc(void **ptr, int64_t size)
allocate a pinned host buffer
int64_t mem_rec()
returns the number of bytes of buffer space we need recursively
virtual char const * addid() const
identity element for addition i.e. 0
int order
number of tensor dimensions
double est_time_rec(int nlyr)
returns the number of bytes sent by each proc recursively (recursion pattern sketched after this list)
double estimate_red_time(int64_t msg_sz, MPI_Op op)
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction
void host_pinned_free(void *ptr)
free a pinned host buffer
int64_t mem_fp()
returns the number of bytes of buffer space we need
void run(char *A, char *B, char *C)
Basically doing SUMMA, except assumes equal block size on each processor. Performs rank-b updates where b is the smallest blocking factor among A and B or A and C or B and C.
ctr_2d_general(ctr *other)
copies ctr object
int comp_dim_map(mapping const *map_A, mapping const *map_B)
compares two mappings
double estimate_bcast_time(int64_t msg_sz)
virtual MPI_Op addmop() const
MPI addition operation for reductions.
mapping * edge_map
mappings of each tensor dimension onto topology dimensions
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
virtual double est_time_rec(int nlyr)
double est_time_fp(int nlyr)
returns the number of bytes this kernel will send per processor
void find_bsizes(int64_t &b_A, int64_t &b_B, int64_t &b_C, int64_t &s_A, int64_t &s_B, int64_t &s_C, int64_t &aux_size)
determines buffer and block sizes needed for ctr_2d_general
virtual void run(char *A, char *B, char *C)
int el_size
size of each element of algstrct in bytes
int cdealloc(void *ptr)
free abstraction
internal distributed tensor class
topology * topo
topology to which the tensor is mapped
int ctr_2d_gen_build(int is_used, CommData global_comm, int i, int *virt_dim, int &cg_edge_len, int &total_iter, tensor *A, int i_A, CommData *&cg_cdt_A, int64_t &cg_ctr_lda_A, int64_t &cg_ctr_sub_lda_A, bool &cg_move_A, int *blk_len_A, int64_t &blk_sz_A, int const *virt_blk_len_A, int &load_phase_A, tensor *B, int i_B, CommData *&cg_cdt_B, int64_t &cg_ctr_lda_B, int64_t &cg_ctr_sub_lda_B, bool &cg_move_B, int *blk_len_B, int64_t &blk_sz_B, int const *virt_blk_len_B, int &load_phase_B, tensor *C, int i_C, CommData *&cg_cdt_C, int64_t &cg_ctr_lda_C, int64_t &cg_ctr_sub_lda_C, bool &cg_move_C, int *blk_len_C, int64_t &blk_sz_C, int const *virt_blk_len_C, int &load_phase_C)
sets up a ctr_2d_general (2D SUMMA) level where A is not communicated; the function will be called with A/B/C permuted depending on the desired alignment
virtual char const * mulid() const
identity element for multiplication i.e. 1
virtual MPI_Datatype mdtype() const
MPI datatype.
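The mem_rec/est_time_rec entries above follow a common recursive-accounting pattern; here is a minimal sketch of that pattern (the ctr_sketch type and its inner pointer are hypothetical, not the CTF class hierarchy): each kernel adds its own footprint or time estimate to whatever the kernel it recurses into reports.

#include <cstdint>

struct ctr_sketch {
  ctr_sketch * inner = nullptr;                 // next kernel in the recursion
  virtual int64_t mem_fp() const { return 0; }  // bytes this level needs itself
  virtual int64_t mem_rec() const {
    return mem_fp() + (inner ? inner->mem_rec() : 0);
  }
  virtual double est_time_fp(int nlyr) const { return 0.0; }
  virtual double est_time_rec(int nlyr) const {
    return est_time_fp(nlyr) + (inner ? inner->est_time_rec(nlyr) : 0.0);
  }
  virtual ~ctr_sketch() {}
};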