#include "../shared/util.h"

// ...
                      char **              tsr_cyclic_data,
// ...
  int old_num_virt, new_num_virt, numPes;
  int64_t new_nval, swp_nval;
  // ...
  int * virt_phase_rank, * old_virt_phase_rank, * sub_edge_len;
  char * pairs, * tsr_new_data;
  // ...
  numPes = ord_glb_comm.np;
  // ...
  alloc_ptr(old_dist.order*sizeof(int), (void**)&old_virt_phase_rank);
  // ...
  idx_lyr = ord_glb_comm.rank;
  for (int i=0; i<old_dist.order; i++){
    old_num_virt = old_num_virt*old_dist.virt_phase[i];
    new_num_virt = new_num_virt*new_dist.virt_phase[i];
    virt_phase_rank[i] = new_dist.perank[i];
    old_virt_phase_rank[i] = old_dist.perank[i];
  }
  // ...
                 old_dist.virt_phase, old_virt_phase_rank, &new_nval, tsr_data,
  // ...
  for (int i=0; i<old_dist.order; i++){
    // ...
  }
  if (ord_glb_comm.rank == 0){
    DPRINTF(1,"Tensor now has virtualization factor of %d\n",new_num_virt);
  }
  // ...
  if (ord_glb_comm.rank == 0){
    DPRINTF(1,"Tensor is of size %ld, has factor of %lf growth due to padding\n",
            /* ... */
            ord_glb_comm.np*(swp_nval/(double)old_dist.size));
  }
  // ...
  if (sr->addid() != NULL)
    sr->set(tsr_new_data, sr->addid(), swp_nval);
  // ...
  *tsr_cyclic_data = tsr_new_data;
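The padded_reshuffle excerpt above ends by zero-filling the new padded buffer with the additive identity (sr->set(tsr_new_data, sr->addid(), swp_nval)) and publishing it through *tsr_cyclic_data; the elided middle reads the local elements as (global key, value) pairs and writes them back under the new distribution (see read_loc_pairs and wr_pairs_layout in the reference entries below). The following is a minimal, self-contained sketch of that key-value redistribution idea for a 1-D array moved from a cyclic to a blocked layout; cyclic_to_blocked, blk_owner and blk_slot are illustrative names, not CTF routines, and double stands in for the algstrct element type.

    #include <mpi.h>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Owner/local-slot maps for the blocked destination layout of this sketch.
    static int     blk_owner(int64_t g, int64_t blk) { return (int)(g / blk); }
    static int64_t blk_slot (int64_t g, int64_t blk) { return g % blk; }

    // Redistribute a length-len vector from a cyclic to a blocked layout.
    void cyclic_to_blocked(int64_t                     len,
                           const std::vector<double> & old_loc, // old_loc[i] holds global index rank + i*np
                           std::vector<double> &       new_loc, // resized to the padded block size
                           MPI_Comm                    comm){
      int rank, np;
      MPI_Comm_rank(comm, &rank);
      MPI_Comm_size(comm, &np);
      int64_t blk = (len + np - 1) / np;            // padded block size of the new layout

      // 1) read local (key,value) pairs and bucket them by destination rank
      std::vector<std::vector<std::pair<int64_t,double>>> bins(np);
      for (int64_t i = 0; i < (int64_t)old_loc.size(); i++){
        int64_t g = rank + i*np;                    // global key of my i-th cyclic element
        if (g < len) bins[blk_owner(g, blk)].push_back({g, old_loc[i]});
      }

      // 2) counts, displacements, and the all-to-all pair exchange
      std::vector<int> scnt(np), rcnt(np), sdsp(np, 0), rdsp(np, 0);
      for (int p = 0; p < np; p++) scnt[p] = (int)bins[p].size();
      MPI_Alltoall(scnt.data(), 1, MPI_INT, rcnt.data(), 1, MPI_INT, comm);
      for (int p = 1; p < np; p++){
        sdsp[p] = sdsp[p-1] + scnt[p-1];
        rdsp[p] = rdsp[p-1] + rcnt[p-1];
      }
      int stot = sdsp[np-1] + scnt[np-1], rtot = rdsp[np-1] + rcnt[np-1];
      std::vector<int64_t> skey(stot), rkey(rtot);
      std::vector<double>  sval(stot), rval(rtot);
      for (int p = 0; p < np; p++)
        for (int j = 0; j < scnt[p]; j++){
          skey[sdsp[p]+j] = bins[p][j].first;
          sval[sdsp[p]+j] = bins[p][j].second;
        }
      MPI_Alltoallv(skey.data(), scnt.data(), sdsp.data(), MPI_INT64_T,
                    rkey.data(), rcnt.data(), rdsp.data(), MPI_INT64_T, comm);
      MPI_Alltoallv(sval.data(), scnt.data(), sdsp.data(), MPI_DOUBLE,
                    rval.data(), rcnt.data(), rdsp.data(), MPI_DOUBLE, comm);

      // 3) zero-fill the padded buffer (the role of sr->set(tsr_new_data, sr->addid(), swp_nval))
      //    and write each received pair into its local slot under the new layout
      new_loc.assign(blk, 0.0);
      for (int j = 0; j < rtot; j++)
        new_loc[blk_slot(rkey[j], blk)] = rval[j];
    }

The zero-fill before scattering the received pairs plays the same role as the sr->set call above: padded slots that receive no pair stay at the additive identity.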
// ...
                            int const *          old_phys_edge_len,
                            int const *          old_virt_lda,
                            int const *          old_offsets,
                            int * const *        old_permutation,
                            int const *          new_phys_edge_len,
                            int const *          new_virt_lda,
                            // ...
                            int const *          old_virt_edge_len){
  // ...
  int ** bucket_offset;
  alloc_ptr(sizeof(int*)*old_dist.order, (void**)&bucket_offset);
  // ...
    alloc_ptr(sizeof(int)*old_phys_edge_len[dim], (void**)&bucket_offset[dim]);
    // ...
    for (int vidx = 0; vidx < old_virt_edge_len[dim]; vidx++){
      // ...
      if (_gidx > len[dim] || (old_offsets != NULL && _gidx < old_offsets[dim])){
        // ...
      if (old_permutation == NULL || old_permutation[dim] == NULL){
        // ...
        gidx = old_permutation[dim][_gidx];
      // ...
        bucket_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
      // ...
        bucket_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
      // ...
        bucket_offset[dim][pidx] = -1;
  // ...
  return bucket_offset;
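compute_bucket_offsets, excerpted above, builds one table per dimension so that the destination bucket of any element can be found by summing a single precomputed entry per dimension; positions that fall into padding get -1. The sketch below restates the idea for one dimension of a plain cyclic layout: Dist1D, bucket_offsets_1d and their fields are illustrative assumptions, not CTF's distribution type, and virtualization is ignored for brevity.

    #include <cstdint>
    #include <vector>

    struct Dist1D {
      int phys_phase;  // number of processes along this dimension of the grid
      int pe_lda;      // stride of this grid dimension in the linearized rank
    };

    // offsets[p] = contribution of local position p (0 <= p < phys_edge_len) to
    // the linearized destination rank; -1 marks padded positions past len.
    std::vector<int> bucket_offsets_1d(const Dist1D & new_dist,
                                       int            phys_edge_len,
                                       int            old_phase,   // total phase of the old layout
                                       int            my_rank_dim, // my coordinate along this dim (old grid)
                                       int64_t        len){        // unpadded edge length
      std::vector<int> offsets(phys_edge_len);
      for (int p = 0; p < phys_edge_len; p++){
        int64_t gidx = my_rank_dim + (int64_t)p*old_phase;   // global index of this position
        if (gidx >= len){
          offsets[p] = -1;                                   // padding, never sent
        } else {
          int phys_rank = (int)(gidx % new_dist.phys_phase); // owner along this dim, new layout
          offsets[p] = phys_rank * (new_dist.pe_lda > 1 ? new_dist.pe_lda : 1);
        }
      }
      return offsets;
    }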
// ...
                     int const *   old_virt_edge_len,
                     int const *   new_virt_lda,
                     int64_t *     send_counts,
                     int64_t *     recv_counts,
                     int64_t *     send_displs,
                     int64_t *     recv_displs,
                     // ...
                     int * const * bucket_offset){
  int64_t * all_virt_counts;
  // ...
  int max_ntd = omp_get_max_threads();
  max_ntd = MAX(1,MIN(max_ntd,vbs/np));
  // ...
  mst_alloc_ptr(np*sizeof(int64_t)*max_ntd, (void**)&all_virt_counts);
  // ...
  if (old_dist.order == 0){
    memset(all_virt_counts, 0, np*sizeof(int64_t));
    all_virt_counts[0]++;
  // ...
  #pragma omp parallel num_threads(max_ntd)
  {
    // ...
    int imax, act_max, skip;
    int start_ldim, end_ldim;
    // ...
    int64_t * virt_counts;
    int * old_virt_idx, * virt_rank;
    // ...
    ntd = omp_get_num_threads();
    tid = omp_get_thread_num();
    // ...
    virt_counts = all_virt_counts+np*tid;
    start_ldim = (last_len/ntd)*tid;
    start_ldim += MIN(tid,last_len%ntd);
    end_ldim = (last_len/ntd)*(tid+1);
    end_ldim += MIN(tid+1,last_len%ntd);
  // ...
  int imax, act_max, skip;
  int start_ldim, end_ldim, i_st, vc, dim;
  int64_t * virt_counts;
  int64_t * old_virt_idx, * virt_rank;
  // ...
  virt_counts = all_virt_counts;
  // ...
  memset(virt_counts, 0, np*sizeof(int64_t));
  memset(old_virt_idx, 0, old_dist.order*sizeof(int64_t));
  // ...
  for (int i=0; i<old_dist.order; i++){
    virt_rank[i] = old_dist.perank[i];
  // ...
  memset(idx, 0, old_dist.order*sizeof(int64_t));
  memset(idx_offs, 0, old_dist.order*sizeof(int64_t));
  // ...
  idx[old_dist.order-1] = MAX(idx[old_dist.order-1],start_ldim);
  for (dim=0; dim<old_dist.order; dim++) {
    // ...
    if (sym[dim] != SY && virt_rank[dim] < virt_rank[dim+1])
    // ...
    if (sym[dim] == SY && virt_rank[dim] <= virt_rank[dim+1])
    // ...
    if (sym[dim] != NS && idx[dim] >= idx[dim+1]-spad[dim]){
      idx[dim+1] = idx[dim]+spad[dim];
    // ...
    if (dim == old_dist.order - 1)
      imax = MIN(imax, end_ldim);
    if (idx[dim] >= imax)
    // ...
    idx_offset += idx_offs[dim];
  // ...
  imax = MIN(imax,idx[1]+1-spad[0]);
  // ...
  if (old_dist.order == 1){
    imax = MIN(imax, end_ldim);
  // ...
  for (int i=i_st; i<imax; i++){
    vc = bucket_offset[0][old_virt_idx[0]+i*old_dist.virt_phase[0]];
    virt_counts[idx_offset+vc]++;
  // ...
  for (dim=1; dim < old_dist.order; dim++){
    // ...
    if (dim == old_dist.order - 1)
      act_max = MIN(act_max, end_ldim);
    // ...
      act_max = MIN(act_max,idx[dim+1]+1-spad[dim]);
    // ...
    if (idx[dim] >= act_max){
      // ...
      if (sym[dim-1] != NS) idx[dim] = idx[dim-1]+spad[dim-1];
      // ...
      idx_offset -= idx_offs[dim];
      // ...
      idx_offset += idx_offs[dim];
    // ...
  if (dim == old_dist.order) break;
  // ...
  for (dim=0; dim<old_dist.order; dim++){
    // ...
    if (old_virt_idx[dim] >= old_dist.virt_phase[dim])
      old_virt_idx[dim] = 0;
    // ...
    if (old_virt_idx[dim] > 0)
    // ...
  if (dim == old_dist.order) break;
  // ...
  for (int j=1; j<act_omp_ntd; j++){
    for (int64_t i=0; i<np; i++){
      all_virt_counts[i] += all_virt_counts[i+np*j];
  // ...
  memset(all_virt_counts, 0, sizeof(int64_t)*np);
  // ...
  memcpy(send_counts, all_virt_counts, np*sizeof(int64_t));
  // ...
  MPI_Alltoall(send_counts, 1, MPI_INT64_T,
               recv_counts, 1, MPI_INT64_T, ord_glb_comm.cm);
  // ...
  for (int i=1; i<np; i++){
    send_displs[i] = send_displs[i-1] + send_counts[i-1];
    recv_displs[i] = recv_displs[i-1] + recv_counts[i-1];
  }
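The tail of calc_cnt_displs above converts the locally accumulated per-destination counts into receive counts with MPI_Alltoall and into send/receive displacements with an exclusive prefix sum. The sketch below isolates exactly that pattern; make_displs is an illustrative name and the send counts are assumed to be already filled.

    #include <mpi.h>
    #include <cstdint>
    #include <vector>

    void make_displs(const std::vector<int64_t> & send_counts,
                     std::vector<int64_t> &       recv_counts,
                     std::vector<int64_t> &       send_displs,
                     std::vector<int64_t> &       recv_displs,
                     MPI_Comm                     comm){
      int np;
      MPI_Comm_size(comm, &np);
      recv_counts.resize(np);
      send_displs.assign(np, 0);
      recv_displs.assign(np, 0);

      // every process learns how much it will receive from every other process
      MPI_Alltoall(send_counts.data(), 1, MPI_INT64_T,
                   recv_counts.data(), 1, MPI_INT64_T, comm);

      // exclusive prefix sums turn counts into starting offsets
      for (int p = 1; p < np; p++){
        send_displs[p] = send_displs[p-1] + send_counts[p-1];
        recv_displs[p] = recv_displs[p-1] + recv_counts[p-1];
      }
    }

These four arrays are what a subsequent MPI_Alltoallv of the bucketed data would consume.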
  // ...
  double ps[] = {(double)nv0+nv1, (double)tot_sz};
  // ...
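The single surviving line of blres_est_time above packs the model parameters, the number of virtual blocks exchanged (nv0+nv1) and the total data volume (tot_sz), into the vector that the LinModel<2> instance blres_mdl evaluates (see est_time and observe in the reference entries below). A hedged sketch of how such a two-parameter linear model produces a prediction; LinModel2, est_block_reshuffle_time and the coefficient values are illustrative, not CTF's model class:

    #include <cstdint>

    struct LinModel2 {
      double coeff[2];                       // fitted cost per parameter unit
      double est_time(const double * ps) const {
        return coeff[0]*ps[0] + coeff[1]*ps[1];
      }
    };

    double est_block_reshuffle_time(int64_t tot_sz, int nv0, int nv1){
      static LinModel2 mdl = {{1.e-6, 2.e-9}};           // illustrative latency/bandwidth terms
      double ps[] = {(double)nv0 + nv1, (double)tot_sz}; // same parameter vector as above
      return mdl.est_time(ps);
    }

In CTF the coefficients are not fixed constants: observe() records (execution time, parameter) tuples and the linear model is refit from those measurements.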
// ...
                     char *&              tsr_cyclic_data,
  // ...
  int i, idx_lyr_new, idx_lyr_old, rem_idx, prc_idx, loc_idx;
  int num_old_virt, num_new_virt;
  int * idx, * old_loc_lda, * new_loc_lda, * phase_lda;
  // ...
  int * phase = old_dist.phase;
  int order = old_dist.order;
  // ...
  tsr_cyclic_data = sr->alloc(new_dist.size);
  // ...
  if (glb_comm.rank == 0){
    sr->copy(tsr_cyclic_data, tsr_data);
  // ...
  MPI_Barrier(glb_comm.cm);
  double st_time = MPI_Wtime();
  // ...
  tsr_cyclic_data = sr->alloc(new_dist.size);
  alloc_ptr(sizeof(int)*order, (void**)&idx);
  alloc_ptr(sizeof(int)*order, (void**)&old_loc_lda);
  alloc_ptr(sizeof(int)*order, (void**)&new_loc_lda);
  alloc_ptr(sizeof(int)*order, (void**)&phase_lda);
  // ...
  blk_sz = old_dist.size;
  // ...
  idx_lyr_old = glb_comm.rank;
  idx_lyr_new = glb_comm.rank;
  // ...
  for (i=0; i<order; i++){
    // ...
    old_loc_lda[i] = old_loc_lda[i-1]*old_dist.virt_phase[i-1];
    new_loc_lda[i] = new_loc_lda[i-1]*new_dist.virt_phase[i-1];
    phase_lda[i] = phase_lda[i-1]*phase[i-1];
  // ...
  double * tps = (double*)malloc(3*sizeof(double));
  // ...
  tps[1] = (double)num_old_virt+num_new_virt;
  tps[2] = (double)std::max(new_dist.size, new_dist.size);
  // ...
  alloc_ptr(sizeof(MPI_Request)*(num_old_virt+num_new_virt), (void**)&reqs);
  // ...
  if (idx_lyr_new == 0){
    memset(idx, 0, sizeof(int)*order);
    // ...
      for (i=0; i<order; i++){
        loc_idx += idx[i]*new_loc_lda[i];
      // ...
      DPRINTF(3,"proc %d receiving blk %d (loc %d, size %ld) from proc %d\n",
              glb_comm.rank, rem_idx, loc_idx, blk_sz, prc_idx);
      MPI_Irecv(tsr_cyclic_data+sr->el_size*loc_idx*blk_sz, blk_sz,
                sr->mdtype(), prc_idx, rem_idx, glb_comm.cm, reqs+loc_idx);
      for (i=0; i<order; i++){
      // ...
  if (idx_lyr_old == 0){
    memset(idx, 0, sizeof(int)*order);
    // ...
      for (i=0; i<order; i++){
        loc_idx += idx[i]*old_loc_lda[i];
      // ...
      DPRINTF(3,"proc %d sending blk %d (loc %d size %ld) to proc %d el_size = %d %p %p\n",
              glb_comm.rank, loc_idx, loc_idx, blk_sz, prc_idx, sr->el_size, tsr_data, reqs+num_new_virt+loc_idx);
      MPI_Isend(tsr_data+sr->el_size*loc_idx*blk_sz, blk_sz,
                sr->mdtype(), prc_idx, loc_idx, glb_comm.cm, reqs+num_new_virt+loc_idx);
      for (i=0; i<order; i++){
      // ...
  if (idx_lyr_new == 0 && idx_lyr_old == 0){
    MPI_Waitall(num_new_virt+num_old_virt, reqs, MPI_STATUSES_IGNORE);
  } else if (idx_lyr_new == 0){
    MPI_Waitall(num_new_virt, reqs, MPI_STATUSES_IGNORE);
  } else if (idx_lyr_old == 0){
    MPI_Waitall(num_old_virt, reqs+num_new_virt, MPI_STATUSES_IGNORE);
    if (sr->addid() != NULL)
      // ...
  // ...
  if (sr->addid() != NULL)
    // ...
  // ...
  MPI_Barrier(glb_comm.cm);
  double exe_time = MPI_Wtime()-st_time;
  tps = (double*)malloc(3*sizeof(double));
  // ...
  tps[1] = (double)num_old_virt+num_new_virt;
  tps[2] = (double)std::max(new_dist.size, new_dist.size);
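block_reshuffle, excerpted above, exchanges whole virtual blocks: it posts one MPI_Irecv per incoming block and one MPI_Isend per outgoing block, using the sender-side block index as the message tag, and then waits on all requests at once. The sketch below shows only that communication pattern on plain double blocks; exchange_blocks, dest, src and rem_idx are illustrative stand-ins for the old_loc_lda/new_loc_lda/phase_lda index arithmetic in the real routine.

    #include <mpi.h>
    #include <vector>

    void exchange_blocks(const std::vector<std::vector<double>> & send_blks, // my outgoing blocks
                         std::vector<std::vector<double>> &       recv_blks, // preallocated incoming blocks
                         const std::vector<int> & dest,     // dest[b]    = rank that receives my block b
                         const std::vector<int> & src,      // src[b]     = rank that sends my incoming block b
                         const std::vector<int> & rem_idx,  // rem_idx[b] = index of that block on its sender
                         MPI_Comm                 comm){
      std::vector<MPI_Request> reqs(send_blks.size() + recv_blks.size());

      // post receives first so matching sends can complete eagerly
      for (size_t b = 0; b < recv_blks.size(); b++)
        MPI_Irecv(recv_blks[b].data(), (int)recv_blks[b].size(), MPI_DOUBLE,
                  src[b], /*tag=*/rem_idx[b], comm, &reqs[b]);

      for (size_t b = 0; b < send_blks.size(); b++)
        MPI_Isend(send_blks[b].data(), (int)send_blks[b].size(), MPI_DOUBLE,
                  dest[b], /*tag=*/(int)b, comm, &reqs[recv_blks.size() + b]);

      MPI_Waitall((int)reqs.size(), reqs.data(), MPI_STATUSES_IGNORE);
    }

Posting the receives before the sends, as the excerpt above also does, lets matching sends complete without unexpected-message buffering.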
// ...
                        int const *     old_phase,
  // ...
  int can_block_resh = 1;
  for (j=0; j<order; j++){
    // ...
    if (new_phase != old_phase[j]) can_block_resh = 0;
  }
  return can_block_resh;
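can_block_reshuffle, excerpted above, allows the block path only when every dimension keeps its total phase. A hedged sketch of how the routines in this file might be combined by a caller, assuming the declarations whose signatures appear in the reference entries below are in scope; choose_reshuffle and its map argument are illustrative, not CTF code:

    void choose_reshuffle(int const *          sym,
                          distribution const & old_dist,
                          distribution const & new_dist,
                          mapping const *      map,            // mapping of each dim under the new layout
                          char *               tsr_data,
                          char **              tsr_cyclic_data,
                          algstrct const *     sr,
                          CommData             glb_comm){
      if (can_block_reshuffle(old_dist.order, old_dist.phase, map)){
        // every dimension keeps its total phase: permute whole blocks
        block_reshuffle(old_dist, new_dist, tsr_data, *tsr_cyclic_data, sr, glb_comm);
      } else {
        // phases changed: fall back to the padded key-value redistribution
        padded_reshuffle(sym, old_dist, new_dist, tsr_data, tsr_cyclic_data, sr, glb_comm);
      }
    }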
int ** compute_bucket_offsets(distribution const &old_dist, distribution const &new_dist, int const *len, int const *old_phys_edge_len, int const *old_virt_lda, int const *old_offsets, int *const *old_permutation, int const *new_phys_edge_len, int const *new_virt_lda, int forward, int old_virt_np, int new_virt_np, int const *old_virt_edge_len)
computes offsets for redistribution targets along each edge length
double est_time(double const *param)
estimates model time based on observations
void observe(double const *time_param)
records an observation consisting of the execution time and nparam parameter values
int calc_phase() const
compute the phase of a mapping
void read_loc_pairs(int order, int64_t nval, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *phase_rank, int64_t *nread, char const *data, char **pairs, algstrct const *sr)
read tensor pairs local to processor
virtual void copy(char *a, char const *b) const
copies element b to element a
virtual char const * addid() const
identity element for addition i.e. 0
#define DEBUG_PRINTF(...)
LinModel< 2 > blres_mdl(blres_mdl_init,"blres_mdl")
virtual char * alloc(int64_t n) const
allocate space for n items, necessary for object types
void padded_reshuffle(int const *sym, distribution const &old_dist, distribution const &new_dist, char *tsr_data, char **tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm)
Reshuffle elements using key-value pair read/write.
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction
Linear performance model which, given measurements, provides a new model guess.
int can_block_reshuffle(int order, int const *old_phase, mapping const *map)
determines if tensor can be permuted by block
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
void calc_cnt_displs(int const *sym, distribution const &old_dist, distribution const &new_dist, int new_nvirt, int np, int const *old_virt_edge_len, int const *new_virt_lda, int64_t *send_counts, int64_t *recv_counts, int64_t *send_displs, int64_t *recv_displs, CommData ord_glb_comm, int idx_lyr, int *const *bucket_offset)
computes the send and receive counts and displacements for the cyclic redistribution
bool should_observe(double const *time_param)
decides whether the current instance should be observed
void block_reshuffle(distribution const &old_dist, distribution const &new_dist, char *tsr_data, char *&tsr_cyclic_data, algstrct const *sr, CommData glb_comm)
Reshuffle elements by block given the global phases stay the same.
int el_size
size of each element of algstrct in bytes
int cdealloc(void *ptr)
free abstraction
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
double blres_est_time(int64_t tot_sz, int nv0, int nv1)
estimates execution time, given this processor sends and receives tot_sz across np procs ...
void wr_pairs_layout(int order, int np, int64_t inwrite, char const *alpha, char const *beta, char rw, int num_virt, int const *sym, int const *edge_len, int const *padding, int const *phase, int const *phys_phase, int const *virt_phase, int *virt_phys_rank, int const *bucket_lda, char *wr_pairs_buf, char *rw_data, CommData glb_comm, algstrct const *sr, bool is_sparse, int64_t nnz_loc, int64_t *nnz_blk, char *&pprs_new, int64_t &nnz_loc_new)
read or write pairs from / to tensor
virtual char const * mulid() const
identity element for multiplication i.e. 1
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
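For the special case of a fully symmetric, non-hollow (SY) tensor, the packed size computed by sy_packed_size reduces to the multiset count C(n+k-1, k) for order k and edge length n; an order-2 symmetric matrix with n = 4, for instance, stores 4*5/2 = 10 entries. An illustrative standalone computation (full_sy_packed_size is not CTF's routine):

    #include <cstdint>

    int64_t full_sy_packed_size(int order, int edge_len){
      int64_t sz = 1;
      for (int i = 0; i < order; i++)
        sz = (sz * (edge_len + i)) / (i + 1);  // running product stays integral
      return sz;                               // C(edge_len + order - 1, order)
    }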
virtual MPI_Datatype mdtype() const
MPI datatype.