8 #define MPI_Waitany(...) foMPI_Waitany(__VA_ARGS__) 17 int *
const * bucket_offset,
18 int const * rep_phase,
19 int64_t
const * counts,
20 int64_t
const * displs,
32 for (
int r=0; r<rep_phase[idim]; r++){
33 int rec_bucket_off = bucket_off+bucket_offset[idim][r];
34 int rec_pe_off = pe_off+pe_offset[idim][r];
35 isendrecv<idim-1>(pe_offset, bucket_offset, rep_phase, counts, displs, reqs, cm, buffer, sr, rec_bucket_off, rec_pe_off, dir);
41 (
int *
const * pe_offset,
42 int *
const * bucket_offset,
43 int const * rep_phase,
44 int64_t
const * counts,
45 int64_t
const * displs,
57 for (
int r=0; r<rep_phase[0]; r++){
58 int bucket = bucket_off+r;
59 int pe = pe_off+pe_offset[0][r];
62 foMPI_Notify_init(cm, pe,
MTAG, 1, reqs+bucket);
63 foMPI_Start(reqs+bucket);
66 MPI_Irecv(buffer+displs[bucket]*sr->el_size, counts[bucket], sr->mdtype(), pe,
MTAG, cm, reqs+bucket);
68 MPI_Isend(buffer+displs[bucket]*sr->el_size, counts[bucket], sr->mdtype(), pe,
MTAG, cm, reqs+bucket);
76 int64_t *
const * data_offset,
77 int *
const * ivmax_pre,
78 int const * rep_phase,
82 char * __restrict__ data,
83 char ** __restrict__ buckets,
89 int ivmax = ivmax_pre[idim][prev_idx];
90 for (
int iv=rep_idx[idim]; iv<=ivmax; iv+=rep_phase[idim]){
91 int64_t rec_data_off = data_off + data_offset[idim][iv];
92 redist_bucket_ror<idim-1>(bucket_offset, data_offset, ivmax_pre, rep_phase, rep_idx, virt_dim0, data_to_buckets, data, buckets, counts, sr, rec_data_off, bucket_off, iv);
98 (
int *
const * bucket_offset,
99 int64_t *
const * data_offset,
100 int *
const * ivmax_pre,
101 int const * rep_phase,
104 bool data_to_buckets,
105 char * __restrict__ data,
106 char ** __restrict__ buckets,
112 if (rep_idx[0] == -1)
113 redist_bucket<0>(bucket_offset, data_offset, ivmax_pre, rep_phase[0], virt_dim0, data_to_buckets, data, buckets, counts, sr, data_off, bucket_off, prev_idx);
115 CTF_int::redist_bucket_r0(bucket_offset, data_offset, ivmax_pre, rep_phase[0], rep_idx[0], virt_dim0, data_to_buckets, data, buckets, counts, sr, data_off, bucket_off, prev_idx);
121 int *
const * pe_offset,
122 int *
const * bucket_offset,
123 char *
const * __restrict__ buckets,
124 int64_t
const * counts,
126 int64_t
const * put_displs,
130 for (
int r=0; r<rep_phase[idim]; r++){
131 int rec_bucket_off = bucket_off+bucket_offset[idim][r];
132 int rec_pe_off = pe_off+pe_offset[idim][r];
133 put_buckets<idim-1>(rep_phase, pe_offset, bucket_offset, buckets, counts, sr, put_displs, win, rec_bucket_off, rec_pe_off);
139 int const * rep_phase,
140 int *
const * pe_offset,
141 int *
const * bucket_offset,
142 char *
const * __restrict__ buckets,
143 int64_t
const * counts,
145 int64_t
const * put_displs,
149 for (
int r=0; r<rep_phase[0]; r++){
150 int rec_pe_off = pe_off + pe_offset[0][r];
151 int rec_bucket_off = bucket_off + bucket_offset[0][r];
153 foMPI_Put_notify(buckets[rec_bucket_off], counts[rec_bucket_off], sr->
mdtype(), rec_pe_off, put_displs[rec_bucket_off], counts[rec_bucket_off], sr->
mdtype(), win,
MTAG);
155 MPI_Put(buckets[rec_bucket_off], counts[rec_bucket_off], sr->
mdtype(), rec_pe_off, put_displs[rec_bucket_off], counts[rec_bucket_off], sr->
mdtype(), win);
163 int *
const * pe_offset,
164 int *
const * bucket_offset,
165 int64_t *
const * data_offset,
166 int *
const * ivmax_pre,
167 int const * rep_phase,
175 int64_t
const * put_displs,
178 bool data_to_buckets,
179 char * __restrict__ data,
180 char ** __restrict__ buckets,
186 int tothi_rep_phase = 1;
187 for (
int id=1;
id<=idim;
id++){
188 tothi_rep_phase *= rep_phase[id];
190 #pragma omp parallel for 191 for (
int t=0; t<tothi_rep_phase; t++){
193 memcpy(rep_idx2, rep_idx,
sizeof(
int)*order);
195 int rec_bucket_off = bucket_off;
196 int rec_pe_off = pe_off;
198 for (
int id=1;
id<=idim;
id++){
199 int r = tleft%rep_phase[id];
200 tleft = tleft / rep_phase[id];
202 rec_bucket_off += bucket_offset[id][r];
203 rec_pe_off += pe_offset[id][r];
205 redist_bucket_isr<0>(order, pe_offset, bucket_offset, data_offset, ivmax_pre, rep_phase, rep_idx2, virt_dim0,
212 data_to_buckets, data, buckets, counts, sr, rec_bucket_off, rec_pe_off);
216 if (rep_phase[idim] == 1){
217 int rec_bucket_off = bucket_off + bucket_offset[idim][0];
218 int rec_pe_off = pe_off + pe_offset[idim][0];
219 redist_bucket_isr<idim-1>(order, pe_offset, bucket_offset, data_offset, ivmax_pre, rep_phase, rep_idx, virt_dim0,
226 data_to_buckets, data, buckets, counts, sr, rec_bucket_off, rec_pe_off);
228 for (
int r=0; r<rep_phase[idim]; r++){
230 memcpy(rep_idx2, rep_idx,
sizeof(
int)*order);
233 int rec_bucket_off = bucket_off + bucket_offset[idim][r];
234 int rec_pe_off = pe_off + pe_offset[idim][r];
235 redist_bucket_isr<idim-1>(order, pe_offset, bucket_offset, data_offset, ivmax_pre, rep_phase, rep_idx2, virt_dim0,
242 data_to_buckets, data, buckets, counts, sr, rec_bucket_off, rec_pe_off);
252 int *
const * pe_offset,
253 int *
const * bucket_offset,
254 int64_t *
const * data_offset,
255 int *
const * ivmax_pre,
256 int const * rep_phase,
264 int64_t
const * put_displs,
267 bool data_to_buckets,
268 char * __restrict__ data,
269 char ** __restrict__ buckets,
276 if (!data_to_buckets){
277 MPI_Waitall(rep_phase[0], rep_reqs+bucket_off, MPI_STATUSES_IGNORE);
281 SWITCH_ORD_CALL(
redist_bucket_ror, order-1, bucket_offset, data_offset, ivmax_pre, rep_phase, rep_idx, virt_dim0, data_to_buckets, data, buckets, counts, sr, 0, bucket_off, 0)
282 if (data_to_buckets){
285 for (
int r=0; r<rep_phase[0]; r++){
286 int bucket = bucket_off + bucket_offset[0][r];
287 int pe = pe_off + pe_offset[0][r];
288 MPI_Isend(buckets[bucket], counts[bucket], sr->
mdtype(), pe,
MTAG, cm, rep_reqs+bucket);
293 MPI_Testall(bucket_off, rep_reqs, &flag, MPI_STATUSES_IGNORE);
298 put_buckets<0>(rep_phase, pe_offset, bucket_offset, buckets, counts, sr, put_displs, win, bucket_off, pe_off);
305 int const * edge_len,
308 char ** ptr_tsr_data,
309 char ** ptr_tsr_new_data,
312 int order = old_dist.
order;
314 char * tsr_data = *ptr_tsr_data;
315 char * tsr_new_data = *ptr_tsr_new_data;
318 tsr_new_data = sr->
alloc(1);
319 if (ord_glb_comm.
rank == 0){
320 sr->
copy(tsr_new_data, tsr_data);
324 *ptr_tsr_new_data = tsr_new_data;
329 MPI_Barrier(ord_glb_comm.
cm);
332 double st_time = MPI_Wtime();
334 int * old_virt_lda, * new_virt_lda;
335 alloc_ptr(order*
sizeof(
int), (
void**)&old_virt_lda);
336 alloc_ptr(order*
sizeof(
int), (
void**)&new_virt_lda);
341 int old_idx_lyr = ord_glb_comm.
rank - old_dist.
perank[0]*old_dist.
pe_lda[0];
342 int new_idx_lyr = ord_glb_comm.
rank - new_dist.
perank[0]*new_dist.
pe_lda[0];
344 for (
int i=1; i<order; i++) {
345 new_virt_lda[i] = new_nvirt;
346 old_virt_lda[i] = old_nvirt;
352 int64_t old_virt_nelem = old_dist.
size/old_nvirt;
353 int64_t new_virt_nelem = new_dist.
size/new_nvirt;
355 int *old_phys_edge_len;
alloc_ptr(
sizeof(
int)*order, (
void**)&old_phys_edge_len);
359 int *new_phys_edge_len;
alloc_ptr(
sizeof(
int)*order, (
void**)&new_phys_edge_len);
363 int *old_virt_edge_len;
alloc_ptr(
sizeof(
int)*order, (
void**)&old_virt_edge_len);
367 int *new_virt_edge_len;
alloc_ptr(
sizeof(
int)*order, (
void**)&new_virt_edge_len);
372 int * old_rep_phase;
alloc_ptr(
sizeof(
int)*order, (
void**)&old_rep_phase);
373 for (
int i=0; i<order; i++){
375 nold_rep *= old_rep_phase[i];
379 int * new_rep_phase;
alloc_ptr(
sizeof(
int)*order, (
void**)&new_rep_phase);
380 for (
int i=0; i<order; i++){
382 nnew_rep *= new_rep_phase[i];
385 int64_t * send_counts = (int64_t*)
alloc(
sizeof(int64_t)*nold_rep);
386 std::fill(send_counts, send_counts+nold_rep, 0);
387 calc_drv_displs(sym, edge_len, old_dist, new_dist, send_counts, old_idx_lyr);
389 int64_t * recv_counts = (int64_t*)
alloc(
sizeof(int64_t)*nnew_rep);
390 std::fill(recv_counts, recv_counts+nnew_rep, 0);
391 calc_drv_displs(sym, edge_len, new_dist, old_dist, recv_counts, new_idx_lyr);
392 int64_t * recv_displs = (int64_t*)
alloc(
sizeof(int64_t)*nnew_rep);
399 for (
int i=0; i<nnew_rep; i++){
403 recv_displs[i] = recv_displs[i-1] + recv_counts[i-1];
406 int ** recv_bucket_offset;
alloc_ptr(
sizeof(
int*)*order, (
void**)&recv_bucket_offset);
407 int ** recv_pe_offset;
alloc_ptr(
sizeof(
int*)*order, (
void**)&recv_pe_offset);
408 int ** recv_ivmax_pre;
alloc_ptr(
sizeof(
int*)*order, (
void**)&recv_ivmax_pre);
409 int64_t ** recv_data_offset;
alloc_ptr(
sizeof(int64_t*)*order, (
void**)&recv_data_offset);
410 precompute_offsets(new_dist, old_dist, sym, edge_len, new_rep_phase, new_phys_edge_len, new_virt_edge_len, new_dist.
virt_phase, new_virt_lda, new_virt_nelem, recv_pe_offset, recv_bucket_offset, recv_data_offset, recv_ivmax_pre);
412 int ** send_bucket_offset;
alloc_ptr(
sizeof(
int*)*order, (
void**)&send_bucket_offset);
413 int ** send_pe_offset;
alloc_ptr(
sizeof(
int*)*order, (
void**)&send_pe_offset);
414 int ** send_ivmax_pre;
alloc_ptr(
sizeof(
int*)*order, (
void**)&send_ivmax_pre);
415 int64_t ** send_data_offset;
alloc_ptr(
sizeof(int64_t*)*order, (
void**)&send_data_offset);
417 precompute_offsets(old_dist, new_dist, sym, edge_len, old_rep_phase, old_phys_edge_len, old_virt_edge_len, old_dist.
virt_phase, old_virt_lda, old_virt_nelem, send_pe_offset, send_bucket_offset, send_data_offset, send_ivmax_pre);
419 #if !defined(IREDIST) && !defined(PUTREDIST) 420 int64_t * send_displs = (int64_t*)
alloc(
sizeof(int64_t)*nold_rep);
422 for (
int i=1; i<nold_rep; i++){
423 send_displs[i] = send_displs[i-1] + send_counts[i-1];
425 #elif defined(PUTREDIST) 426 int64_t * all_recv_displs = (int64_t*)
alloc(
sizeof(int64_t)*ord_glb_comm.
np);
429 int64_t * all_put_displs = (int64_t*)
alloc(
sizeof(int64_t)*ord_glb_comm.
np);
430 MPI_Alltoall(all_recv_displs, 1, MPI_INT64_T, all_put_displs, 1, MPI_INT64_T, ord_glb_comm.
cm);
433 int64_t * put_displs = (int64_t*)
alloc(
sizeof(int64_t)*nold_rep);
443 int suc = MPI_Win_create(recv_buffer, new_dist.
size*sr->
el_size, sr->
el_size, MPI_INFO_NULL, ord_glb_comm.
cm, &win);
444 ASSERT(suc == MPI_SUCCESS);
446 MPI_Win_fence(0, win);
452 if (new_idx_lyr == 0)
453 SWITCH_ORD_CALL(
isendrecv, order-1, recv_pe_offset, recv_bucket_offset, new_rep_phase, recv_counts, recv_displs, recv_reqs, win, recv_buffer, sr, 0, 0, 1);
457 if (new_idx_lyr == 0)
458 SWITCH_ORD_CALL(
isendrecv, order-1, recv_pe_offset, recv_bucket_offset, new_rep_phase, recv_counts, recv_displs, recv_reqs, ord_glb_comm.
cm, recv_buffer, sr, 0, 0, 1);
463 if (old_idx_lyr == 0){
464 char * aux_buf = sr->
alloc(old_dist.
size);
465 char * tmp = aux_buf;
468 char ** buckets = (
char**)
alloc(
sizeof(
char**)*nold_rep);
470 buckets[0] = tsr_data;
471 for (
int i=1; i<nold_rep; i++){
472 buckets[i] = buckets[i-1] + sr->
el_size*send_counts[i-1];
475 int64_t save_counts[nold_rep];
476 memcpy(save_counts, send_counts,
sizeof(int64_t)*nold_rep);
478 std::fill(send_counts, send_counts+nold_rep, 0);
481 int * old_rep_idx;
alloc_ptr(
sizeof(
int)*order, (
void**)&old_rep_idx);
482 memset(old_rep_idx, 0,
sizeof(
int)*order);
485 send_ivmax_pre, old_rep_phase, old_rep_idx, old_dist.
virt_phase[0],
487 send_reqs, ord_glb_comm.
cm,
492 1, aux_buf, buckets, send_counts, sr);
496 send_ivmax_pre, old_rep_phase[0], old_dist.
virt_phase[0], 1, aux_buf, buckets, send_counts, sr);
503 for (
int i=0; i<nold_rep; i++){
504 if (save_counts[i] != send_counts[i]) pass =
false;
507 for (
int i=0; i<nold_rep; i++){
508 printf(
"[%d] send_counts[%d] = %ld, redist_bucket counts[%d] = %ld\n", ord_glb_comm.
rank, i, save_counts[i], i, send_counts[i]);
518 char * recv_buffer = sr->
alloc(new_dist.
size);
525 if (new_idx_lyr == 0){
527 SWITCH_ORD_CALL(
isendrecv, order-1, recv_pe_offset, recv_bucket_offset, new_rep_phase, recv_counts, recv_displs, reqs, ord_glb_comm.
cm, recv_buffer, sr, 0, 0, 1);
530 if (old_idx_lyr == 0){
532 SWITCH_ORD_CALL(
isendrecv, order-1, send_pe_offset, send_bucket_offset, old_rep_phase, send_counts, send_displs, reqs+nrecv, ord_glb_comm.
cm, tsr_data, sr, 0, 0, 0);
534 if (nrecv+nsent > 0){
536 MPI_Waitall(nrecv+nsent, reqs, MPI_STATUSES_IGNORE);
546 MPI_Win_fence(0, win);
555 if (new_idx_lyr == 0){
556 char * aux_buf = sr->
alloc(new_dist.
size);
559 char ** buckets = (
char**)
alloc(
sizeof(
char**)*nnew_rep);
561 buckets[0] = recv_buffer;
563 for (
int i=1; i<nnew_rep; i++){
564 buckets[i] = buckets[i-1] + sr->
el_size*recv_counts[i-1];
569 int64_t save_counts[nnew_rep];
570 memcpy(save_counts, recv_counts,
sizeof(int64_t)*nnew_rep);
572 std::fill(recv_counts, recv_counts+nnew_rep, 0);
577 for (
int nb=0; nb<nnew_rep; nb++){
586 MPI_Waitany(nnew_rep, recv_reqs, &bucket_off, &stat);
589 ASSERT(bucket_off != MPI_UNDEFINED);
590 ASSERT(bucket_off >= 0 && bucket_off <nnew_rep);
591 ASSERT(recv_counts[bucket_off] == 0);
595 int iboff=bucket_off;
596 for (
int i=0; i<order; i++){
607 rep_idx[i] = iboff%new_rep_phase[i];
608 iboff = iboff/new_rep_phase[i];
611 SWITCH_ORD_CALL(
redist_bucket_ror, order-1, recv_bucket_offset, recv_data_offset, recv_ivmax_pre, new_rep_phase, rep_idx, new_dist.
virt_phase[0], 0, aux_buf, buckets, recv_counts, sr, 0, bucket_off, 0)
616 for (
int nb=0; nb<nnew_rep; nb++){
617 foMPI_Request_free(recv_reqs+nb);
622 int * new_rep_idx;
alloc_ptr(
sizeof(
int)*order, (
void**)&new_rep_idx);
623 memset(new_rep_idx, 0,
sizeof(
int)*order);
626 recv_ivmax_pre, new_rep_phase, new_rep_idx, new_dist.
virt_phase[0],
628 recv_reqs, ord_glb_comm.
cm,
633 0, aux_buf, buckets, recv_counts, sr);
637 recv_bucket_offset, recv_data_offset, recv_ivmax_pre,
638 new_rep_phase[0], new_dist.
virt_phase[0], 0, aux_buf, buckets, recv_counts, sr);
645 for (
int i=0; i<nnew_rep; i++){
646 if (save_counts[i] != recv_counts[i]) pass =
false;
649 for (
int i=0; i<nnew_rep; i++){
650 printf(
"[%d] recv_counts[%d] = %ld, redist_bucket counts[%d] = %ld\n", ord_glb_comm.
rank, i, save_counts[i], i, recv_counts[i]);
655 *ptr_tsr_new_data = aux_buf;
658 if (sr->
addid() != NULL)
660 *ptr_tsr_new_data = recv_buffer;
668 for (
int i=0; i<order; i++){
679 for (
int i=0; i<order; i++){
702 foMPI_Win_flush_all(win);
703 foMPI_Win_free(&win);
706 MPI_Barrier(ord_glb_comm.
cm);
707 TAU_FSTOP(barrier_after_dgtog_reshuffle);
712 MPI_Barrier(ord_glb_comm.
cm);
714 double exe_time = MPI_Wtime()-st_time;
715 double tps[] = {exe_time, 1.0, (double)log2(ord_glb_comm.
np), (double)std::max(old_dist.
size, new_dist.
size)*log2(ord_glb_comm.
np)*sr->
el_size};
void redist_bucket(int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int rep_phase0, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int64_t data_off, int bucket_off, int prev_idx)
virtual void init(int64_t n, char *arr) const
initialize n objects to zero
void redist_bucket_ror< 0 >(int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int const *rep_phase, int const *rep_idx, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int64_t data_off, int bucket_off, int prev_idx)
virtual void copy(char *a, char const *b) const
copies element b to element a
void * alloc(int64_t len)
allocation abstraction: returns a buffer of len bytes
void redist_bucket_isr< 0 >(int order, int *const *pe_offset, int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int const *rep_phase, int *rep_idx, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int bucket_off, int pe_off)
void dgtog_reshuffle(int const *sym, int const *edge_len, distribution const &old_dist, distribution const &new_dist, char **ptr_tsr_data, char **ptr_tsr_new_data, algstrct const *sr, CommData ord_glb_comm)
void isendrecv< 0 >(int *const *pe_offset, int *const *bucket_offset, int const *rep_phase, int64_t const *counts, int64_t const *displs, CTF_Request *reqs, MPI_Comm cm, char *buffer, algstrct const *sr, int bucket_off, int pe_off, int dir)
virtual char const * addid() const
identity element for addition (returned as a pointer to an element; callers check it against NULL)
virtual void dealloc(char *ptr) const
deallocate given pointer containing contiguous array of values
void calc_drv_displs(int const *sym, int const *edge_len, distribution const &old_dist, distribution const &new_dist, int64_t *counts, int idx_lyr)
virtual char * alloc(int64_t n) const
allocate space for n items, necessary for object types
void redist_bucket< 0 >(int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int rep_phase0, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int64_t data_off, int bucket_off, int prev_idx)
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
int alloc_ptr(int64_t len, void **const ptr)
allocation abstraction: allocates len bytes and stores the resulting pointer in *ptr
#define SWITCH_ORD_CALL(F, act_ord,...)
void redist_bucket_ror(int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int const *rep_phase, int const *rep_idx, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int64_t data_off=0, int bucket_off=0, int prev_idx=0)
void isendrecv(int *const *pe_offset, int *const *bucket_offset, int const *rep_phase, int64_t const *counts, int64_t const *displs, CTF_Request *reqs, MPI_Comm cm, char *buffer, algstrct const *sr, int bucket_off, int pe_off, int dir)
void redist_bucket_isr(int order, int *const *pe_offset, int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int const *rep_phase, int *rep_idx, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int bucket_off=0, int pe_off=0)
void put_buckets< 0 >(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, char *const *__restrict__ buckets, int64_t const *counts, algstrct const *sr, int64_t const *put_displs, CTF_Win &win, int bucket_off, int pe_off)
int el_size
size of each element of algstrct in bytes
int cdealloc(void *ptr)
free abstraction
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
LinModel< 3 > dgtog_res_mdl(dgtog_res_mdl_init,"dgtog_res_mdl")
void precompute_offsets(distribution const &old_dist, distribution const &new_dist, int const *sym, int const *len, int const *rep_phase, int const *phys_edge_len, int const *virt_edge_len, int const *virt_dim, int const *virt_lda, int64_t virt_nelem, int **pe_offset, int **bucket_offset, int64_t **data_offset, int **ivmax_pre)
void calc_cnt_from_rep_cnt(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, int64_t const *old_counts, int64_t *counts, int bucket_off, int pe_off, int dir)
void put_buckets(int const *rep_phase, int *const *pe_offset, int *const *bucket_offset, char *const *__restrict__ buckets, int64_t const *counts, algstrct const *sr, int64_t const *put_displs, CTF_Win &win, int bucket_off, int pe_off)
void redist_bucket_r0(int *const *bucket_offset, int64_t *const *data_offset, int *const *ivmax_pre, int rep_phase0, int rep_idx0, int virt_dim0, bool data_to_buckets, char *__restrict__ data, char **__restrict__ buckets, int64_t *counts, algstrct const *sr, int64_t data_off, int bucket_off, int prev_idx)
virtual MPI_Datatype mdtype() const
MPI datatype.