#include "../shared/util.h"

                  int const *   old_phys_dim,
                  int const *   old_phys_edge_len,
                  int const *   old_virt_edge_len,
                  int64_t       old_virt_nelem,
                  int const *   old_offsets,
                  int * const * old_permutation,
                  int const *   new_phys_dim,
                  int const *   new_phys_edge_len,
                  int const *   new_virt_edge_len,
                  int64_t       new_virt_nelem,
                  int * const * bucket_offset,

    if (old_dist.order == 0){
      sr->copy(new_data[0], old_data);
      sr->copy(old_data, new_data[0]);
      sr->acc(old_data, beta, new_data[0], alpha);
    int nbucket = total_np;

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int max_ntd = omp_get_max_threads();
    max_ntd = MAX(1,MIN(max_ntd,new_virt_nelem/nbucket));

    int64_t old_size, new_size;

    int64_t * bucket_store;
    int64_t * count_store;
    int64_t * thread_store;
    mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&bucket_store);
    mst_alloc_ptr(sizeof(int64_t)*MAX(old_size,new_size), (void**)&thread_store);
    std::fill(bucket_store, bucket_store+MAX(old_size,new_size), -1);

    int64_t ** par_virt_counts;
    alloc_ptr(sizeof(int64_t*)*max_ntd, (void**)&par_virt_counts);
    for (int t=0; t<max_ntd; t++){
      mst_alloc_ptr(sizeof(int64_t)*nbucket, (void**)&par_virt_counts[t]);
      std::fill(par_virt_counts[t], par_virt_counts[t]+nbucket, 0);
    #pragma omp parallel num_threads(max_ntd)

    if (old_offsets == NULL)

    int tid = omp_get_thread_num();
    int ntd = omp_get_num_threads();

    int gidx_st[old_dist.order];
    int gidx_end[old_dist.order];
    if (old_dist.order > 1){

      int64_t chnk = all_size/ntd;
      int64_t glb_idx_st  = chnk*tid + MIN(tid,all_size%ntd);
      int64_t glb_idx_end = glb_idx_st+chnk+(tid<(all_size%ntd));

      if (gidx_end[old_dist.order-1] != len[old_dist.order-1]){
        printf("glb_idx_end = %ld, gidx_end[%d]= %d, len[%d] = %d\n",
               glb_idx_end, dim, gidx_end[dim], dim, len[dim]);

      gidx_end[0] = ends[0];
    int64_t * count = par_virt_counts[tid];

    int64_t * count; alloc_ptr(sizeof(int64_t)*nbucket, (void**)&count);
    memset(count, 0, sizeof(int64_t)*nbucket);

    memset(gidx, 0, sizeof(int)*old_dist.order);

    int64_t * virt_offset; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&virt_offset);
    memset(virt_offset, 0, sizeof(int64_t)*old_dist.order);

    memset(idx, 0, sizeof(int)*old_dist.order);

    int64_t * virt_acc; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&virt_acc);
    memset(virt_acc, 0, sizeof(int64_t)*old_dist.order);

    int64_t * idx_acc; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&idx_acc);
    memset(idx_acc, 0, sizeof(int64_t)*old_dist.order);

    int64_t * old_virt_lda; alloc_ptr(sizeof(int64_t)*old_dist.order, (void**)&old_virt_lda);
    old_virt_lda[0] = old_virt_nelem;

    int64_t zero_len_toff = 0;
      int64_t ist = iist/old_dist.phase[dim];

      int plen[old_dist.order];
      memcpy(plen, old_virt_edge_len, old_dist.order*sizeof(int));

      } while (idim >= 0 && sym[idim] != NS);

      offset += idx_acc[dim];

      if (gidx[dim] > gidx_st[dim]) break;

      int64_t vst = iist-ist*old_dist.phase[dim];

      virt_offset[dim] = vst;
      offset += vst*old_virt_lda[dim];

      if (gidx[dim] > gidx_st[dim]) break;
    ASSERT(old_permutation == NULL);
    int rep_phase0 = lcm(old_phys_dim[0],new_phys_dim[0])/old_phys_dim[0];

    printf("rep_phase0 = %d\n", rep_phase0);
    for (int id=0; id<rep_phase0; id++){
      for (int jd=0; jd<(old_phys_edge_len[0]-id)/rep_phase0; jd++){
        printf("bucket_offset[%d] = %d\n",
               id+jd*rep_phase0, bucket_offset[0][id+jd*rep_phase0]);
        ASSERT(bucket_offset[0][id+jd*rep_phase0] == bucket_offset[0][id] ||
               bucket_offset[0][id+jd*rep_phase0] == -1);
    bool outside0 = false;
    int len_zero_max = ends[0];

    bool is_at_end = true;
    bool is_at_start = true;

    if (gidx[dim] > gidx_st[dim]){

    if (gidx[dim] < gidx_st[dim]){

    zero_len_toff = gidx_st[0];

    if (gidx_end[dim] < gidx[dim]){

    if (gidx_end[dim] > gidx[dim]){

    len_zero_max = MIN(ends[0], gidx_end[0]);

    int idx_max = (sym[0] == NS ? old_virt_edge_len[0] : idx[1]+1);

    int gidx_min = MAX(zero_len_toff, offs[0]);
    int gidx_max = (sym[0] == NS ? ends[0] : (sym[0] == SY ? gidx[1]+1 : gidx[1]));
    gidx_max = MIN(gidx_max, len_zero_max);

    int idx0  = MAX(0,(gidx_min-gidx[0])/old_phys_dim[0]);
    int idx1  = MAX(0,(gidx_max-gidx[0]+old_phys_dim[0]-1)/old_phys_dim[0]);
    int lencp = MIN(rep_phase0, idx1-idx0);

    for (int ia=0; ia<lencp; ia++){
      int64_t bucket = bucket0+bucket_offset[0][idx0];
      sr->copy((idx1-idx0+rep_phase0-1)/rep_phase0,
               old_data + sr->el_size*(offset+idx0), rep_phase0,
               new_data[bucket] + sr->el_size*count[bucket], 1);
      count[bucket] += (idx1-idx0+rep_phase0-1)/rep_phase0;

    for (int ia=0; ia<lencp; ia++){
      int64_t bucket = bucket0+bucket_offset[0][idx0];
      sr->copy((idx1-idx0+rep_phase0-1)/rep_phase0,
               new_data[bucket] + sr->el_size*count[bucket], 1,
               old_data + sr->el_size*(offset+idx0), rep_phase0);
      count[bucket] += (idx1-idx0+rep_phase0-1)/rep_phase0;
    virt_offset[dim] += 1;

    virt_offset[dim] = 0;

    if (idx[dim] == (sym[dim] == NS ? old_virt_edge_len[dim] : idx[dim+1]+1)){

    if (dim == old_dist.order-1) done = true;

    if (old_dist.order <= 1) done = true;

    for (int i = 0; i < nbucket-1; i++){
      if (count[i] != (int64_t)((new_data[i+1]-new_data[i])/sr->el_size)){
        printf("rank = %d count %d should have been %d is %ld\n",
               rank, i, (int)((new_data[i+1]-new_data[i])/sr->el_size), count[i]);

    par_virt_counts[tid] = count;

    for (int bckt=0; bckt<nbucket; bckt++){
      for (int thread=0; thread<max_ntd; thread++){
        par_tmp += par_virt_counts[thread][bckt];
        par_virt_counts[thread][bckt] = par_tmp - par_virt_counts[thread][bckt];

      if (bckt < nbucket-1 && par_tmp != (new_data[bckt+1]-new_data[bckt])/sr->el_size){
        printf("rank = %d count for bucket %d is %d should have been %ld\n",
               rank, bckt, par_tmp, (int64_t)(new_data[bckt+1]-new_data[bckt])/sr->el_size);
    int64_t tot_sz = MAX(old_size, new_size);

    #pragma omp parallel for private(i)
    for (i=0; i<tot_sz; i++){
      if (bucket_store[i] != -1){
        int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
        int64_t ct = count_store[i]+pc;

    #pragma omp parallel for private(i)
    for (i=0; i<tot_sz; i++){
      if (bucket_store[i] != -1){
        int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
        int64_t ct = count_store[i]+pc;

    #pragma omp parallel for private(i)
    for (i=0; i<tot_sz; i++){
      if (bucket_store[i] != -1){
        int64_t pc = par_virt_counts[thread_store[i]][bucket_store[i]];
        int64_t ct = count_store[i]+pc;
        sr->acc(old_data+i*sr->el_size, beta, new_data[bucket_store[i]]+ct*sr->el_size, alpha);

    for (int t=0; t<max_ntd; t++){
  int64_t sy_packed_offset(int dim, int const * len, int idx, int const * sym){
    if (idx == 0) return 0;
    if (sym[dim-1] == NS){

    int64_t offset = iidx;

    } while (i<=dim && sym[dim-i] != NS);
               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

               int64_t      glb_ord_offset=0,
               int64_t      blk_ord_offset=0){
    int imax = virt_edge_len[idim];
    if (sym[idim] != NS) imax = prev_idx+1;
    int vp_stride = virt_phase_lda[idim]*dist.virt_phase[idim];
    for (int i=0; i<imax; i++){
      int64_t dim_offset       = sy_packed_offset(idim, virt_edge_len, i, sym);
      int64_t i_blk_ord_offset = blk_ord_offset + dim_offset;
      int64_t i_glb_ord_offset = glb_ord_offset + dim_offset*vp_stride;

      int64_t iv_blk_ord_offset = i_blk_ord_offset + v*virt_phase_lda[idim]*vbs;
      int64_t iv_glb_ord_offset = i_glb_ord_offset;

      int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, i+1, sym);
      iv_glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*v;

      ord_glb<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, i, iv_glb_ord_offset, iv_blk_ord_offset);

               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

               int64_t      glb_ord_offset,
               int64_t      blk_ord_offset){
    int imax = virt_edge_len[0];
    if (sym[0] != NS) imax = prev_idx+1;
    for (int v=0; v<dist.virt_phase[0]; v++){

      sr->copy(imax, tsr_data_in  + sr->el_size*(blk_ord_offset+v*vbs), 1,
                     tsr_data_out + sr->el_size*(glb_ord_offset+v), dist.virt_phase[0]);

      sr->copy(imax, tsr_data_in  + sr->el_size*(glb_ord_offset+v), dist.virt_phase[0],
                     tsr_data_out + sr->el_size*(blk_ord_offset+v*vbs), 1);

               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

               int64_t      glb_ord_offset,
               int64_t      blk_ord_offset);
               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

               int64_t      glb_ord_offset=0,
               int64_t      blk_ord_offset=0){
    int imax = virt_edge_len[idim];
    if (sym[idim] != NS) imax = prev_idx+1;

    imax = MIN(imax, idx_end[idim]+1);

    int vp_stride = virt_phase_lda[idim]*dist.virt_phase[idim];

    for (int i=ist; i<imax; i++){
      int64_t dim_offset       = sy_packed_offset(idim, virt_edge_len, i, sym);
      int64_t i_blk_ord_offset = blk_ord_offset + dim_offset;
      int64_t i_glb_ord_offset = glb_ord_offset + dim_offset*vp_stride;

      int64_t iv_blk_ord_offset = i_blk_ord_offset + v*virt_phase_lda[idim]*vbs;
      int64_t iv_glb_ord_offset = i_glb_ord_offset;

      int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, i+1, sym);
      iv_glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*v;

      if (i==ist && i==imax-1)
        ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, idx_st, idx_end, i, iv_glb_ord_offset, iv_blk_ord_offset);
        ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, idx_st, NULL, i, iv_glb_ord_offset, iv_blk_ord_offset);
        ord_glb_omp<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, NULL, idx_end, i, iv_glb_ord_offset, iv_blk_ord_offset);
        ord_glb<idim-1>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, i, iv_glb_ord_offset, iv_blk_ord_offset);

               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

               int64_t      glb_ord_offset,
               int64_t      blk_ord_offset){
    ord_glb<0>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr, prev_idx, glb_ord_offset, blk_ord_offset);

               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

               int64_t      glb_ord_offset,
               int64_t      blk_ord_offset);
               int const *  virt_edge_len,
               int const *  virt_phase_lda,

               char const * tsr_data_in,

    if (dist.order == 1){
      return ord_glb<0>(sym, dist, virt_edge_len, virt_phase_lda, vbs, dir, tsr_data_in, tsr_data_out, sr);

    if (dist.order <= 8){

    int tid = omp_get_thread_num();
    int ntd = omp_get_num_threads();
    int64_t vbs_chunk = vbs/ntd;
    int64_t fidx_st   = vbs_chunk*tid + MIN(tid, vbs%ntd);

    int64_t fidx_end = fidx_st + vbs_chunk;
    if (tid < vbs%ntd) fidx_end++;
    int * idx_st  = (int*)alloc(dist.order*sizeof(int));
    int * idx_end = (int*)alloc(dist.order*sizeof(int));

    if (idx_end[idim] < 0 && idim+1<dist.order){
      idx_end[idim] = virt_edge_len[idim]-1;

    #define CASE_ORD_GLB(n) \
        ord_glb_omp<n-1>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr,idx_st,idx_end); \

    #define CASE_ORD_GLB(n) \
        ord_glb<n-1>(sym,dist,virt_edge_len,virt_phase_lda,vbs,dir,tsr_data_in,tsr_data_out,sr); \

    int order = dist.order;
    int * virt_idx = (int*)alloc(order*sizeof(int));
    int * idx      = (int*)alloc(order*sizeof(int));

    std::fill(virt_idx, virt_idx+order, 0);

    std::fill(idx, idx+order, 0);

    int64_t glb_ord_offset = virt_idx[0];
    int64_t blk_ord_offset = virt_idx[0]*vbs;
    for (int idim=1; idim<order; idim++){

      int64_t dim_offset = sy_packed_offset(idim, virt_edge_len, idx[idim], sym);

      blk_ord_offset += dim_offset;
      blk_ord_offset += virt_idx[idim]*virt_phase_lda[idim]*vbs;

      glb_ord_offset += dim_offset*virt_phase_lda[idim]*dist.virt_phase[idim];

      if (virt_idx[idim] > 0){
        int64_t glb_vrt_offset = sy_packed_offset(idim, virt_edge_len, idx[idim]+1, sym);
        glb_ord_offset += (glb_vrt_offset-dim_offset)*virt_phase_lda[idim]*virt_idx[idim];

    int n = virt_edge_len[0];
    if (sym[0] != NS) n = idx[1]+1;

    bool exit, finish=false;

    if (idx[dim] == virt_edge_len[dim]-1 || (sym[dim] != NS && idx[dim] == idx[dim+1])){

    bool exit, finish=false;
               int const *   old_offsets,
               int * const * old_permutation,

               int const *   new_offsets,
               int * const * new_permutation,
               char **       ptr_tsr_data,
               char **       ptr_tsr_cyclic_data,

    int i, np, old_nvirt, new_nvirt, old_np, new_np, idx_lyr;
    int64_t vbs_old, vbs_new;

    int64_t * send_counts, * recv_counts;

    int64_t * send_displs;
    int64_t * recv_displs;
    int * new_virt_lda, * old_virt_lda;
    int * old_sub_edge_len, * new_sub_edge_len;
    int order = old_dist.order;

    char * tsr_data        = *ptr_tsr_data;
    char * tsr_cyclic_data = *ptr_tsr_cyclic_data;

    bool is_copy = false;

    if (ord_glb_comm.rank == 0){

    sr->copy(tsr_cyclic_data, tsr_data);

    sr->acc(tsr_cyclic_data, beta, tsr_data, alpha);

    *ptr_tsr_cyclic_data = tsr_cyclic_data;
    return tsr_cyclic_data;
    np = ord_glb_comm.np;

    alloc_ptr(order*sizeof(int),     (void**)&hsym);
    alloc_ptr(order*sizeof(int),     (void**)&idx);
    alloc_ptr(order*sizeof(int64_t), (void**)&idx_offs);
    alloc_ptr(order*sizeof(int),     (void**)&old_virt_lda);
    alloc_ptr(order*sizeof(int),     (void**)&new_virt_lda);

    idx_lyr = ord_glb_comm.rank;
    for (i=0; i<order; i++) {
      new_virt_lda[i] = new_nvirt;
      old_virt_lda[i] = old_nvirt;

    vbs_old = old_dist.size/old_nvirt;

    alloc_ptr(order*sizeof(int), (void**)&old_sub_edge_len);
    alloc_ptr(order*sizeof(int), (void**)&new_sub_edge_len);
    int ** bucket_offset;

    int * real_edge_len; alloc_ptr(sizeof(int)*order, (void**)&real_edge_len);

    int * old_phys_edge_len; alloc_ptr(sizeof(int)*order, (void**)&old_phys_edge_len);
    for (int dim = 0; dim < order; dim++)
      old_phys_edge_len[dim] = (real_edge_len[dim]+old_dist.padding[dim])/old_dist.phys_phase[dim];

    int * new_phys_edge_len; alloc_ptr(sizeof(int)*order, (void**)&new_phys_edge_len);
    for (int dim = 0; dim < order; dim++)
      new_phys_edge_len[dim] = (real_edge_len[dim]+new_dist.padding[dim])/new_dist.phys_phase[dim];

    int * old_virt_edge_len; alloc_ptr(sizeof(int)*order, (void**)&old_virt_edge_len);
    for (int dim = 0; dim < order; dim++)
      old_virt_edge_len[dim] = old_phys_edge_len[dim]/old_dist.virt_phase[dim];

    int * new_virt_edge_len; alloc_ptr(sizeof(int)*order, (void**)&new_virt_edge_len);
    for (int dim = 0; dim < order; dim++)
      new_virt_edge_len[dim] = new_phys_edge_len[dim]/new_dist.virt_phase[dim];
    for (i=0; i<order; i++){

    for (i=0; i<order; i++){
      new_sub_edge_len[i] = new_sub_edge_len[i] / new_dist.phase[i];
      old_sub_edge_len[i] = old_sub_edge_len[i] / old_dist.phase[i];

    for (i=1; i<order; i++){

    swp_nval = new_nvirt*sy_packed_size(order, new_sub_edge_len, sym);
    vbs_new  = swp_nval/new_nvirt;

    char * send_buffer, * recv_buffer;

    recv_buffer = tsr_cyclic_data;

    order_globally(sym, old_dist, old_virt_edge_len, old_virt_lda, vbs_old, 1, tsr_data, tsr_cyclic_data, sr);
    char ** new_data; alloc_ptr(sizeof(char*)*np, (void**)&new_data);
    for (int64_t p = 0; p < np; p++){
      new_data[p] = tsr_data+sr->el_size*send_displs[p];

    for (int dim = 0; dim < order; dim++){

    recv_buffer = tsr_cyclic_data;
    send_buffer = tsr_data;

    ord_glb_comm.all_to_allv(send_buffer, send_counts, send_displs, sr->el_size,
                             recv_buffer, recv_counts, recv_displs);

    if (swp_nval > old_dist.size){

    if (recv_displs[ord_glb_comm.np-1] + recv_counts[ord_glb_comm.np-1] > 0){
      sr->set(tsr_data, sr->addid(), swp_nval);
      char ** new_data; alloc_ptr(sizeof(char*)*np, (void**)&new_data);
      for (int64_t p = 0; p < np; p++){
        new_data[p] = recv_buffer+recv_displs[p]*sr->el_size;

               new_dist.phys_phase,

               old_dist.phys_phase,

    order_globally(sym, new_dist, new_virt_edge_len, new_virt_lda, vbs_new, 0, tsr_data, tsr_cyclic_data, sr);
    for (int dim = 0; dim < order; dim++){

    sr->set(tsr_cyclic_data, sr->addid(), swp_nval);

    *ptr_tsr_cyclic_data = tsr_cyclic_data;
    *ptr_tsr_data        = tsr_data;
void calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
int ** compute_bucket_offsets(distribution const &old_dist, distribution const &new_dist, int const *len, int const *old_phys_edge_len, int const *old_virt_lda, int const *old_offsets, int *const *old_permutation, int const *new_phys_edge_len, int const *new_virt_lda, int forward, int old_virt_np, int new_virt_np, int const *old_virt_edge_len)
computes offsets for redistribution targets along each edge length
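For intuition (not the library routine itself): the per-dimension tables produced here let the packing loop assemble an element's destination bucket by summing one precomputed contribution per dimension, which is how glb_ord_pup forms bucket = bucket0 + bucket_offset[0][idx0] above. A minimal sketch with illustrative names:
#include <cstdint>
// Illustrative sketch only: combine per-dimension bucket-offset tables into a
// flat destination-bucket index; each table entry is assumed to already carry
// its stride, as in glb_ord_pup's bucket0 + bucket_offset[0][idx0].
int64_t flat_bucket(int           order,
                    int * const * bucket_offset, // [dim][local index along dim]
                    int const *   loc_idx){      // local index along each dim
  int64_t bucket = 0;
  for (int dim = 0; dim < order; dim++)
    bucket += bucket_offset[dim][loc_idx[dim]];
  return bucket;
}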
void order_globally(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr)
reorders the local buffer so that elements are ordered according to where they are in the global tensor...
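The base case shown earlier (ord_glb<0>) reduces to interleaving the virtual blocks of the innermost dimension with stride dist.virt_phase[0]. A minimal double-precision sketch of that interleave, assuming p virtual blocks of length b (illustrative, not the templated routine):
#include <vector>
#include <cstddef>
// Sketch: element j of virtual block v maps to globally ordered position j*p + v,
// mirroring the strided sr->copy calls in ord_glb<0>; dir selects the direction.
void interleave_blocks(std::vector<double> & blk,  // size p*b, block-major layout
                       std::vector<double> & glb,  // size p*b, global ordering
                       int p, int b, bool dir){
  for (int v = 0; v < p; v++)
    for (int j = 0; j < b; j++){
      if (dir) glb[(size_t)j*p + v] = blk[(size_t)v*b + j];
      else     blk[(size_t)v*b + j] = glb[(size_t)j*p + v];
    }
}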
virtual bool isequal(char const *a, char const *b) const
returns true if algstrct elements a and b are equal
void acc(char *b, char const *beta, char const *a, char const *alpha) const
compute b=beta*b + alpha*a
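As a concrete reading of that contract, this is what acc amounts to for a double-valued algstrct (a sketch of the semantics, not the virtual implementation):
// b = beta*b + alpha*a for 8-byte double elements (illustrative only).
static void acc_double(char * b, char const * beta, char const * a, char const * alpha){
  double &     vb     = *reinterpret_cast<double*>(b);
  double const vbeta  = *reinterpret_cast<double const*>(beta);
  double const va     = *reinterpret_cast<double const*>(a);
  double const valpha = *reinterpret_cast<double const*>(alpha);
  vb = vbeta*vb + valpha*va;
}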
void ord_glb< 0 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
virtual void copy(char *a, char const *b) const
copies element b to element a
void * alloc(int64_t len)
alloc abstraction
void glb_ord_pup(int const *sym, distribution const &old_dist, distribution const &new_dist, int const *len, int const *old_phys_dim, int const *old_phys_edge_len, int const *old_virt_edge_len, int64_t old_virt_nelem, int const *old_offsets, int *const *old_permutation, int total_np, int const *new_phys_dim, int const *new_phys_edge_len, int const *new_virt_edge_len, int64_t new_virt_nelem, char *old_data, char **new_data, int forward, int *const *bucket_offset, char const *alpha, char const *beta, algstrct const *sr)
virtual char const * addid() const
identity element for addition i.e. 0
void cyclic_reshuffle(int const *sym, distribution const &old_dist, int const *old_offsets, int *const *old_permutation, distribution const &new_dist, int const *new_offsets, int *const *new_permutation, char **ptr_tsr_data, char **ptr_tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm, bool reuse_buffers, char const *alpha, char const *beta)
Goes from any set of phases to any new set of phases.
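For orientation on what "phases" mean here, assume a 1-D cyclic layout in which global index g lives on rank g % p at local position g / p; changing the phase from p to q then moves the element from rank g % p to rank g % q. A tiny sketch of that assumed mapping (illustrative only, not the library's full virtualized layout):
#include <cstdint>
// Assumed 1-D cyclic mapping, used only to illustrate a phase change.
struct CyclicLoc { int owner; int64_t local; };
static CyclicLoc cyclic_loc(int64_t g, int phase){
  return { (int)(g % phase), g / phase };
}
// Reshuffling from phase p to phase q sends the element with global index g
// from cyclic_loc(g, p).owner to cyclic_loc(g, q).owner.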
int mst_alloc_ptr(int64_t len, void **const ptr)
mst_alloc abstraction
template void ord_glb< 7 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
char * glb_cyclic_reshuffle(int const *sym, distribution const &old_dist, int const *old_offsets, int *const *old_permutation, distribution const &new_dist, int const *new_offsets, int *const *new_permutation, char **ptr_tsr_data, char **ptr_tsr_cyclic_data, algstrct const *sr, CommData ord_glb_comm, bool reuse_buffers, char const *alpha, char const *beta)
Goes from any set of phases to any new set of phases.
template void ord_glb_omp< 7 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int const *idx_st, int const *idx_end, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
void calc_cnt_displs(int const *sym, distribution const &old_dist, distribution const &new_dist, int new_nvirt, int np, int const *old_virt_edge_len, int const *new_virt_lda, int64_t *send_counts, int64_t *recv_counts, int64_t *send_displs, int64_t *recv_displs, CommData ord_glb_comm, int idx_lyr, int *const *bucket_offset)
assigns keys to an array of values
void ord_glb_omp< 0 >(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int const *idx_st, int const *idx_end, int prev_idx, int64_t glb_ord_offset, int64_t blk_ord_offset)
void sy_calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
same as above except assumes sym only NS or SY
void ord_glb_omp(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int const *idx_st, int const *idx_end, int prev_idx=0, int64_t glb_ord_offset=0, int64_t blk_ord_offset=0)
int el_size
size of each element of algstrct in bytes
int cdealloc(void *ptr)
free abstraction
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
int64_t packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in packed symmetric (SY, SH, or AS) layout
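As a reminder of the counting behind these packed layouts (standard combinatorics, not quoted from the source): a group of k mutually symmetric indices with edge length n stores one element per non-decreasing tuple in the SY case and one per strictly increasing tuple in the hollow (SH/AS) cases:
\text{SY (non-hollow): } \binom{n+k-1}{k}, \qquad
\text{SH/AS (hollow): } \binom{n}{k},
\qquad \text{e.g. } n=4,\ k=2:\ \binom{5}{2}=10 \ \text{vs}\ \binom{4}{2}=6.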
virtual char const * mulid() const
identity element for multiplication i.e. 1
void ord_glb(int const *sym, distribution const &dist, int const *virt_edge_len, int const *virt_phase_lda, int64_t vbs, bool dir, char const *tsr_data_in, char *tsr_data_out, algstrct const *sr, int prev_idx=0, int64_t glb_ord_offset=0, int64_t blk_ord_offset=0)
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
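A small self-contained sketch of the same count for one fully SY group, using the multiset formula above rather than the library's per-dimension recurrence (assumption: all k indices are mutually SY):
#include <cstdint>
// Number of stored elements for k mutually SY indices of edge length n:
// the count of non-decreasing k-tuples, C(n+k-1, k), computed exactly.
int64_t sy_group_size(int n, int k){
  int64_t cnt = 1;
  for (int i = 0; i < k; i++)
    cnt = cnt * (n + i) / (i + 1);  // division is exact at every step
  return cnt;
}
// e.g. sy_group_size(4, 2) == 10: the packed size of a 4x4 SY matrix.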