4 #include "../shared/util.h" 5 #include "../interface/common.h" 9 inline int get_glb(
int i,
int s,
int t){
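// Illustrative arithmetic (assuming get_loc(g,s,t), used throughout below, is the
// inverse mapping that returns the local index of global index g, rounding down to
// the last locally owned index): with phase s=4 and offset t=1, local index i=2 maps
// to global index get_glb(2,4,1) = 9, and get_loc(9,4,1) = 2 recovers it, while
// get_loc(8,4,1) = 1 since global index 8 is not owned at offset 1 within phase 4.
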
// computes the number of elements of a tensor of order idim+1 that are owned locally,
// given the symmetry sym, cyclic phases sphase, and global index offsets gidx_off
template <int idim>
int64_t calc_cnt(int const * sym,
                 int const * rep_phase,
                 int const * sphase,
                 int const * gidx_off,
                 int const * edge_len,
                 int const * loc_edge_len){
  assert(sym[idim] == NS);
  if (sym[idim-1] == NS){
    // no symmetry between the two leading dimensions: the count factorizes
    return (get_loc(edge_len[idim]-1,sphase[idim],gidx_off[idim])+1)
           *calc_cnt<idim-1>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
  } else {
    // symmetric group: accumulate the per-index prefix counts
    int64_t * pfx = calc_sy_pfx<idim>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
    int64_t cnt = 0;
    for (int i=0; i<=get_loc(edge_len[idim]-1,sphase[idim],gidx_off[idim]); i++){
      cnt += pfx[i];
    }
    cdealloc(pfx);
    return cnt;
  }
}

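// Note: the recursion bottoms out in the calc_cnt<0> specialization below; for a fully
// non-symmetric tensor the count is simply the product over dimensions of the number
// of locally owned indices, while each symmetric group of dimensions is handled
// through the prefix-count arrays produced by calc_sy_pfx.
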
template <>
int64_t calc_cnt<0>(int const * sym,
                    int const * rep_phase,
                    int const * sphase,
                    int const * gidx_off,
                    int const * edge_len,
                    int const * loc_edge_len){
  // base case: number of locally owned indices along the first dimension
  return get_loc(edge_len[0]-1, sphase[0], gidx_off[0])+1;
}

// computes the cardinality of the sets of elements of a tensor of order idim+1 for
// different values of the index of dimension idim (a per-local-index prefix count)
template <int idim>
int64_t * calc_sy_pfx(int const * sym,
                      int const * rep_phase,
                      int const * sphase,
                      int const * gidx_off,
                      int const * edge_len,
                      int const * loc_edge_len){
  int64_t * pfx = (int64_t*)alloc(sizeof(int64_t)*loc_edge_len[idim]);
  if (sym[idim-1] == NS){
    // no symmetry with the next dimension: every index sees the same count
    int64_t ns_size = calc_cnt<idim-1>(sym,rep_phase,sphase,gidx_off,edge_len,loc_edge_len);
    for (int i=0; i<loc_edge_len[idim]; i++){
      pfx[i] = ns_size;
    }
  } else {
    int64_t * pfx_m1 = calc_sy_pfx<idim-1>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
    for (int i=0; i<loc_edge_len[idim]; i++){
      int jst;
      if (i > 0){
        // extend the previous count by the newly admissible indices of dimension idim-1
        pfx[i] = pfx[i-1];
        if (sym[idim-1] == SY)
          jst = get_loc(get_glb(i-1,sphase[idim],gidx_off[idim]),sphase[idim-1],gidx_off[idim-1])+1;
        else
          jst = get_loc(get_glb(i-1,sphase[idim],gidx_off[idim])-1,sphase[idim-1],gidx_off[idim-1])+1;
      } else {
        pfx[i] = 0;
        jst = 0;
      }
      int jed;
      if (sym[idim-1] == SY)
        jed = get_loc(std::min(edge_len[idim]-1,get_glb(i,sphase[idim],gidx_off[idim])),sphase[idim-1],gidx_off[idim-1]);
      else
        jed = get_loc(std::min(edge_len[idim]-1,get_glb(i,sphase[idim],gidx_off[idim]))-1,sphase[idim-1],gidx_off[idim-1]);
      for (int j=jst; j<=jed; j++){
        pfx[i] += pfx_m1[j];
      }
    }
    cdealloc(pfx_m1);
  }
  return pfx;
}

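// Illustrative example (assuming a single process, so all phases are 1 and offsets 0):
// for a 2-index SY tensor with edge length 4, calc_sy_pfx<1> below yields pfx = {1,2,3,4},
// and calc_cnt<1> sums these to 10 = 4*5/2, the size of the packed symmetric triangle.
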
template <>
int64_t * calc_sy_pfx<1>(int const * sym,
                         int const * rep_phase,
                         int const * sphase,
                         int const * gidx_off,
                         int const * edge_len,
                         int const * loc_edge_len){
  int64_t * pfx = (int64_t*)alloc(sizeof(int64_t)*loc_edge_len[1]);
  if (sym[0] == NS){
    int64_t cnt = calc_cnt<0>(sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len);
    std::fill(pfx, pfx+loc_edge_len[1], cnt);
  } else if (sym[0] == SY){
    // SY: indices j <= i of dimension 0 are kept
    for (int i=0; i<loc_edge_len[1]; i++){
      pfx[i] = get_loc(get_glb(i,sphase[1],gidx_off[1]),sphase[0],gidx_off[0])+1;
    }
  } else {
    // SH/AS: only strictly lower indices j < i are kept
    for (int i=0; i<loc_edge_len[1]; i++){
      pfx[i] = get_loc(get_glb(i,sphase[1],gidx_off[1])-1,sphase[0],gidx_off[0])+1;
    }
  }
  return pfx;
}

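// Note: the only difference between the SY and the SH/AS branches above is the -1 on
// the global index, which excludes the diagonal; e.g. with phases of 1, index i of
// dimension 1 admits i+1 values of dimension 0 under SY but only i under SH/AS.
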
template <int idim>
void calc_drv_cnts(int order,
                   int const * sym,
                   int64_t * counts,
                   int const * rep_phase,
                   int const * rep_phase_lda,
                   int const * sphase,
                   int const * phys_phase,
                   int * gidx_off,
                   int const * edge_len,
                   int const * loc_edge_len){
  for (int i=0; i<rep_phase[idim]; i++, gidx_off[idim]+=phys_phase[idim]){
    calc_drv_cnts<idim-1>(order, sym, counts+i*rep_phase_lda[idim], rep_phase, rep_phase_lda, sphase, phys_phase,
                          gidx_off, edge_len, loc_edge_len);
  }
  // restore the offset advanced in the loop increment
  gidx_off[idim] -= phys_phase[idim]*rep_phase[idim];
}

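// Note: this recursion enumerates, dimension by dimension, the rep_phase[idim] distinct
// global-index offsets of the data a process contributes, advancing gidx_off in place
// and striding counts by rep_phase_lda so that each offset combination gets its own
// count slot.
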
template <>
void calc_drv_cnts<0>(int order,
                      int const * sym,
                      int64_t * counts,
                      int const * rep_phase,
                      int const * rep_phase_lda,
                      int const * sphase,
                      int const * phys_phase,
                      int * gidx_off,
                      int const * edge_len,
                      int const * loc_edge_len){
  for (int i=0; i<rep_phase[0]; i++, gidx_off[0]+=phys_phase[0]){
    // fill counts[i] for this combination of offsets, dispatching calc_cnt on the
    // runtime tensor order
    SWITCH_ORD_CALL_RET(counts[i], calc_cnt, order-1, sym, rep_phase, sphase, gidx_off, edge_len, loc_edge_len)
  }
  gidx_off[0] -= phys_phase[0]*rep_phase[0];
}

template <int idim>
void calc_cnt_from_rep_cnt(int const * rep_phase,
                           int * const * pe_offset,
                           int * const * bucket_offset,
                           int64_t const * old_counts,
                           int64_t * counts,
                           int bucket_off,
                           int pe_off,
                           int dir){
  for (int i=0; i<rep_phase[idim]; i++){
    int rec_bucket_off = bucket_off+bucket_offset[idim][i];
    int rec_pe_off = pe_off+pe_offset[idim][i];
    calc_cnt_from_rep_cnt<idim-1>(rep_phase, pe_offset, bucket_offset, old_counts, counts, rec_bucket_off, rec_pe_off, dir);
  }
}

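// Note: the offsets accumulated by this recursion translate between two orderings of
// the same per-offset counts: bucket_offset indexes the dense array of replication
// buckets, while pe_offset indexes destination processes in the new distribution.
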
template <>
void calc_cnt_from_rep_cnt<0>
                          (int const * rep_phase,
                           int * const * pe_offset,
                           int * const * bucket_offset,
                           int64_t const * old_counts,
                           int64_t * counts,
                           int bucket_off,
                           int pe_off,
                           int dir){
  if (dir){
    for (int i=0; i<rep_phase[0]; i++){
      counts[pe_off+pe_offset[0][i]] = old_counts[bucket_off+i];
    }
  } else {
    for (int i=0; i<rep_phase[0]; i++){
      counts[bucket_off+i] = old_counts[pe_off+pe_offset[0][i]];
    }
  }
}

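// Usage note: with dir != 0, counts indexed by replication bucket are rewritten into
// counts indexed by destination process; with dir == 0 the inverse mapping is applied.
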
// explicit instantiations of calc_cnt_from_rep_cnt (macro works around an icpc issue)
#define INST_CALC_CNT_BEC_ICPC_SUCKS(X)   \
  template                                \
  void calc_cnt_from_rep_cnt<X>           \
    (int const * rep_phase,               \
     int * const * pe_offset,             \
     int * const * bucket_offset,         \
     int64_t const * old_counts,          \
     int64_t * counts,                    \
     int bucket_off,                      \
     int pe_off,                          \
     int dir);

void calc_drv_displs(int const * sym,
                     int const * edge_len,
                     distribution const & old_dist,
                     distribution const & new_dist,
                     int64_t * counts,
                     int idx_lyr){
  int * rep_phase, * gidx_off, * sphase;
  int * rep_phase_lda;
  int * new_loc_edge_len;
  if (idx_lyr == 0){
    int order = old_dist.order;
    rep_phase = (int*)alloc(order*sizeof(int));
    rep_phase_lda = (int*)alloc(order*sizeof(int));
    sphase = (int*)alloc(order*sizeof(int));
    gidx_off = (int*)alloc(order*sizeof(int));
    new_loc_edge_len = (int*)alloc(order*sizeof(int));
    int nrep = 1;
    for (int i=0; i<order; i++){
      rep_phase_lda[i] = nrep;
      sphase[i] = lcm(old_dist.phys_phase[i],new_dist.phys_phase[i]);
      rep_phase[i] = sphase[i] / old_dist.phys_phase[i];
      gidx_off[i] = old_dist.perank[i];
      nrep *= rep_phase[i];
      new_loc_edge_len[i] = (edge_len[i]+sphase[i]-1)/sphase[i];
    }
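    // Illustrative example: if a dimension has old physical phase 2 and new physical
    // phase 3, then sphase = lcm(2,3) = 6 and rep_phase = 6/2 = 3, i.e. the data this
    // process owns along that dimension splits into 3 distinct cyclic offset classes
    // with respect to the new distribution, each of which gets its own count slot.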
    SWITCH_ORD_CALL(calc_drv_cnts, order-1, order, sym, counts, rep_phase, rep_phase_lda, sphase,
                    old_dist.phys_phase, gidx_off, edge_len, new_loc_edge_len)
    cdealloc(rep_phase);
    cdealloc(rep_phase_lda);
    cdealloc(sphase);
    cdealloc(gidx_off);
    cdealloc(new_loc_edge_len);
  }
}

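// Precomputes, for each tensor dimension, the per-index tables used by the
// redistribution kernels: pe_offset (contribution of this index to the destination
// process rank), bucket_offset (contribution to the replication-bucket index),
// data_offset (offset of this index into the locally stored data), and ivmax_pre
// (per-index bounds on the preceding index for symmetric dimensions).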
void precompute_offsets(distribution const & old_dist,
                        distribution const & new_dist,
                        int const * sym,
                        int const * len,
                        int const * rep_phase,
                        int const * phys_edge_len,
                        int const * virt_edge_len,
                        int const * virt_dim,
                        int const * virt_lda,
                        int64_t virt_nelem,
                        int ** pe_offset,
                        int ** bucket_offset,
                        int64_t ** data_offset,
                        int ** ivmax_pre){
  int rep_phase_lda = 1;
  alloc_ptr(sizeof(int64_t)*1, (void**)&ivmax_pre[old_dist.order-1]);
  for (int dim=0; dim<old_dist.order; dim++){
    alloc_ptr(sizeof(int)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&pe_offset[dim]);
    alloc_ptr(sizeof(int)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&bucket_offset[dim]);
    alloc_ptr(sizeof(int64_t)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&data_offset[dim]);
    if (dim > 0)
      alloc_ptr(sizeof(int64_t)*std::max(rep_phase[dim],phys_edge_len[dim]), (void**)&ivmax_pre[dim-1]);
    int nsym = 0;
    int pidx = 0;
    int64_t data_stride, sub_data_stride;

    if (dim > 0 && sym[dim-1] != NS){
      // count the contiguous symmetric dimensions preceding dim; sub_data_stride is the
      // packed size of that symmetric group
      int jdim = dim-2;
      nsym = 1;
      while (jdim>=0 && sym[jdim] != NS){ nsym++; jdim--; }
      sub_data_stride = sy_packed_size(dim-nsym, virt_edge_len, sym);
      data_stride = 0;
    } else {
      sub_data_stride = 1;
      if (dim == 0) data_stride = 1;
      else          data_stride = sy_packed_size(dim, virt_edge_len, sym);
    }
    int64_t data_off = 0;
    for (int vidx=0;
         vidx < std::max((rep_phase[dim]+old_dist.virt_phase[dim]-1)/old_dist.virt_phase[dim],virt_edge_len[dim]);
         vidx++){
      int64_t rec_data_off = data_off;
      if (dim > 0 && sym[dim-1] != NS){
        // packed-symmetric stride: number of index tuples of the preceding symmetric
        // group with leading index at most vidx (a binomial-style running product)
        data_stride = (vidx+1)*sub_data_stride;
        for (int j=1; j<nsym; j++){
          data_stride = (data_stride*(vidx+j+1))/(j+1);
        }
      }
      data_off += data_stride;
      for (int vr=0; vr<old_dist.virt_phase[dim] && pidx<std::max(rep_phase[dim],phys_edge_len[dim]); vr++,pidx++){
        if (dim > 0){
          // fill ivmax_pre[dim-1][pidx], the bound on the preceding index for this
          // value of the dim index, according to the symmetry relating the two
          if (sym[dim-1] == NS){
            // non-symmetric: the preceding index spans its full local range
          } else if (sym[dim-1] == SY){
            // SY: the preceding index is capped at the current global index (diagonal kept)
          }
        }
        data_offset[dim][pidx] = rec_data_off;
        rec_data_off += virt_lda[dim]*virt_nelem;
        // destination physical rank along this dimension in the new distribution
        int phys_rank = get_glb(pidx,old_dist.phys_phase[dim],old_dist.perank[dim])%new_dist.phys_phase[dim];
        pe_offset[dim][pidx] = phys_rank*MAX(1,new_dist.pe_lda[dim]);
        bucket_offset[dim][pidx] = (pidx%rep_phase[dim])*rep_phase_lda;
      }
    }
    rep_phase_lda *= rep_phase[dim];
  }
}