26 #include "../src/shared/util.h" 36 int const * prl1_lens,
37 char const * prl1_idx,
39 int const * prl2_lens,
40 char const * prl2_idx,
42 int const * blk1_lens,
43 char const * blk1_idx,
45 int const * blk2_lens,
46 char const * blk2_idx){
50 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
51 MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
55 for (
int i=0; i<order; i++){
65 Tensor<> A(order, lens, sym, dw, idx, prl1[prl1_idx], blk1[blk1_idx],
"A", 1);
74 MPI_Barrier(MPI_COMM_WORLD);
76 MPI_Barrier(MPI_COMM_WORLD);
79 double * data_ref = A.
read(idx, prl2[prl2_idx], blk2[blk2_idx]);
90 sprintf(fname,
"bench_redist.p%d.o%d.N%d.pst-%s.vst-%s.ped-%s.ved-%s.dat", num_pes, order, lens[0], prl1_idx, blk1_idx, prl2_idx, blk2_idx);
94 std::vector<double> times[N_DGTOG];
95 for (
int D=0; D<N_DGTOG; D++){
97 char const * str_name;
106 str_name =
"ROR_ISR";
109 str_name =
"ROR_PUT";
112 str_name =
"ROR_ISR_ANY";
115 str_name =
"ROR_PUT_ANY";
118 if (rank == 0) printf(
"Testing redistribution via kernel %s\n", str_name);
120 double * data = A.
read(idx, prl2[prl2_idx], blk2[blk2_idx]);
122 for (int64_t j=0; j<N/num_pes; j++){
123 if (data[j] != data_ref[j]){
125 printf(
"[%d] Incorrect! data[%ld] = %lf instead of %lf\n",rank,j, data[j],data_ref[j]);
129 MPI_Allreduce(MPI_IN_PLACE, &pass, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
131 if (rank == 0) printf(
"Correctness test passed.\n");
132 MPI_Barrier(MPI_COMM_WORLD);
135 for (
int i=0; i<niter; i++){
136 double t_st = MPI_Wtime();
137 double * data = A.
read(idx, prl2[prl2_idx], blk2[blk2_idx]);
138 MPI_Barrier(MPI_COMM_WORLD);
139 times[D].push_back(MPI_Wtime() - t_st - btime);
143 std::sort(×[D][0], ×[D][0]+niter);
145 printf(
"Performed %d redistributions via kernel %s sec/iter: median = %lf (median effective end-to-end bandwidth, N/(t*p) = %lf GB/s), range = [%lf, %lf]\n",
146 niter, str_name, times[D][niter/2], 1.E-9*N*
sizeof(
double)/(num_pes*times[D][niter/2]), times[D][0], times[D][niter-1]);
147 f << str_name <<
" ";
148 for (
int i=0; i<niter; i++){
149 f << times[D][i] <<
" ";
156 printf(
"Data line kernel * [min, median max]:\n");
157 for (
int D=0; D<N_DGTOG; D++){
158 printf(
"%lf %lf %lf ", times[D][0], times[D][niter/2], times[D][niter-1]);
175 char ** itr = std::find(begin, end, option);
176 if (itr != end && ++itr != end){
183 int main(
int argc,
char ** argv){
184 int rank,
np, niter, n, phase;
185 int const in_num = argc;
186 char ** input_str = argv;
188 char const * prl1_idx;
189 char const * prl2_idx;
190 char const * blk1_idx;
191 char const * blk2_idx;
192 int64_t prl1, prl2, blk1, blk2;
204 MPI_Init(&argc, &argv);
205 MPI_Comm_rank(MPI_COMM_WORLD, &rank);
206 MPI_Comm_size(MPI_COMM_WORLD, &np);
209 n = atoi(
getCmdOption(input_str, input_str+in_num,
"-n"));
213 if (
getCmdOption(input_str, input_str+in_num,
"-phase")){
214 phase = atoi(
getCmdOption(input_str, input_str+in_num,
"-phase"));
215 if (phase < 0) phase = 10;
218 if (
getCmdOption(input_str, input_str+in_num,
"-prl1")){
219 prl1 = atoi(
getCmdOption(input_str, input_str+in_num,
"-prl1"));
220 if (prl1 < 0) prl1 =
np;
223 if (
getCmdOption(input_str, input_str+in_num,
"-prl2")){
224 prl2 = atoi(
getCmdOption(input_str, input_str+in_num,
"-prl2"));
225 if (prl2 < 0) prl2 =
np;
228 if (
getCmdOption(input_str, input_str+in_num,
"-blk1")){
229 blk1 = atoi(
getCmdOption(input_str, input_str+in_num,
"-blk1"));
230 if (blk1 < 0) blk1 =
np;
233 if (
getCmdOption(input_str, input_str+in_num,
"-blk2")){
234 blk2 = atoi(
getCmdOption(input_str, input_str+in_num,
"-blk2"));
235 if (blk2 < 0) blk2 =
np;
238 if (
getCmdOption(input_str, input_str+in_num,
"-niter")){
239 niter = atoi(
getCmdOption(input_str, input_str+in_num,
"-niter"));
240 if (niter < 0) niter = 3;
244 idx =
getCmdOption(input_str, input_str+in_num,
"-idx");
246 if (
getCmdOption(input_str, input_str+in_num,
"-prl1_idx")){
247 prl1_idx =
getCmdOption(input_str, input_str+in_num,
"-prl1_idx");
248 }
else prl1_idx =
"i";
249 if (
getCmdOption(input_str, input_str+in_num,
"-prl2_idx")){
250 prl2_idx =
getCmdOption(input_str, input_str+in_num,
"-prl2_idx");
251 }
else prl2_idx =
"j";
252 if (
getCmdOption(input_str, input_str+in_num,
"-blk1_idx")){
253 blk1_idx =
getCmdOption(input_str, input_str+in_num,
"-blk1_idx");
254 }
else blk1_idx =
"";
255 if (
getCmdOption(input_str, input_str+in_num,
"-blk2_idx")){
256 blk2_idx =
getCmdOption(input_str, input_str+in_num,
"-blk2_idx");
257 }
else blk2_idx =
"";
260 prl1_ord = strlen(prl1_idx);
261 prl2_ord = strlen(prl2_idx);
262 blk1_ord = strlen(blk1_idx);
263 blk2_ord = strlen(blk2_idx);
266 printf(
"Redistributing order %d tensor with all dims %d and idx %s from order %d proc grid with dims %ld and idx %s, to order %d proc grid with dims %ld and idx %s\n", order, n, idx, prl1_ord, prl1, prl1_idx, prl2_ord, prl2, prl2_idx);
267 printf(
"Initial blocking order %d dims %ld and idx %s, to final blocking order %d dims %ld and idx %s\n", blk1_ord, blk1, blk1_idx, blk2_ord, blk2, blk2_idx);
271 lens = (
int*)malloc(order*
sizeof(
int));
272 for (
int i=0; i<order; i++){
275 prl1_lens = (
int*)malloc(prl1_ord*
sizeof(
int));
276 for (
int i=0; i<prl1_ord; i++){
277 prl1_lens[prl1_ord-i-1] = prl1%phase;
281 printf(
"start topology:");
282 for (
int i=0; i<prl1_ord; i++){
283 printf(
" %d", prl1_lens[i]);
287 prl2_lens = (
int*)malloc(prl2_ord*
sizeof(
int));
288 for (
int i=0; i<prl2_ord; i++){
289 prl2_lens[prl2_ord-i-1] = prl2%phase;
293 printf(
"end topology:");
294 for (
int i=0; i<prl2_ord; i++){
295 printf(
" %d", prl2_lens[i]);
300 blk1_lens = (
int*)malloc(blk1_ord*
sizeof(
int));
301 for (
int i=0; i<blk1_ord; i++){
302 blk1_lens[blk1_ord-i-1] = blk1%phase;
306 printf(
"start blocking:");
307 for (
int i=0; i<blk1_ord; i++){
308 printf(
" %d", blk1_lens[i]);
313 blk2_lens = (
int*)malloc(blk2_ord*
sizeof(
int));
314 for (
int i=0; i<blk2_ord; i++){
315 blk2_lens[blk2_ord-i-1] = blk2%phase;
319 printf(
"end blocking:");
320 for (
int i=0; i<blk2_ord; i++){
321 printf(
" %d", blk2_lens[i]);
330 prl1_ord, prl1_lens, prl1_idx,
331 prl2_ord, prl2_lens, prl2_idx,
332 blk1_ord, blk1_lens, blk1_idx,
333 blk2_ord, blk2_lens, blk2_idx);
an instance of the CTF library (world) on a MPI communicator
char * getCmdOption(char **begin, char **end, const std::string &option)
void fill_random(dtype rmin, dtype rmax)
fills local unique tensor elements to random values in the range [min,max] works only for dtype in {f...
epoch during which to measure timers
an instance of a tensor within a CTF world
void read(int64_t npair, Pair< dtype > *pairs)
Gives the values associated with any set of indices.
int main(int argc, char **argv)
void bench_redistribution(int niter, World &dw, int order, int const *lens, char const *idx, int prl1_ord, int const *prl1_lens, char const *prl1_idx, int prl2_ord, int const *prl2_lens, char const *prl2_idx, int blk1_ord, int const *blk1_lens, char const *blk1_idx, int blk2_ord, int const *blk2_lens, char const *blk2_idx)