Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
bench_redistribution.cxx
Go to the documentation of this file.
1 
9 //#include <boost/math/distributions/normal.hpp>
10 //
11 //boost::math::normal dist(0.0, 1.0);
12 //
14 //double q = quantile(dist, 0.95);
15 
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include <string>
20 #include <math.h>
21 #include <assert.h>
22 #include <algorithm>
23 #include <ctf.hpp>
24 #include <iostream>
25 #include <fstream>
26 #include "../src/shared/util.h"
27 
28 using namespace CTF;
29 
30 void bench_redistribution(int niter,
31  World & dw,
32  int order,
33  int const * lens,
34  char const * idx,
35  int prl1_ord,
36  int const * prl1_lens,
37  char const * prl1_idx,
38  int prl2_ord,
39  int const * prl2_lens,
40  char const * prl2_idx,
41  int blk1_ord,
42  int const * blk1_lens,
43  char const * blk1_idx,
44  int blk2_ord,
45  int const * blk2_lens,
46  char const * blk2_idx){
47 
48  int rank, num_pes;
49 
50  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
51  MPI_Comm_size(MPI_COMM_WORLD, &num_pes);
52 
53  int sym[order];
54  int64_t N = 1;
55  for (int i=0; i<order; i++){
56  N*=lens[i];
57  sym[i] = NS;
58  }
59 
60  Partition prl1(prl1_ord, prl1_lens);
61  Partition prl2(prl2_ord, prl2_lens);
62  Partition blk1(blk1_ord, blk1_lens);
63  Partition blk2(blk2_ord, blk2_lens);
64 
65  Tensor<> A(order, lens, sym, dw, idx, prl1[prl1_idx], blk1[blk1_idx], "A", 1);
66 
67  A.fill_random(-.5, .5);
68 
69  double t = 0.0;
70  double t_min;
71  double t_max;
72 
73  double btime;
74  MPI_Barrier(MPI_COMM_WORLD);
75  btime = MPI_Wtime();
76  MPI_Barrier(MPI_COMM_WORLD);
77  btime -= MPI_Wtime();
78 
79  double * data_ref = A.read(idx, prl2[prl2_idx], blk2[blk2_idx]);
80 
81 #ifdef USE_FOMPI
82  int N_DGTOG = 6;
83 #else
84  int N_DGTOG = 5;
85 #endif
86 
87  std::ofstream f;
88  if (rank == 0){
89  char fname[1000];
90  sprintf(fname, "bench_redist.p%d.o%d.N%d.pst-%s.vst-%s.ped-%s.ved-%s.dat", num_pes, order, lens[0], prl1_idx, blk1_idx, prl2_idx, blk2_idx);
91  f.open(fname);
92  }
93 
94  std::vector<double> times[N_DGTOG];
95  for (int D=0; D<N_DGTOG; D++){
96  DGTOG_SWITCH = D;
97  char const * str_name;
98  switch (D){
99  case 0:
100  str_name = "NAIVE";
101  break;
102  case 1:
103  str_name = "ROR";
104  break;
105  case 2:
106  str_name = "ROR_ISR";
107  break;
108  case 3:
109  str_name = "ROR_PUT";
110  break;
111  case 4:
112  str_name = "ROR_ISR_ANY";
113  break;
114  case 5:
115  str_name = "ROR_PUT_ANY";
116  break;
117  }
118  if (rank == 0) printf("Testing redistribution via kernel %s\n", str_name);
119 
120  double * data = A.read(idx, prl2[prl2_idx], blk2[blk2_idx]);
121  int pass = 1;
122  for (int64_t j=0; j<N/num_pes; j++){
123  if (data[j] != data_ref[j]){
124  pass = 0;
125  printf("[%d] Incorrect! data[%ld] = %lf instead of %lf\n",rank,j, data[j],data_ref[j]);
126  }
127  }
128  free(data);
129  MPI_Allreduce(MPI_IN_PLACE, &pass, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);
130  if (pass){
131  if (rank == 0) printf("Correctness test passed.\n");
132  MPI_Barrier(MPI_COMM_WORLD);
133  Timer_epoch te(str_name);
134  te.begin();
135  for (int i=0; i<niter; i++){
136  double t_st = MPI_Wtime();
137  double * data = A.read(idx, prl2[prl2_idx], blk2[blk2_idx]);
138  MPI_Barrier(MPI_COMM_WORLD);
139  times[D].push_back(MPI_Wtime() - t_st - btime);
140  free(data);
141  }
142  te.end();
143  std::sort(&times[D][0], &times[D][0]+niter);
144  if (rank == 0){
145  printf("Performed %d redistributions via kernel %s sec/iter: median = %lf (median effective end-to-end bandwidth, N/(t*p) = %lf GB/s), range = [%lf, %lf]\n",
146  niter, str_name, times[D][niter/2], 1.E-9*N*sizeof(double)/(num_pes*times[D][niter/2]), times[D][0], times[D][niter-1]);
147  f << str_name << " ";
148  for (int i=0; i<niter; i++){
149  f << times[D][i] << " ";
150  }
151  f << "\n";
152  }
153  }
154  }
155  if (rank == 0){
156  printf("Data line kernel * [min, median max]:\n");
157  for (int D=0; D<N_DGTOG; D++){
158  printf("%lf %lf %lf ", times[D][0], times[D][niter/2], times[D][niter-1]);
159  }
160  printf("\n");
161  }
162  if (rank == 0){
163  f.close();
164  }
165 
166  free(data_ref);
167 /* if (rank == 0)
168  printf("Performed %d redistributions in %lf time/iter %lf mem GB/sec\n",
169  niter, (end_time-st_time)/niter, (2*N*1.E-9/((end_time-st_time)/niter))/num_pes);*/
170 }
171 
/**
 * \brief looks up the value that follows a command-line flag
 *
 * Scans [begin, end) for the first entry equal to option and returns the
 * entry immediately after it.
 *
 * \param[in] begin first argument string
 * \param[in] end one past the last argument string
 * \param[in] option flag to search for, e.g. "-n"
 * \return the argument following the first occurrence of option, or a null
 *         pointer if option is absent or is the last entry in the range
 */
char* getCmdOption(char ** begin,
                   char ** end,
                   const std::string & option){
  for (char ** cur = begin; cur != end; ++cur){
    if (option.compare(*cur) != 0) continue;
    // only the first occurrence is considered, even if it has no value
    char ** value = cur + 1;
    if (value == end) return nullptr;
    return *value;
  }
  return nullptr;
}
181 
182 
183 int main(int argc, char ** argv){
184  int rank, np, niter, n, phase;
185  int const in_num = argc;
186  char ** input_str = argv;
187  char const * idx;
188  char const * prl1_idx;
189  char const * prl2_idx;
190  char const * blk1_idx;
191  char const * blk2_idx;
192  int64_t prl1, prl2, blk1, blk2;
193  int order;
194  int prl1_ord;
195  int prl2_ord;
196  int blk1_ord;
197  int blk2_ord;
198  int * lens;
199  int * prl1_lens;
200  int * prl2_lens;
201  int * blk1_lens;
202  int * blk2_lens;
203 
204  MPI_Init(&argc, &argv);
205  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
206  MPI_Comm_size(MPI_COMM_WORLD, &np);
207 
208  if (getCmdOption(input_str, input_str+in_num, "-n")){
209  n = atoi(getCmdOption(input_str, input_str+in_num, "-n"));
210  if (n < 0) n = 4;
211  } else n = 4;
212 
213  if (getCmdOption(input_str, input_str+in_num, "-phase")){
214  phase = atoi(getCmdOption(input_str, input_str+in_num, "-phase"));
215  if (phase < 0) phase = 10;
216  } else phase = 10;
217 
218  if (getCmdOption(input_str, input_str+in_num, "-prl1")){
219  prl1 = atoi(getCmdOption(input_str, input_str+in_num, "-prl1"));
220  if (prl1 < 0) prl1 = np;
221  } else prl1 = np;
222 
223  if (getCmdOption(input_str, input_str+in_num, "-prl2")){
224  prl2 = atoi(getCmdOption(input_str, input_str+in_num, "-prl2"));
225  if (prl2 < 0) prl2 = np;
226  } else prl2 = np;
227 
228  if (getCmdOption(input_str, input_str+in_num, "-blk1")){
229  blk1 = atoi(getCmdOption(input_str, input_str+in_num, "-blk1"));
230  if (blk1 < 0) blk1 = np;
231  } else blk1 = np;
232 
233  if (getCmdOption(input_str, input_str+in_num, "-blk2")){
234  blk2 = atoi(getCmdOption(input_str, input_str+in_num, "-blk2"));
235  if (blk2 < 0) blk2 = np;
236  } else blk2 = np;
237 
238  if (getCmdOption(input_str, input_str+in_num, "-niter")){
239  niter = atoi(getCmdOption(input_str, input_str+in_num, "-niter"));
240  if (niter < 0) niter = 3;
241  } else niter = 3;
242 
243  if (getCmdOption(input_str, input_str+in_num, "-idx")){
244  idx = getCmdOption(input_str, input_str+in_num, "-idx");
245  } else idx = "ij";
246  if (getCmdOption(input_str, input_str+in_num, "-prl1_idx")){
247  prl1_idx = getCmdOption(input_str, input_str+in_num, "-prl1_idx");
248  } else prl1_idx = "i";
249  if (getCmdOption(input_str, input_str+in_num, "-prl2_idx")){
250  prl2_idx = getCmdOption(input_str, input_str+in_num, "-prl2_idx");
251  } else prl2_idx = "j";
252  if (getCmdOption(input_str, input_str+in_num, "-blk1_idx")){
253  blk1_idx = getCmdOption(input_str, input_str+in_num, "-blk1_idx");
254  } else blk1_idx = "";
255  if (getCmdOption(input_str, input_str+in_num, "-blk2_idx")){
256  blk2_idx = getCmdOption(input_str, input_str+in_num, "-blk2_idx");
257  } else blk2_idx = "";
258 
259  order = strlen(idx);
260  prl1_ord = strlen(prl1_idx);
261  prl2_ord = strlen(prl2_idx);
262  blk1_ord = strlen(blk1_idx);
263  blk2_ord = strlen(blk2_idx);
264 
265  if (rank==0){
266  printf("Redistributing order %d tensor with all dims %d and idx %s from order %d proc grid with dims %ld and idx %s, to order %d proc grid with dims %ld and idx %s\n", order, n, idx, prl1_ord, prl1, prl1_idx, prl2_ord, prl2, prl2_idx);
267  printf("Initial blocking order %d dims %ld and idx %s, to final blocking order %d dims %ld and idx %s\n", blk1_ord, blk1, blk1_idx, blk2_ord, blk2, blk2_idx);
268  }
269 
270 
271  lens = (int*)malloc(order*sizeof(int));
272  for (int i=0; i<order; i++){
273  lens[i] = n;
274  }
275  prl1_lens = (int*)malloc(prl1_ord*sizeof(int));
276  for (int i=0; i<prl1_ord; i++){
277  prl1_lens[prl1_ord-i-1] = prl1%phase;
278  prl1 = prl1/phase;
279  }
280  if (rank == 0){
281  printf("start topology:");
282  for (int i=0; i<prl1_ord; i++){
283  printf(" %d", prl1_lens[i]);
284  }
285  printf("\n");
286  }
287  prl2_lens = (int*)malloc(prl2_ord*sizeof(int));
288  for (int i=0; i<prl2_ord; i++){
289  prl2_lens[prl2_ord-i-1] = prl2%phase;
290  prl2 = prl2/phase;
291  }
292  if (rank == 0){
293  printf("end topology:");
294  for (int i=0; i<prl2_ord; i++){
295  printf(" %d", prl2_lens[i]);
296  }
297  printf("\n");
298  }
299 
300  blk1_lens = (int*)malloc(blk1_ord*sizeof(int));
301  for (int i=0; i<blk1_ord; i++){
302  blk1_lens[blk1_ord-i-1] = blk1%phase;
303  blk1 = blk1/phase;
304  }
305  if (rank == 0){
306  printf("start blocking:");
307  for (int i=0; i<blk1_ord; i++){
308  printf(" %d", blk1_lens[i]);
309  }
310  printf("\n");
311  }
312 
313  blk2_lens = (int*)malloc(blk2_ord*sizeof(int));
314  for (int i=0; i<blk2_ord; i++){
315  blk2_lens[blk2_ord-i-1] = blk2%phase;
316  blk2 = blk2/phase;
317  }
318  if (rank == 0){
319  printf("end blocking:");
320  for (int i=0; i<blk2_ord; i++){
321  printf(" %d", blk2_lens[i]);
322  }
323  printf("\n");
324  }
325 
326 
327  {
328  CTF_World dw(argc, argv);
329  bench_redistribution(niter, dw, order, lens, idx,
330  prl1_ord, prl1_lens, prl1_idx,
331  prl2_ord, prl2_lens, prl2_idx,
332  blk1_ord, blk1_lens, blk1_idx,
333  blk2_ord, blk2_lens, blk2_idx);
334  }
335 
336 
337  MPI_Finalize();
338  return 0;
339 }
def rank(self)
Definition: core.pyx:312
Definition: common.h:37
an instance of the CTF library (world) on a MPI communicator
Definition: world.h:19
char * getCmdOption(char **begin, char **end, const std::string &option)
string
Definition: core.pyx:456
void fill_random(dtype rmin, dtype rmax)
fills local unique tensor elements to random values in the range [min,max] works only for dtype in {f...
Definition: tensor.cxx:928
epoch during which to measure timers
Definition: timer.h:69
Definition: apsp.cxx:17
int DGTOG_SWITCH
Definition: common.cxx:12
an instance of a tensor within a CTF world
Definition: tensor.h:74
void read(int64_t npair, Pair< dtype > *pairs)
Gives the values associated with any set of indices.
Definition: tensor.cxx:246
int main(int argc, char **argv)
def np(self)
Definition: core.pyx:315
void bench_redistribution(int niter, World &dw, int order, int const *lens, char const *idx, int prl1_ord, int const *prl1_lens, char const *prl1_idx, int prl2_ord, int const *prl2_lens, char const *prl2_idx, int blk1_ord, int const *blk1_lens, char const *blk1_idx, int blk2_ord, int const *blk2_lens, char const *blk2_idx)