Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
model_trainer.cxx
#include <ctf.hpp>
#define TEST_SUITE
#include "../examples/ccsd.cxx"
#include "../examples/sparse_mp3.cxx"
#undef TEST_SUITE
using namespace CTF;

namespace CTF_int{
  void update_all_models(MPI_Comm comm);
  void write_all_models(std::string file_name);
  void dump_all_models(std::string path);
}

void train_off_vec_mat(int64_t n, int64_t m, World & dw, bool sp_A, bool sp_B, bool sp_C);

void train_ttm(int64_t sz, int64_t r, World & dw){
  Timer TTM("TTM");
  TTM.start();
  for (int order=2; order<7; order++){
    int64_t n = 1;
    while (std::pow(n,order) < sz){
      n++;
    }
    int64_t m = r;
    Matrix<> M(n,m,dw);
    M.fill_random(-.5,.5);
    int * lens_n   = (int*)malloc(order*sizeof(int));
    int * lens_nm  = (int*)malloc(order*sizeof(int));
    int * lens_nmm = (int*)malloc(order*sizeof(int));
    char * base_inds = (char*)malloc((order-1)*sizeof(char));
    for (int i=0; i<order; i++){
      if (i<order-2)
        base_inds[i] = 'a'+i;
      lens_n[i]   = n;
      lens_nm[i]  = n;
      lens_nmm[i] = n;

      if (i>=order-2){
        lens_nmm[i] = m;
      }
      if (i>=order-1){
        lens_nm[i] = m;
      }
    }
    base_inds[order-2] = '\0';
    char * inds_C = (char*)malloc((order+1)*sizeof(char));
    char * inds_A = (char*)malloc((order+1)*sizeof(char));
    char const * inds_M = "xy";
    Tensor<> T(order,lens_n,dw);
    Tensor<> U(order,lens_nm,dw);
    Tensor<> V(order,lens_nmm,dw);
    Tensor<> W(order-1,lens_nmm,dw);
    T.fill_random(-.2,.8);
    strcpy(inds_A, base_inds);
    strcpy(inds_C, base_inds);
    strcat(inds_A, "zx");
    strcat(inds_C, "zy");
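    // e.g. for order=3 the strings built above are inds_A = "azx" and
    // inds_C = "azy", so U["azy"] = T["azx"]*M["xy"] contracts the last
    // mode of T with M (a tensor-times-matrix product)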
    U[inds_C] = T[inds_A]*M[inds_M];
    strcpy(inds_A, base_inds);
    strcpy(inds_C, base_inds);
    strcat(inds_A, "xq");
    strcat(inds_C, "yq");
    V[inds_C] = U[inds_A]*M[inds_M];
    // include one weigh (Hadamard) index
    strcpy(inds_A, base_inds);
    strcpy(inds_C, base_inds);
    strcat(inds_A, "xy");
    strcat(inds_C, "y");
    W[inds_C] = U[inds_A]*M[inds_M];
    free(lens_n);
    free(lens_nm);
    free(lens_nmm);
    free(base_inds);
    free(inds_C);
    free(inds_A);
  }
  TTM.stop();
}

void train_dns_vec_mat(int64_t n, int64_t m, World & dw){
  Timer dns_vec_mat("dns_vec_mat");
  dns_vec_mat.start();
  Vector<> b(n, dw);
  Vector<> c(m, dw);
  Matrix<> A(m, n, dw);
  Matrix<> A1(m, n, dw);
  Matrix<> A2(m, n, dw);
  Matrix<> G(n, n, SY, dw);
  Matrix<> F(m, m, AS, dw);

  srand48(dw.rank);
  b.fill_random(-.5, .5);
  c.fill_random(-.5, .5);
  A.fill_random(-.5, .5);
  A1.fill_random(-.5, .5);
  A2.fill_random(-.5, .5);
  G.fill_random(-.5, .5);
  F.fill_random(-.5, .5);

  A["ij"] += A["ik"]*G["kj"];
  A["ij"] += A["ij"]*A1["ij"];
  A["ij"] += F["ik"]*A["kj"];
  c["i"] += A["ij"]*b["j"];
  b["j"] += .2*A["ij"]*c["i"];
  b["i"] += b["i"]*b["i"];

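  // Elementwise kernels: a Function<> maps entries of its argument(s) and the
  // results are accumulated into the output index expression; a Transform<>
  // modifies the entries of its last argument in place.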
  Function<> f1([](double a){ return a*a; });

  A2["ij"] = f1(A["ij"]);

  c["i"] += f1(A["ij"]);

  Function<> f2([](double a, double b){ return a*a+b*b; });

  A1["ij"] -= f2(A["kj"], F["ki"]);

  Transform<> t1([](double & a){ a*=a; });

  t1(b["i"]);
  t1(A["ij"]);

  Transform<> t2([](double a, double & b){ b-=b/a; });

  t2(b["i"],b["i"]);
  t2(A["ij"],A2["ij"]);


  /*Transform<> t3([](double a, double b, double & c){ c=c*c-b*a; });

  t3(c["i"],b["i"],b["i"]);
  t3(A["ij"],G["ij"],F["ij"]);*/
  dns_vec_mat.stop();
}


void train_sps_vec_mat(int64_t n, int64_t m, World & dw, bool sp_A, bool sp_B, bool sp_C){
  Timer sps_vec_mat("sps_vec_mat");
  sps_vec_mat.start();
  Vector<> b(n, dw);
  Vector<> c(m, dw);
  Matrix<> A(m, n, dw);
  Matrix<> B(m, n, dw);
  Matrix<> A1(m, n, dw);
  Matrix<> A2(m, n, dw);
  Matrix<> G(n, n, NS, dw);
  Matrix<> F(m, m, NS, dw);

  srand48(dw.rank);
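  // sweep nonzero fractions .01, .02, .04, .08, .16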
  for (double sp = .01; sp<.32; sp*=2.){
    b.fill_sp_random(-.5, .5, sp);
    c.fill_sp_random(-.5, .5, sp);
    A.fill_sp_random(-.5, .5, sp);
    B.fill_sp_random(-.5, .5, sp);
    A1.fill_sp_random(-.5, .5, sp);
    A2.fill_sp_random(-.5, .5, sp);
    G.fill_sp_random(-.5, .5, sp);
    F.fill_sp_random(-.5, .5, sp);

    B["ij"] += A["ik"]*G["kj"];
    if (!sp_C) B["ij"] += A["ij"]*A1["ij"];
    B["ij"] += F["ik"]*A["kj"];
    c["i"] += A["ij"]*b["j"];
    b["j"] += .2*A["ij"]*c["i"];
    if (!sp_C) b["i"] += b["i"]*b["i"];

    Function<> f1([](double a){ return a*a; });

    A2["ij"] = f1(A["ij"]);

    c["i"] += f1(A["ij"]);

    Function<> f2([](double a, double b){ return a*a+b*b; });

    A2["ji"] -= f2(A1["ki"], F["kj"]);

    Transform<> t1([](double & a){ a*=a; });

    t1(b["i"]);
    t1(A["ij"]);

    Transform<> t2([](double a, double & b){ b-=b/a; });

    t2(b["i"],b["i"]);
    t2(A["ij"],A2["ij"]);

    /*Transform<> t3([](double a, double b, double & c){ c=c*c-b*a; });

    t3(c["i"],b["i"],b["i"]);
    t3(A["ij"],G["ij"],F["ij"]);*/
  }
  sps_vec_mat.stop();
}

void train_ccsd(int64_t n, int64_t m, World & dw){
  Timer ccsd_t("CCSD");
  ccsd_t.start();
  int nv = sqrt(n);
  int no = sqrt(m);
  Integrals V(no, nv, dw);
  V.fill_rand();
  Amplitudes T(no, nv, dw);
  T.fill_rand();
  ccsd(V,T,0);
  T["ai"]   = (1./T.ai->norm2())*T["ai"];
  T["abij"] = (1./T.abij->norm2())*T["abij"];
  ccsd_t.stop();
}


void train_sparse_mp3(int64_t n, int64_t m, World & dw){
  Timer sparse_mp3_t("sparse_mp3");
  sparse_mp3_t.start();
  int nv = sqrt(n);
  int no = sqrt(m);
  for (double sp = .001; sp<.2; sp*=4.){
    sparse_mp3(nv, no, dw, sp, 0, 1, 1, 0, 0);
    sparse_mp3(nv, no, dw, sp, 0, 1, 0, 1, 0);
    sparse_mp3(nv, no, dw, sp, 0, 1, 0, 1, 1);
  }
  sparse_mp3_t.stop();
}


void train_world(double dtime, World & dw, double step_size){
  int n0 = 19, m0 = 75;
  int64_t n = n0;
  int64_t approx_niter = std::max(1,(int)(step_size*step_size*10*log(dtime))); //log((dtime*2000./15.)/dw.np);
  double ddtime = dtime/approx_niter;

  // Question # 1:
  // ddtime = dtime / (10*log(dtime)) (up to the step_size factor), which grows very slowly with dtime
  int rnk;
  MPI_Comm_rank(MPI_COMM_WORLD, &rnk);
  for (;;){
    double t_st = MPI_Wtime();
    int niter = 0;
    int64_t m = m0;
    volatile double ctime = 0.0;
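    // run the full set of training kernels, growing m by step_size and n by 2
    // on each pass, until the slowest rank has spent at least ddtime seconds
    // (or m exceeds 10^6)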
    do {
      if (n<80){
        train_ttm(n*m+13,n,dw);
      }
      train_dns_vec_mat(n, m, dw);
      train_sps_vec_mat(n-2, m, dw, 0, 0, 0);
      train_sps_vec_mat(n-4, m-2, dw, 1, 0, 0);
      train_sps_vec_mat(n-1, m-4, dw, 1, 1, 0);
      train_sps_vec_mat(n-2, m-3, dw, 1, 1, 1);
      train_off_vec_mat(n+7, m-4, dw, 0, 0, 0);
      train_off_vec_mat(n-2, m+6, dw, 1, 0, 0);
      train_off_vec_mat(n-5, m+2, dw, 1, 1, 0);
      train_off_vec_mat(n-3, m-1, dw, 1, 1, 1);
      train_ccsd(n/2, m/2, dw);
      train_sparse_mp3(n,m,dw);
      niter++;
      // m *= 1.9;
      m *= step_size;
      n += 2;
      ctime = MPI_Wtime() - t_st;
      MPI_Allreduce(MPI_IN_PLACE, (void*)&ctime, 1, MPI_DOUBLE, MPI_MAX, dw.comm);

      //printf("rank = %d executing p = %d n= %ld m = %ld ctime = %lf ddtime = %lf\n", rnk, dw.np, n, m, ctime, ddtime);

    } while (ctime < ddtime && m<= 1000000);

    if (niter <= 2 || n>=1000000) break;
    // n *= 1.7;
    n *= step_size;
    m += 3;
    // Question # 2:
    // If m is reassigned to m0 in the for loop, why is this necessary?
  }
}

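// frize(ps, p) inserts p into ps and recurses on p/i for every divisor i of p,
// so for p >= 2 the set ends up containing every divisor of p greater than 1;
// e.g. frize(ps, 12) adds {2, 3, 4, 6, 12}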
void frize(std::set<int> & ps, int p){
  ps.insert(p);
  if (p>=1){
    for (int i=2; i<p; i++){
      if (p%i == 0) frize(ps, p/i);
    }
  }
}

void train_all(double time, bool write_coeff, bool dump_data, std::string coeff_file, std::string data_dir){
  World dw(MPI_COMM_WORLD);
  int np = dw.np;
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  /* compute membership for each process
     first process belongs to group 0
     next 2 belong to group 1
     next 4 belong to group 2
     next 8 belong to group 3
     and so on
  */

  int color = (int)log2(rank + 1);
  int end_color = (int)log2(np + 1);
  int key = rank + 1 - (1<<color);
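  // e.g. with 7 processes: rank 0 -> color 0, ranks 1-2 -> color 1,
  // ranks 3-6 -> color 2; key is the rank's position within its group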

  // split out the communicator
  MPI_Comm cm;
  MPI_Comm_split(dw.comm, color, key, &cm);
  World w(cm);

  // number of iterations for training
  int num_iterations = 5;

  // control how much dtime should be increased upon each iteration
  // dtime = dtime * time_jump at the end of each iteration
  double time_jump = 1.5;

  double dtime = (time / (1 - 1/time_jump)) / pow(time_jump, num_iterations - 1.0);
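  // each iteration runs five train_world rounds with budget dtime/5 each,
  // refitting the model coefficients in between via update_all_models, and
  // then scales dtime by time_jump for the next iteration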
  for (int i=0; i<num_iterations; i++){
    // TODO: this step_size schedule probably needs adjustment
    double step_size = 1.0 + 1.5 / pow(2.0, (double)i);
    if (rank == 0){
      printf("Starting iteration %d/%d with dimension increment factor %lf\n", i+1,num_iterations,step_size);
    }
    // skip processes in the last (incomplete) group
    if (color != end_color){
      train_world(dtime/5, w, step_size);
      if (rank == 0){
        printf("Completed training round 1/5\n");
      }
    }

    if (color != end_color)
      train_world(dtime/5, w, step_size);
    CTF_int::update_all_models(MPI_COMM_WORLD);
    if (rank == 0){
      printf("Completed training round 2/5\n");
    }
    if (color != end_color){
      train_world(dtime/5, w, step_size);
      if (rank == 0){
        printf("Completed training round 3/5\n");
      }
    }

    if (color != end_color)
      train_world(dtime/5, w, step_size);
    CTF_int::update_all_models(MPI_COMM_WORLD);
    if (rank == 0){
      printf("Completed training round 4/5\n");
    }
    train_world(dtime/5, dw, step_size);
    CTF_int::update_all_models(MPI_COMM_WORLD);

    if (rank == 0){
      printf("Completed training round 5/5\n");
    }
    // increase dtime for the next iteration
    dtime *= time_jump;
  }


  if (write_coeff)
    CTF_int::write_all_models(coeff_file);
  if (dump_data){
    CTF_int::dump_all_models(data_dir);
  }
  MPI_Comm_free(&cm);
}

char* getCmdOption(char ** begin,
                   char ** end,
                   const std::string & option){
  char ** itr = std::find(begin, end, option);
  if (itr != end && ++itr != end){
    return *itr;
  }
  return NULL;
}


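// Hypothetical invocation (binary name assumed; flags inferred from the parsing below):
//   mpirun -np 8 ./model_trainer -time 10 -write model_coeffs.txt -dump
// -time sets the training time budget in seconds (default 5.0), -write names a
// file to which the fitted model coefficients are written, and -dump writes raw
// model data to MODEL_DATA_DIR (default ./src/shared/data).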
int main(int argc, char ** argv){
  int rank, np;
  double time;
  char * file_path;
  int const in_num = argc;
  char ** input_str = argv;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  if (getCmdOption(input_str, input_str+in_num, "-write")){
    file_path = getCmdOption(input_str, input_str+in_num, "-write");
  } else file_path = NULL;

  if (getCmdOption(input_str, input_str+in_num, "-time")){
    time = atof(getCmdOption(input_str, input_str+in_num, "-time"));
    if (time < 0) time = 5.0;
  } else time = 5.0;


  // boolean flags used to pass the command-line arguments on to train_all
  bool write_coeff = false;
  bool dump_data = false;

  // if a coefficient file was given via -write, enable writing of model coefficients
  std::string coeff_file;
  if (file_path != NULL){
    write_coeff = true;
    coeff_file = std::string(file_path);
  }

  char * data_dir = getenv("MODEL_DATA_DIR");
  std::string data_dir_str;
  if (!data_dir){
    data_dir_str = std::string("./src/shared/data");
  } else {
    data_dir_str = std::string(data_dir);
  }

  if (std::find(input_str, input_str+in_num, std::string("-dump")) != input_str + in_num){
    dump_data = true;
  }

  {
    World dw(MPI_COMM_WORLD, argc, argv);

    if (rank == 0){
      printf("Executing a wide set of contractions to train model with time budget of %lf sec\n", time);
      if (write_coeff) printf("At the end of execution, new coefficients will be written to model file %s\n", file_path);
    }
    train_all(time, write_coeff, dump_data, coeff_file, data_dir_str);
  }


  MPI_Finalize();
  return 0;
}