Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
model.cxx
1 
2 #include "../shared/lapack_symbs.h"
3 #include "../shared/blas_symbs.h"
4 #include "model.h"
5 #include "../shared/util.h"
6 #include <iomanip>
7 
8 namespace CTF_int {
9 
10  std::vector<Model*>& get_all_models(){
11  static std::vector<Model*> all_models;
12  return all_models;
13  }
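The function-local static above is the construct-on-first-use idiom: the registry vector is built the first time any caller needs it, so LinModel/CubicModel instances that register themselves from static initializers in other translation units never touch an unconstructed container. A minimal sketch of the same idiom (names hypothetical, not part of CTF):

    struct Registered;
    std::vector<Registered*>& get_registry(){
      static std::vector<Registered*> registry;         // constructed on first use
      return registry;
    }
    struct Registered {
      Registered(){ get_registry().push_back(this); }   // safe even during static initialization
    };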
14 
15  void update_all_models(MPI_Comm cm){
16 #ifdef TUNE
17  for (int i=0; i<(int)get_all_models().size(); i++){
18  get_all_models()[i]->update(cm);
19  }
20 #endif
21  }
22 
23  void print_all_models(){
24 #ifdef TUNE
25  for (int i=0; i<(int)get_all_models().size(); i++){
26  get_all_models()[i]->print();
27  }
28  for (int i=0; i<(int)get_all_models().size(); i++){
29  get_all_models()[i]->print_uo();
30  }
31 #endif
32  }
33 
34  void load_all_models(std::string file_name){
35 #ifdef TUNE
36  for (int i=0; i<(int)get_all_models().size(); i++){
37  get_all_models()[i]->load_coeff(file_name);
38  }
39 #endif
40  }
41 
42  void write_all_models(std::string file_name){
43 #ifdef TUNE
44  for (int i=0; i<(int)get_all_models().size(); i++){
45  get_all_models()[i]->write_coeff(file_name);
46  }
47 #endif
48  }
49 
50  void dump_all_models(std::string path){
51 #ifdef TUNE
52  for (int i=0; i<(int)get_all_models().size(); i++){
53  get_all_models()[i]->dump_data(path);
54  }
55 #endif
56  }
57 
58 #define SPLINE_CHUNK_SZ 8
59 
60  double cddot(int n, const double *dX,
61  int incX, const double *dY,
62  int incY){
63  return CTF_BLAS::DDOT(&n, dX, &incX, dY, &incY);
64  }
65 // DGEQRF computes a QR factorization of a real M-by-N matrix A:
66 // A = Q * R.
67  void cdgeqrf(int const M,
68  int const N,
69  double * A,
70  int const LDA,
71  double * TAU2,
72  double * WORK,
73  int const LWORK,
74  int * INFO){
75 #ifdef TUNE
76  CTF_LAPACK::cdgeqrf(M, N, A, LDA, TAU2, WORK, LWORK, INFO);
77 #endif
78  }
79 
80  void cdormqr(char SIDE,
81  char TRANS,
82  int M,
83  int N,
84  int K,
85  double const * A,
86  int LDA,
87  double const * TAU2,
88  double * C,
89  int LDC,
90  double * WORK,
91  int LWORK,
92  int * INFO){
93 #ifdef TUNE
94  CTF_LAPACK::cdormqr(SIDE, TRANS, M, N, K, A, LDA, TAU2, C, LDC, WORK, LWORK, INFO);
95 #endif
96  }
97 
98 
99 //DGELSD computes the minimum-norm solution to a real linear least squares problem:
100 // minimize 2-norm(| b - A*x |)
101 // http://www.netlib.org/lapack/explore-html/d7/d3b/group__double_g_esolve_ga94bd4a63a6dacf523e25ff617719f752.html#ga94bd4a63a6dacf523e25ff617719f752
102  void cdgelsd(int m, int n, int k, double const * A, int lda_A, double * B, int lda_B, double * S, double cond, int * rank, double * work, int lwork, int * iwork, int * info){
103 #ifdef TUNE
104  CTF_LAPACK::cdgelsd(m, n, k, A, lda_A, B, lda_B, S, cond, rank, work, lwork, iwork, info);
105 #endif
106  }
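The dgeqrf/dormqr/dgelsd wrappers above are used below with the standard LAPACK workspace-query convention: a first call with LWORK = -1 does no computation and only writes the optimal workspace size into WORK[0]; the real call follows once that much space has been allocated. A minimal sketch of the pattern with cdgeqrf (matrix contents hypothetical; the wrappers are no-ops unless TUNE is defined):

    static void example_workspace_query(){
      int m = 3, n = 2, info;
      double A[6] = {1.,2.,3., 4.,5.,6.};            // column-major 3x2 matrix
      double tau[2];
      double dlwork = 1.;
      cdgeqrf(m, n, A, m, tau, &dlwork, -1, &info);  // LWORK = -1: workspace query only
      int lwork = (int)dlwork;                       // optimal size was returned in WORK[0]
      double * work = (double*)alloc(sizeof(double)*lwork);
      cdgeqrf(m, n, A, m, tau, work, lwork, &info);  // actual QR; R ends up in A's upper triangle
      cdealloc(work);
    }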
107 
108  template <int nparam>
109  struct time_param {
110  double p[nparam+1];
111  };
112 
113  template <int nparam>
114  bool comp_time_param(const time_param<nparam> &a, const time_param<nparam> &b){
115  return a.p[0] > b.p[0];
116  }
117 
118 //FIXME: be smarter about regularization, magnitude of coefficients is different!
119 #define REG_LAMBDA 1.E6
120 
121  template <int nparam>
122  LinModel<nparam>::LinModel(double const * init_guess, char const * name_, int hist_size_){
123  //initialize the model as active by default
124  is_active = true;
125  //copy initial static coefficients to initialize model (defined in init_model.cxx)
126  memcpy(coeff_guess, init_guess, nparam*sizeof(double));
127 #ifdef TUNE
128  /*for (int i=0; i<nparam; i++){
129  regularization[i] = coeff_guess[i]*REG_LAMBDA;
130  }*/
131  name = (char*)alloc(strlen(name_)+1);
132  name[0] = '\0';
133  strcpy(name, name_);
134  hist_size = hist_size_;
135  mat_lda = nparam+1;
136  time_param_mat = (double*)alloc(mat_lda*hist_size*sizeof(double));
137  nobs = 0;
138  is_tuned = false;
139  tot_time = 0.0;
140  avg_tot_time = 0.0;
141  avg_over_time = 0.0;
142  avg_under_time = 0.0;
143  over_time = 0.0;
144  under_time = 0.0;
145  get_all_models().push_back(this);
146 #endif
147  }
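For context, a model is normally declared once with a static array of initial coefficients (the arrays that print() emits into init_model.cxx) and then queried and updated at runtime; the push_back above is what lets update_all_models(), print_all_models(), etc. reach it. A hypothetical usage sketch (not an actual CTF model; only meaningful when compiled with -DTUNE):

    double example_mdl_init[] = {1.E-6, 5.E-10};                // e.g. latency and per-byte cost
    LinModel<2> example_mdl(example_mdl_init, "example_mdl", 8192);

    double example_use(int64_t nbytes){
      double tp[3] = {0.0, 1.0, (double)nbytes};                // tp[0] = time, tp[1..2] = parameters
      double predicted = example_mdl.est_time(tp+1);            // current estimate in seconds
      tp[0] = predicted;                                        // stand-in for a real measurement
      if (example_mdl.should_observe(tp)) example_mdl.observe(tp);
      return predicted;
    }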
148 
149 
150  template <int nparam>
151  LinModel<nparam>::LinModel(){
152  //initialize the model as active by default
153  is_active = true;
154  name = NULL;
155  time_param_mat = NULL;
156  }
157 
158  template <int nparam>
159  LinModel<nparam>::~LinModel(){
160 #ifdef TUNE
161  if (name != NULL) cdealloc(name);
162  if (time_param_mat != NULL) cdealloc(time_param_mat);
163 #endif
164  }
165 
166 
167  template <int nparam>
168  void LinModel<nparam>::observe(double const * tp){
169 #ifdef TUNE
170  /*for (int i=0; i<nobs; i++){
171  bool is_same = true;
172  for (int j=0; j<nparam; j++){
173  if (time_param_mat[i*mat_lda+1+j] != tp[1+j]) is_same = false;
174  }
175  if (is_same) return;
176  }*/
177 // if (is_tuned){
178  tot_time += tp[0];
179  if (est_time(tp+1)>tp[0]){
180  under_time += est_time(tp+1)-tp[0];
181  } else {
182  over_time += tp[0]-est_time(tp+1);
183  }
184 // }
185  /*if (fabs(est_time(tp+1)-tp[0])>1.E-1){
186  printf("estimate of %s[%1.2E*%1.2E", name, tp[0], coeff_guess[0]);
187  for (int i=1; i<nparam; i++){
188  printf(",%1.2E*%1.2E",tp[i+1], coeff_guess[i]);
189  }
190  printf("] was %1.2E, actual execution took %1.2E\n", est_time(tp+1), tp[0]);
191  print();
192  }*/
193  //printf("observed %lf %lf %lf\n", tp[0], tp[1], tp[2]);
194  assert(tp[0] >= 0.0);
195 
196  // Add the new observation into time_param_mat
197  memcpy(time_param_mat+(nobs%hist_size)*mat_lda, tp, mat_lda*sizeof(double));
198  /* if (nobs < hist_size){
199  memcpy(time_param_mat+nobs*mat_lda, tp, mat_lda*sizeof(double));
200  } else {
201  std::pop_heap( (time_param<nparam>*)time_param_mat,
202  ((time_param<nparam>*)time_param_mat)+hist_size,
203  &comp_time_param<nparam>);
204 
205  memcpy(time_param_mat+(hist_size-1)*mat_lda, tp, mat_lda*sizeof(double));
206  std::push_heap( (time_param<nparam>*)time_param_mat,
207  ((time_param<nparam>*)time_param_mat)+hist_size,
208  &comp_time_param<nparam>);
209  }*/
210  nobs++;
211 #endif
212  }
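Each observation tp handed to observe() is one contiguous row of mat_lda = nparam+1 doubles: the measured execution time in tp[0], followed by the nparam parameter values. The memcpy above treats time_param_mat as a circular buffer of hist_size such rows, so once more than hist_size observations have been recorded the oldest rows are overwritten. For a LinModel<3>, one row might look like (values hypothetical):

    double tp[4] = { 2.5E-4,    // measured time in seconds
                     1.0,       // parameter 1, often a constant/overhead term
                     128.,      // parameter 2, e.g. a message count
                     65536. };  // parameter 3, e.g. bytes moved
    // observation number k is stored at time_param_mat + (k % hist_size)*mat_lda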
213 
214  template <int nparam>
215  bool LinModel<nparam>::should_observe(double const * tp){
216 #ifndef TUNE
217  ASSERT(0);
218  assert(0);
219  return false;
220 #else
221  return is_active;
222 #endif
223  }
224 
225 
226  template <int nparam>
227  void LinModel<nparam>::update(MPI_Comm cm){
228 #ifdef TUNE
229  double S[nparam];
230  int lwork, liwork;
231  double * work;
232  int * iwork;
233  int rank;
234  int info;
235  // workspace query
236  double dlwork;
237 
238  // number of processes in the communicator cm
239  int np;
240  // the rank of the current process in the communicator
241  int rk;
242 
243  // get the number of processes in the group of cm (integer)
244  MPI_Comm_size(cm, &np);
245  // get the rank of the calling process in the group of cm (integer)
246  MPI_Comm_rank(cm, &rk);
247  //if (nobs % tune_interval == 0){
248 
249  //define the number of cols in the matrix to be the min of the number of observations and
250  //the number we are willing to store (hist_size)
251  int nrcol = std::min(nobs,(int64_t)hist_size);
252  //max of the number of local observations and nparam (will usually be the former)
253  int ncol = std::max(nrcol, nparam);
254  /* time_param * sort_mat = (time_param*)alloc(sizeof(time_param)*ncol);
255  memcpy(sort_mat, time_param_mat, sizeof(time_param)*ncol);
256  std::sort(sort_mat, sort_mat+ncol, &comp_time_param);*/
257  int tot_nrcol;
258 
259  //compute the total number of observations over all processors
260  MPI_Allreduce(&nrcol, &tot_nrcol, 1, MPI_INT, MPI_SUM, cm);
261 
262  //if there have been more than 16*nparam observations per processor on average, tune the model
263  if (tot_nrcol >= 16.*np*nparam){
264  is_tuned = true;
265 
266  //add nparam to ncol to include regularization, don't do so if the number of local
267  //observations is less than the number of params, as in this case, the processor will
268  //not do any local tuning
269  if (nrcol >= nparam) ncol += nparam;
270 
271  double * R = (double*)alloc(sizeof(double)*nparam*nparam);
272  double * b = (double*)alloc(sizeof(double)*ncol);
273  //if number of local observations less than than nparam don't do local QR
274  if (nrcol < nparam){
275  std::fill(R, R+nparam*nparam, 0.0);
276  std::fill(b, b+ncol, 0.0);
277  //regularization done on every processor
278 /* if (rk == 0){
279  lda_cpy(sizeof(double), 1, nparam, 1, nparam, (char const*)regularization, (char*)R);
280  }*/
281  } else {
282  //define tall-skinny matrix A that is almost the transpose of time_param_mat, but excludes its first row (the execution times, which we will put into b)
283  double * A = (double*)alloc(sizeof(double)*nparam*ncol);
284  int i_st = 0;
285 
286  //figure out the maximum execution time any observation recorded
287  // double max_time = 0.0;
288  // for (int i=0; i<ncol-nparam; i++){
289  // max_time = std::max(time_param_mat[i*mat_lda],max_time);
290  // }
291  /*for (int i=0; i<nparam; i++){
292  R[nparam*i+i] = REG_LAMBDA;
293  }*/
294  // do regularization
295  if (true){ //rk == 0){
296 // lda_cpy(sizeof(double), 1, nparam, 1, ncol, (char const*)regularization, (char*)A);
297  //regularization done on every processor
298  // parameter observs. coeffs. times (sec)
299  //matrix Ax~=b has the form, e.g. nparam=2 [ REG_LAMBDA 0 ] [ x_1 ] = [ 0 ]
300  // [ 0 REG_LAMBDA ] [ x_2 ] [ 0 ]
301  // [ obs1p1 obs1p2 ] [ obs1t ]
302  // obsxpy is the yth parameter as observed [ obs2p1 obs2p2 ] [ obs2t ]
303  // in observation x [ ... ... ] [ ... ]
304  // obsxt is the exe time of observation x
305  for (int i=0; i<nparam; i++){
306  b[i] = 0.0;
307  for (int j=0; j<nparam; j++){
308  if (i==j){
309  if (coeff_guess[i] != 0.0){
310  A[ncol*j+i] = std::min(REG_LAMBDA,(avg_tot_time/coeff_guess[i])/1000.);
311  } else {
312  A[ncol*j+i] = 1;
313  }
314  } else A[ncol*j+i] = 0.0;
315  }
316  }
317  i_st = nparam;
318  }
319  //find the max execution time over all processors
320  // MPI_Allreduce(MPI_IN_PLACE, &max_time, 1, MPI_DOUBLE, MPI_MAX, cm);
321  //double chunk = max_time / 1000.;
322  //printf("%s chunk = %+1.2e\n",name,chunk);
323 
324  //form A
325  for (int i=i_st; i<ncol; i++){
326  //ignore observations that took time less than 1/3 of max
327  //FIXME: WHY? could be much smarter
328  if (0){
329  //if (time_param_mat[(i-i_st)*mat_lda] > max_time/3.){
330  b[i] = 0.0;
331  for (int j=0; j<nparam; j++){
332  A[i+j*ncol] = 0.0;
333  }
334  } else {
335  //take a column of time_param_mat, put the first element (execution time) into b
336  //and the rest of the elements into a row of A
337  b[i] = time_param_mat[(i-i_st)*mat_lda];
338  //double rt_chnks = std::sqrt(b[i] / chunk);
339  //double sfactor = rt_chnks/b[i];
340  //b[i] = rt_chnks;
341  for (int j=0; j<nparam; j++){
342  A[i+j*ncol] = /*sfactor**/time_param_mat[(i-i_st)*mat_lda+j+1];
343  }
344  }
345  }
346  /*for (int i=0; i<ncol; i++){
347  for (int j=0; j<nparam; j++){
348  printf("%+1.3e ", A[i+j*ncol]);
349  }
350  printf (" | %+1.3e\n",b[i]);
351  }*/
352 
353  //sequential code for fitting Ax=b (NOT USED, only works if running with 1 processor)
354  if (false && np == 1){
355  cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info);
356  assert(info == 0);
357  lwork = (int)dlwork;
358  work = (double*)alloc(sizeof(double)*lwork);
359  iwork = (int*)alloc(sizeof(int)*liwork);
360  std::fill(iwork, iwork+liwork, 0);
361  cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info);
362  //cdgeqrf(
363  assert(info == 0);
364  cdealloc(work);
365  cdealloc(iwork);
366  cdealloc(A);
367  memcpy(coeff_guess, b, nparam*sizeof(double));
368  /*print();
369  double max_resd_sq = 0.0;
370  for (int i=0; i<ncol-nparam; i++){
371  max_resd_sq = std::max(max_resd_sq, b[nparam+i]);
372  }
373  printf("%s max residual sq is %lf\n",name,max_resd_sq);
374  double max_err = 0.0;
375  for (int i=0; i<nobs; i++){
376  max_err = std::max(max_err, fabs(est_time(time_param_mat+i*mat_lda+1)-time_param_mat[i*mat_lda]));
377  }
378  printf("%s max error is %lf\n",name,max_err);*/
379  cdealloc(b);
380  return;
381  }
382 
383  //otherwise on the ith processor compute Q_iR_i=A_i and y_i=Q_i^Tb_i
384  double * tau = (double*)alloc(sizeof(double)*nparam);
385  int lwork;
386  int info;
387  double dlwork;
388  cdgeqrf(ncol, nparam, A, ncol, tau, &dlwork, -1, &info);
389  lwork = (int)dlwork;
390  double * work = (double*)alloc(sizeof(double)*lwork);
391  cdgeqrf(ncol, nparam, A, ncol, tau, work, lwork, &info);
392  lda_cpy(sizeof(double), nparam, nparam, ncol, nparam, (const char *)A, (char*)R);
393  for (int i=0; i<nparam; i++){
394  for (int j=i+1; j<nparam; j++){
395  R[i*nparam+j] = 0.0;
396  }
397  }
398  //query how much space dormqr which computes Q_i^Tb_i needs
399  cdormqr('L', 'T', ncol, 1, nparam, A, ncol, tau, b, ncol, &dlwork, -1, &info);
400  lwork = (int)dlwork;
401  cdealloc(work);
402  work = (double*)alloc(sizeof(double)*lwork);
403  //actually run dormqr, which computes Q_i^Tb_i
404  cdormqr('L', 'T', ncol, 1, nparam, A, ncol, tau, b, ncol, work, lwork, &info);
405  cdealloc(work);
406  cdealloc(tau);
407  cdealloc(A);
408  }
409  int sub_np = np; //std::min(np,32);
410  MPI_Comm sub_comm;
411  MPI_Comm_split(cm, rk<sub_np, rk, &sub_comm);
412  //use only data from the first 32 processors, so that this doesn't take too long
413  //FIXME: can be smarter but not clear if necessary
414  if (rk < sub_np){
415  //all_R will have the Rs from each processor vertically stacked as [R_1^T .. R_32^T]^T
416  double * all_R = (double*)alloc(sizeof(double)*nparam*nparam*sub_np);
417  //all_b will have the bs from each processor vertically stacked as [b_1^T .. b_32^T]^T
418  double * all_b = (double*)alloc(sizeof(double)*nparam*sub_np);
419  //gather all Rs from all the processors
420  MPI_Allgather(R, nparam*nparam, MPI_DOUBLE, all_R, nparam*nparam, MPI_DOUBLE, sub_comm);
421  double * Rs = (double*)alloc(sizeof(double)*nparam*nparam*sub_np);
422  for (int i=0; i<sub_np; i++){
423  lda_cpy(sizeof(double), nparam, nparam, nparam, sub_np*nparam, (const char *)(all_R+i*nparam*nparam), (char*)(Rs+i*nparam));
424  }
425  //gather all bs from all the processors
426  MPI_Allgather(b, nparam, MPI_DOUBLE, all_b, nparam, MPI_DOUBLE, sub_comm);
427  cdealloc(b);
428  cdealloc(all_R);
429  cdealloc(R);
430  ncol = sub_np*nparam;
431  b = all_b;
432  double * A = Rs;
433  /* if (rk==0){
434  for (int r=0; r<ncol; r++){
435  for (int c=0; c<nparam; c++){
436  printf("A[%d, %d] = %lf, ", r,c,A[c*ncol+r]);
437  }
438  printf("b[%d] = %lf\n",r,b[r]);
439  }
440  }*/
441  //compute fit for a reduced system
442  // parameter observs. coeffs. times (sec)
443  //matrix Ax~=b has the form, e.g. nparam=2 [ R_1 ] [ x_1 ] = [ y_1 ]
444  // [ R_2 ] [ x_2 ] [ y_2 ]
445  // [ ... ] [ ... ]
446  // [ R_32 ] [ y_32 ]
447  //note 32 is p if p < 32
448  cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info);
449  assert(info == 0);
450  lwork = (int)dlwork;
451  work = (double*)alloc(sizeof(double)*lwork);
452  iwork = (int*)alloc(sizeof(int)*liwork);
453  std::fill(iwork, iwork+liwork, 0);
454  cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info);
455  //cdgeqrf(
456  assert(info == 0);
457  cdealloc(work);
458  cdealloc(iwork);
459  cdealloc(A);
460  memcpy(coeff_guess, b, nparam*sizeof(double));
461  /*print();
462  double max_resd_sq = 0.0;
463  for (int i=0; i<ncol-nparam; i++){
464  max_resd_sq = std::max(max_resd_sq, b[nparam+i]);
465  }
466  printf("%s max residual sq is %lf\n",name,max_resd_sq);
467  double max_err = 0.0;
468  for (int i=0; i<nobs; i++){
469  max_err = std::max(max_err, fabs(est_time(time_param_mat+i*mat_lda+1)-time_param_mat[i*mat_lda]));
470  }
471  printf("%s max error is %lf\n",name,max_err);*/
472  cdealloc(b);
473  }
474  MPI_Comm_free(&sub_comm);
475  //broadcast new coefficient guess
476  MPI_Bcast(coeff_guess, nparam, MPI_DOUBLE, 0, cm);
477  /*for (int i=0; i<nparam; i++){
478  regularization[i] = coeff_guess[i]*REG_LAMBDA;
479  }*/
480  }
481 
482  // check to see if the model should be turned off
483 
484  // first aggregate this model's timing totals over all processes
485  double tot_time_total;
486  double over_time_total;
487  double under_time_total;
488  MPI_Allreduce(&tot_time, &tot_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
489  MPI_Allreduce(&over_time, &over_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
490  MPI_Allreduce(&under_time, &under_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
491 
492 
493  // NOTE: to change the minimum number of observations and the threshold,
494  // one needs to set the environment variables MIN_OBS and THRESHOLD before running model_trainer
495 
496  // get the minimum observations required and threshold
497  int min_obs = 1000;
498  char * min_obs_env = getenv("MIN_OBS");
499  if(min_obs_env){
500  min_obs = std::stoi(min_obs_env);
501  }
502 
503  // get the threshold for turning off the model
504  double threshold = 0.05;
505  char * threshold_env = getenv("THRESHOLD");
506  if (threshold_env){
507  threshold = std::stod(threshold_env);
508  }
509 
510  // determine whether the model should be turned off
511  double under_time_ratio = under_time_total/tot_time_total;
512  double over_time_ratio = over_time_total/tot_time_total;
513 
514 
515  if (tot_nrcol >= min_obs && under_time_ratio < threshold && over_time_ratio < threshold){
516  is_active = false;
517  std::cout<<"Model "<<name<<" has been turned off"<<std::endl;
518  }
519  avg_tot_time = tot_time_total/np;
520  avg_over_time = over_time_total/np;
521  avg_under_time = under_time_total/np;
522  tot_time = 0.0;
523  over_time = 0.0;
524  under_time = 0.0;
525 #endif
526 
527  }
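In short, update() refits the coefficients by solving a regularized linear least-squares problem without ever gathering the raw observations: each rank reduces its local system A_i x ~= b_i to an nparam x nparam triangular factor R_i and y_i = Q_i^T b_i (QR preserves the 2-norm, so ||A_i x - b_i|| and ||R_i x - y_i|| differ only by a constant independent of x), the small R_i/y_i pairs are allgathered, and every rank solves the stacked system with dgelsd. This is a one-level, TSQR-style reduction. The shapes involved, for reference:

    //  on each rank i:
    //    A_i : ncol x nparam   (nparam regularization rows + up to hist_size observation rows)
    //    b_i : ncol            (zeros for the regularization rows, then the measured times)
    //    R_i : nparam x nparam (triangular factor of A_i from dgeqrf)
    //    y_i : nparam          (leading entries of Q_i^T b_i from dormqr)
    //  after MPI_Allgather, each rank solves the (np*nparam) x nparam system
    //    [ R_1 ]        [ y_1 ]
    //    [ ... ]  x ~=  [ ... ]    with dgelsd; x overwrites coeff_guess and is broadcast from rank 0.
    //    [ R_np]        [ y_np]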
528 
529  template <int nparam>
530  double LinModel<nparam>::est_time(double const * param){
531  double d = 0.;
532  for (int i=0; i<nparam; i++){
533  d+=param[i]*coeff_guess[i];
534  }
535  return std::max(0.0,d);
536  }
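est_time is just the dot product of the current coefficients with the supplied parameter vector, clamped at zero so the estimate can never go negative. For example, with coeff_guess = {1.0E-6, 2.0E-9} and a constant-term-plus-size parameterization (values hypothetical):

    double param[2] = {1.0, 4096.};   // constant term, message size in bytes
    // est_time(param) == max(0, 1.0E-6*1.0 + 2.0E-9*4096.) ~ 9.2E-6 seconds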
537 
538  template <int nparam>
539  void LinModel<nparam>::print(){
540  assert(name!=NULL);
541  printf("double %s_init[] = {",name);
542  for (int i=0; i<nparam; i++){
543  if (i>0) printf(", ");
544  printf("%1.4E", coeff_guess[i]);
545  }
546  printf("};\n");
547  }
548 
549  template <int nparam>
550  void LinModel<nparam>::print_uo(){
551  printf("%s is_tuned = %d (%ld) avg_tot_time = %lf avg_over_time = %lf avg_under_time = %lf\n",name,(int)is_tuned,nobs,avg_tot_time,avg_over_time,avg_under_time);
552  }
553 
554 
555  template <int nparam>
556  double * LinModel<nparam>::get_coeff(){
557  return coeff_guess;
558  }
559 
560  template <int nparam>
561  void LinModel<nparam>::write_coeff(std::string file_name){
562  int my_rank;
563  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
564  // Only first process needs to dump the coefficient
565  if (my_rank) return;
566 
567  // Generate the model name
568  std::string model_name = std::string(name);
569  // Generate the new line in the file
570  std::string new_coeff_str = model_name+" ";
571  char buffer[64];
572  for(int i =0; i<nparam; i++){
573  buffer[0] = '\0';
574  std::sprintf(buffer,"%1.4E", coeff_guess[i]);
575  std::string s(buffer);
576  new_coeff_str += s;
577  if (i != nparam - 1){
578  new_coeff_str += " ";
579  }
580  }
581 
582  // Open the file that stores the model info
583  std::vector<std::string> file_content;
584  std::ifstream infile(file_name);
585 
586  bool found_line = false;
587  // If the file exists
588  if(infile){
589  // Scan the file to find the line and replace with the new model coeffs
590  std::string line;
591  while(std::getline(infile,line)){
592  std::istringstream f(line);
593  // Get the model name from the line
594  std::string s;
595  std::getline(f,s,' ');
596  if (s == model_name){
597  line = new_coeff_str;
598  found_line = true;
599  }
600  line += "\n";
601  file_content.push_back(line);
602  }
603  }
604 
605  // Append the string to the file if no match is found
606  if(!found_line){
607  new_coeff_str += "\n";
608  file_content.push_back(new_coeff_str);
609  }
610  std::ofstream ofs;
611  ofs.open(file_name, std::ofstream::out | std::ofstream::trunc);
612  for(int i=0; i<(int)file_content.size(); i++){
613  ofs<<file_content[i];
614  }
615  ofs.close();
616 
617  }
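The file produced above holds one line per model: the model name, then its nparam coefficients in %1.4E format separated by single spaces, which is exactly the layout load_coeff below parses back. A hypothetical two-model file would look like:

    //   example_mdl_a 1.2345E-06 6.7890E-10
    //   example_mdl_b 2.0000E-06 4.5000E-10 3.1000E-09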
618 
619 
620 
621  template <int nparam>
622  void LinModel<nparam>::load_coeff(std::string file_name){
623  // Generate the model name
624  std::string model_name = std::string(name);
625 
626  // Open the file that stores the model info
627  std::vector<std::string> file_content;
628  std::ifstream infile(file_name);
629  if(!infile){
630  std::cout<<"file "<<file_name<<" does not exist"<<std::endl;
631  return;
632  }
633 
634  // Flag denoting whether the model is found in the file
635  bool found_line = false;
636  // Flag denoting whether the number of coefficients in the file matches what the model expects
637  bool right_num_coeff = true;
638 
639  // Scan the file to find the model coefficients
640  std::string line;
641  while(std::getline(infile,line)){
642  std::istringstream f(line);
643  // Get the model name from the line
644  std::string s;
645  std::getline(f,s,' ');
646  if (s == model_name){
647  found_line = true;
648 
649  // Get the nparam coeffs
650  // double coeff_from_file [nparam];
651  for(int i=0; i<nparam; i++){
652  if(!std::getline(f,s,' ')){
653  right_num_coeff = false;
654  break;
655  }
656 
657  // Convert the string to char* and update the model coefficients
658  char buf[s.length()+1];
659  for(int j=0;j<(int)s.length();j++){
660  buf[j] = s[j];
661  }
662  buf[s.length()] = '\0';
663  coeff_guess[i] = std::atof(buf);
664  }
665  // Check if there are more coefficients in the file
666  if(right_num_coeff && std::getline(f,s,' ')){
667  right_num_coeff = false;
668  }
669  break;
670  }
671  }
672  // If the model is not found
673  if(!found_line){
674  std::cout<<"Error! No model found in the file!"<<std::endl;
675  }
676  else if (!right_num_coeff){
677  std::cout<<"Error! Number of coefficients in file does not match with the model"<<std::endl;
678  // Initialize model coeff to be all 0s
679  for(int i = 0; i < nparam;i++){
680  coeff_guess[i] = 0.0;
681  }
682  }
683  }
684 
685  template <int nparam>
686  void LinModel<nparam>::dump_data(std::string path){
687  int rank = 0;
688  int np, my_rank;
689  MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
690  MPI_Comm_size(MPI_COMM_WORLD, &np);
691  while(rank < np){
692  if (rank == my_rank){
693  // Open the file
694  std::ofstream ofs;
695  std::string model_name = std::string(name);
696  ofs.open(path+"/"+model_name, std::ofstream::out | std::ofstream::app);
697 
698  if (my_rank == 0){
699  // Dump the model coeffs
700  for(int i=0; i<nparam; i++){
701  ofs<<coeff_guess[i]<<" ";
702  }
703  ofs<<"\n";
704  }
705 
706  // Dump the training data
707  int num_records = std::min(nobs, (int64_t)hist_size);
708  for(int i=0; i<num_records; i++){
709  std::string instance = "";
710  for(int j=0; j<mat_lda; j++){
711  ofs<<time_param_mat[i*mat_lda+j]<<" ";
712  }
713  ofs<<"\n";
714  }
715  ofs.close();
716  }
717  rank++;
718  MPI_Barrier(MPI_COMM_WORLD);
719  }
720  }
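dump_data appends to one file per model under path, serializing the ranks with the barrier loop above: rank 0 first writes the current coefficients on a single line, then every rank appends its stored observations, one per line, with the measured time followed by the nparam parameter values (the same row layout as time_param_mat). Illustrative contents for a LinModel<2> on two ranks (file name and values hypothetical):

    //   <path>/example_mdl:
    //   1e-06 2e-09              coefficients (written by rank 0 only)
    //   0.00025 1 65536          one observation from rank 0: time, param 1, param 2
    //   0.00031 1 131072         one observation from rank 1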
721 
722 
723  template class LinModel<1>;
724  template class LinModel<2>;
725  template class LinModel<3>;
726  template class LinModel<4>;
727  template class LinModel<5>;
728 
735  static void cube_params(double const * param, double * lparam, int nparam){
736  //linear parameters
737  memcpy(lparam, param, nparam*sizeof(double));
738  int sq_idx = nparam;
739  int cu_idx = nparam+nparam*(nparam+1)/2;
740  for (int i=0; i<nparam; i++){
741  for (int j=0; j<=i; j++){
742  //quadratic parameters
743  double sqp = param[i]*param[j];
744  lparam[sq_idx] = sqp;
745  sq_idx++;
746  for (int k=0; k<=j; k++){
747  //cubic parameters
748  lparam[cu_idx] = sqp*param[k];
749  cu_idx++;
750  }
751  }
752  }
753  }
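cube_params expands the nparam raw parameters into every monomial of degree one, two, and three (no constant term), so a CubicModel<nparam> is simply a LinModel over nparam + nparam*(nparam+1)/2 + nparam*(nparam+1)*(nparam+2)/6 derived features. For nparam = 2, in the order produced by the loops above:

    //   param  = { p1, p2 }
    //   lparam = { p1, p2,                                  // linear     (2 terms)
    //              p1*p1, p2*p1, p2*p2,                     // quadratic  (3 terms)
    //              p1*p1*p1, p2*p1*p1, p2*p2*p1, p2*p2*p2 } // cubic      (4 terms)
    //   i.e. 2 + 3 + 4 = 9 features in total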
754 
755  /*static double * get_cube_param(double const * param, int nparam){
756  double * lparam = new double[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam];
757  cube_params(param, lparam, nparam);
758  return lparam;
759  }*/
760 
761 
762  template <int nparam>
763  CubicModel<nparam>::CubicModel(double const * init_guess, char const * name, int hist_size)
764  : lmdl(init_guess, name, hist_size)
765  { }
766 
767  template <int nparam>
768  CubicModel<nparam>::~CubicModel(){ }
769 
770  template <int nparam>
771  void CubicModel<nparam>::update(MPI_Comm cm){
772  lmdl.update(cm);
773  }
774 
775  template <int nparam>
776  void CubicModel<nparam>::observe(double const * time_param){
777  double ltime_param[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam+1];
778  ltime_param[0] = time_param[0];
779  cube_params(time_param+1, ltime_param+1, nparam);
780  lmdl.observe(ltime_param);
781  }
782 
783  template <int nparam>
784  bool CubicModel<nparam>::should_observe(double const * time_param){
785  return lmdl.should_observe(time_param);
786  }
787 
788  template <int nparam>
789  double CubicModel<nparam>::est_time(double const * param){
790  double lparam[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam];
791  cube_params(param, lparam, nparam);
792  return lmdl.est_time(lparam);
793  }
794 
795  template <int nparam>
796  void CubicModel<nparam>::print(){
797  lmdl.print();
798  }
799 
800  template <int nparam>
801  void CubicModel<nparam>::print_uo(){
802  lmdl.print_uo();
803  }
804 
805  template <int nparam>
806  double * CubicModel<nparam>::get_coeff(){
807  return lmdl.get_coeff();
808  }
809 
810  template <int nparam>
811  void CubicModel<nparam>::load_coeff(std::string file_name){
812  lmdl.load_coeff(file_name);
813  }
814 
815  template <int nparam>
816  void CubicModel<nparam>::write_coeff(std::string file_name){
817  lmdl.write_coeff(file_name);
818  }
819 
820  template <int nparam>
821  void CubicModel<nparam>::dump_data(std::string path){
822  lmdl.dump_data(path);
823  }
824 
825  template class CubicModel<1>;
826  template class CubicModel<2>;
827  template class CubicModel<3>;
828  template class CubicModel<4>;
829 }