2 #include "../shared/lapack_symbs.h" 3 #include "../shared/blas_symbs.h" 5 #include "../shared/util.h" 11 static std::vector<Model*> all_models;
58 #define SPLINE_CHUNK_SZ = 8 60 double cddot(
int n,
const double *dX,
61 int incX,
const double *dY,
94 CTF_LAPACK::cdormqr(SIDE, TRANS, M, N, K, A, LDA, TAU2, C, LDC, WORK, LWORK, INFO);
102 void cdgelsd(
int m,
int n,
int k,
double const * A,
int lda_A,
double * B,
int lda_B,
double * S,
double cond,
int *
rank,
double * work,
int lwork,
int * iwork,
int * info){
104 CTF_LAPACK::cdgelsd(m, n, k, A, lda_A, B, lda_B, S, cond, rank, work, lwork, iwork, info);
108 template <
int nparam>
113 template <
int nparam>
115 return a.
p[0] > b.
p[0];
119 #define REG_LAMBDA 1.E6 121 template <
int nparam>
126 memcpy(coeff_guess, init_guess, nparam*
sizeof(
double));
131 name = (
char*)
alloc(strlen(name_)+1);
134 hist_size = hist_size_;
136 time_param_mat = (
double*)
alloc(mat_lda*hist_size*
sizeof(
double));
142 avg_under_time = 0.0;
150 template <
int nparam>
155 time_param_mat = NULL;
158 template <
int nparam>
162 if (time_param_mat != NULL)
cdealloc(time_param_mat);
167 template <
int nparam>
179 if (est_time(tp+1)>tp[0]){
180 under_time += est_time(tp+1)-tp[0];
182 over_time += tp[0]-est_time(tp+1);
194 assert(tp[0] >= 0.0);
197 memcpy(time_param_mat+(nobs%hist_size)*mat_lda, tp, mat_lda*
sizeof(
double));
214 template <
int nparam>
226 template <
int nparam>
244 MPI_Comm_size(cm, &np);
246 MPI_Comm_rank(cm, &rk);
251 int nrcol = std::min(nobs,(int64_t)hist_size);
253 int ncol = std::max(nrcol, nparam);
260 MPI_Allreduce(&nrcol, &tot_nrcol, 1, MPI_INT, MPI_SUM, cm);
263 if (tot_nrcol >= 16.*np*nparam){
269 if (nrcol >= nparam) ncol += nparam;
271 double * R = (
double*)
alloc(
sizeof(
double)*nparam*nparam);
272 double *
b = (
double*)
alloc(
sizeof(
double)*ncol);
275 std::fill(R, R+nparam*nparam, 0.0);
276 std::fill(b, b+ncol, 0.0);
283 double * A = (
double*)
alloc(
sizeof(
double)*nparam*ncol);
305 for (
int i=0; i<nparam; i++){
307 for (
int j=0; j<nparam; j++){
309 if (coeff_guess[i] != 0.0){
310 A[ncol*j+i] = std::min(
REG_LAMBDA,(avg_tot_time/coeff_guess[i])/1000.);
314 }
else A[ncol*j+i] = 0.0;
325 for (
int i=i_st; i<ncol; i++){
331 for (
int j=0; j<nparam; j++){
337 b[i] = time_param_mat[(i-i_st)*mat_lda];
341 for (
int j=0; j<nparam; j++){
342 A[i+j*ncol] = time_param_mat[(i-i_st)*mat_lda+j+1];
354 if (
false && np == 1){
355 cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info);
358 work = (
double*)
alloc(
sizeof(
double)*lwork);
359 iwork = (
int*)
alloc(
sizeof(
int)*liwork);
360 std::fill(iwork, iwork+liwork, 0);
361 cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info);
367 memcpy(coeff_guess, b, nparam*
sizeof(
double));
384 double * tau = (
double*)
alloc(
sizeof(
double)*nparam);
388 cdgeqrf(ncol, nparam, A, ncol, tau, &dlwork, -1, &info);
390 double * work = (
double*)
alloc(
sizeof(
double)*lwork);
391 cdgeqrf(ncol, nparam, A, ncol, tau, work, lwork, &info);
392 lda_cpy(
sizeof(
double), nparam, nparam, ncol, nparam, (
const char *)A, (
char*)R);
393 for (
int i=0; i<nparam; i++){
394 for (
int j=i+1; j<nparam; j++){
399 cdormqr(
'L',
'T', ncol, 1, nparam, A, ncol, tau, b, ncol, &dlwork, -1, &info);
402 work = (
double*)
alloc(
sizeof(
double)*lwork);
404 cdormqr(
'L',
'T', ncol, 1, nparam, A, ncol, tau, b, ncol, work, lwork, &info);
411 MPI_Comm_split(cm, rk<sub_np, rk, &sub_comm);
416 double * all_R = (
double*)
alloc(
sizeof(
double)*nparam*nparam*sub_np);
418 double * all_b = (
double*)
alloc(
sizeof(
double)*nparam*sub_np);
420 MPI_Allgather(R, nparam*nparam, MPI_DOUBLE, all_R, nparam*nparam, MPI_DOUBLE, sub_comm);
421 double * Rs = (
double*)
alloc(
sizeof(
double)*nparam*nparam*sub_np);
422 for (
int i=0; i<sub_np; i++){
423 lda_cpy(
sizeof(
double), nparam, nparam, nparam, sub_np*nparam, (
const char *)(all_R+i*nparam*nparam), (
char*)(Rs+i*nparam));
426 MPI_Allgather(b, nparam, MPI_DOUBLE, all_b, nparam, MPI_DOUBLE, sub_comm);
430 ncol = sub_np*nparam;
448 cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info);
451 work = (
double*)
alloc(
sizeof(
double)*lwork);
452 iwork = (
int*)
alloc(
sizeof(
int)*liwork);
453 std::fill(iwork, iwork+liwork, 0);
454 cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info);
460 memcpy(coeff_guess, b, nparam*
sizeof(
double));
474 MPI_Comm_free(&sub_comm);
476 MPI_Bcast(coeff_guess, nparam, MPI_DOUBLE, 0, cm);
485 double tot_time_total;
486 double over_time_total;
487 double under_time_total;
488 MPI_Allreduce(&tot_time, &tot_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
489 MPI_Allreduce(&over_time, &over_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
490 MPI_Allreduce(&under_time, &under_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
498 char * min_obs_env = getenv(
"MIN_OBS");
500 min_obs = std::stoi(min_obs_env);
504 double threshold = 0.05;
505 char * threshold_env = getenv(
"THRESHOLD");
507 threshold = std::stod(threshold_env);
511 double under_time_ratio = under_time_total/tot_time_total;
512 double over_time_ratio = over_time_total/tot_time_total;
515 if (tot_nrcol >= min_obs && under_time_ratio < threshold && over_time_ratio < threshold && threshold < threshold){
517 std::cout<<
"Model "<<name<<
" has been turned off"<<std::endl;
519 avg_tot_time = tot_time_total/
np;
520 avg_over_time = over_time_total/
np;
521 avg_under_time = under_time_total/
np;
529 template <
int nparam>
532 for (
int i=0; i<nparam; i++){
533 d+=param[i]*coeff_guess[i];
535 return std::max(0.0,d);
538 template <
int nparam>
541 printf(
"double %s_init[] = {",name);
542 for (
int i=0; i<nparam; i++){
543 if (i>0) printf(
", ");
544 printf(
"%1.4E", coeff_guess[i]);
549 template <
int nparam>
551 printf(
"%s is_tuned = %d (%ld) avg_tot_time = %lf avg_over_time = %lf avg_under_time = %lf\n",name,(
int)is_tuned,nobs,avg_tot_time,avg_over_time,avg_under_time);
555 template <
int nparam>
560 template <
int nparam>
563 MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
572 for(
int i =0; i<nparam; i++){
574 std::sprintf(buffer,
"%1.4E", coeff_guess[i]);
577 if (i != nparam - 1){
578 new_coeff_str +=
" ";
583 std::vector<std::string> file_content;
584 std::ifstream infile(file_name);
586 bool found_line =
false;
591 while(std::getline(infile,line)){
592 std::istringstream f(line);
595 std::getline(f,s,
' ');
596 if (s == model_name){
597 line = new_coeff_str;
601 file_content.push_back(line);
607 new_coeff_str +=
"\n";
608 file_content.push_back(new_coeff_str);
611 ofs.open(file_name, std::ofstream::out | std::ofstream::trunc);
612 for(
int i=0; i<(int)file_content.size(); i++){
613 ofs<<file_content[i];
621 template <
int nparam>
627 std::vector<std::string> file_content;
628 std::ifstream infile(file_name);
630 std::cout<<
"file "<<file_name<<
" does not exist"<<std::endl;
635 bool found_line =
false;
637 bool right_num_coeff =
true;
641 while(std::getline(infile,line)){
642 std::istringstream f(line);
645 std::getline(f,s,
' ');
646 if (s == model_name){
651 for(
int i=0; i<nparam; i++){
652 if(!std::getline(f,s,
' ')){
653 right_num_coeff =
false;
658 char buf[s.length()+1];
659 for(
int j=0;j<(int)s.length();j++){
662 buf[s.length()] =
'\0';
663 coeff_guess[i] = std::atof(buf);
666 if(right_num_coeff && std::getline(f,s,
' ')){
667 right_num_coeff =
false;
674 std::cout<<
"Error! No model found in the file!"<<std::endl;
676 else if (!right_num_coeff){
677 std::cout<<
"Error! Number of coefficients in file does not match with the model"<<std::endl;
679 for(
int i = 0; i < nparam;i++){
680 coeff_guess[i] = 0.0;
685 template <
int nparam>
689 MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
690 MPI_Comm_size(MPI_COMM_WORLD, &np);
692 if (rank == my_rank){
696 ofs.open(path+
"/"+model_name, std::ofstream::out | std::ofstream::app);
700 for(
int i=0; i<nparam; i++){
701 ofs<<coeff_guess[i]<<
" ";
707 int num_records = std::min(nobs, (int64_t)hist_size);
708 for(
int i=0; i<num_records; i++){
710 for(
int j=0; j<mat_lda; j++){
711 ofs<<time_param_mat[i*mat_lda+j]<<
" ";
718 MPI_Barrier(MPI_COMM_WORLD);
735 static void cube_params(
double const * param,
double * lparam,
int nparam){
737 memcpy(lparam, param, nparam*
sizeof(
double));
739 int cu_idx = nparam+nparam*(nparam+1)/2;
740 for (
int i=0; i<nparam; i++){
741 for (
int j=0; j<=i; j++){
743 double sqp = param[i]*param[j];
744 lparam[sq_idx] = sqp;
746 for (
int k=0; k<=j; k++){
748 lparam[cu_idx] = sqp*param[k];
762 template <
int nparam>
764 : lmdl(init_guess, name, hist_size)
767 template <
int nparam>
770 template <
int nparam>
775 template <
int nparam>
777 double ltime_param[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam+1];
778 ltime_param[0] = time_param[0];
779 cube_params(time_param+1, ltime_param+1, nparam);
783 template <
int nparam>
788 template <
int nparam>
790 double lparam[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam];
791 cube_params(param, lparam, nparam);
795 template <
int nparam>
800 template <
int nparam>
805 template <
int nparam>
810 template <
int nparam>
815 template <
int nparam>
820 template <
int nparam>
void load_coeff(std::string file_name)
load model coefficients from file
void load_all_models(std::string file_name)
void cdormqr(char SIDE, char TRANS, int M, int N, int K, double const *A, int LDA, double const *TAU2, double *C, int LDC, double *WORK, int LWORK, int *INFO)
void update_all_models(MPI_Comm comm)
double est_time(double const *param)
estimates model time based on observations
void observe(double const *time_param)
records observation consisting of execution time and nparam parameter values
void write_all_models(std::string file_name)
double DDOT(int *n, const double *dX, int *incX, const double *dY, int *incY)
void update(MPI_Comm cm)
updates model based on observations
void cdgeqrf(int const M, int const N, double *A, int const LDA, double *TAU2, double *WORK, int const LWORK, int *INFO)
void * alloc(int64_t len)
alloc abstraction
void print_uo()
prints time estimate errors
double * get_coeff()
return the tuned model coefficients
void load_coeff(std::string file_name)
load model coefficients from file
double est_time(double const *param)
estimates model time based on observations
Cubic performance model which, given measurements, provides a new model guess.
void observe(double const *time_param)
records observation consisting of execution time and nparam parameter values
void cdgelsd(int m, int n, int k, double const *A, int lda_A, double *B, int lda_B, double *S, double cond, int *rank, double *work, int lwork, int *iwork, int *info)
Linear performance model which, given measurements, provides a new model guess.
void write_coeff(std::string file_name)
write model coefficients to file
void dump_all_models(std::string path)
void update(MPI_Comm cm)
updates model based on observations
void cdormqr(char SIDE, char TRANS, int M, int N, int K, double const *A, int LDA, double const *TAU2, double *C, int LDC, double *WORK, int LWORK, int *INFO)
void cdgeqrf(int M, int N, double *A, int LDA, double *TAU2, double *WORK, int LWORK, int *INFO)
void write_coeff(std::string file_name)
write model coefficients to file
bool should_observe(double const *time_param)
decides whether the current instance should be observed
void print_uo()
prints time estimate errors
CubicModel(double const *init_guess, char const *name, int hist_size=8192)
constructor
void dump_data(std::string path)
dump model data to a file
std::vector< Model * > & get_all_models()
void cdgelsd(int m, int n, int k, double const *A, int lda_A, double *B, int lda_B, double *S, double cond, int *rank, double *work, int lwork, int *iwork, int *info)
int cdealloc(void *ptr)
free abstraction
void lda_cpy(int el_size, int nrow, int ncol, int lda_A, int lda_B, const char *A, char *B)
Copies submatrix to submatrix (column-major)
double cddot(int n, const double *dX, int incX, const double *dY, int incY)
void dump_data(std::string path)
dump model data to a file
bool comp_time_param(const time_param< nparam > &a, const time_param< nparam > &b)
void print()
prints current parameter estimates
void print()
prints current parameter estimates
bool should_observe(double const *time_param)
decides whether the current instance should be observed
double * get_coeff()
return the tuned model coefficients