2 #include "../shared/lapack_symbs.h"     3 #include "../shared/blas_symbs.h"     5 #include "../shared/util.h"    11     static std::vector<Model*> all_models;
    58 #define SPLINE_CHUNK_SZ = 8    60   double cddot(
int n,       
const double *dX,
    61                int incX,    
const double *dY,
    94     CTF_LAPACK::cdormqr(SIDE, TRANS, M, N, K, A, LDA, TAU2, C, LDC, WORK, LWORK, INFO);
   102   void cdgelsd(
int m, 
int n, 
int k, 
double const * A, 
int lda_A, 
double * B, 
int lda_B, 
double * S, 
double cond, 
int * 
rank, 
double * work, 
int lwork, 
int * iwork, 
int * info){
   104     CTF_LAPACK::cdgelsd(m, n, k, A, lda_A, B, lda_B, S, cond, rank, work, lwork, iwork, info);
   108   template <
int nparam>
   113   template <
int nparam>
   115     return a.
p[0] > b.
p[0];
   119 #define REG_LAMBDA 1.E6   121   template <
int nparam>
   126     memcpy(coeff_guess, init_guess, nparam*
sizeof(
double));
   131     name = (
char*)
alloc(strlen(name_)+1);
   134     hist_size = hist_size_;
   136     time_param_mat = (
double*)
alloc(mat_lda*hist_size*
sizeof(
double));
   142     avg_under_time = 0.0;
   150   template <
int nparam>
   155     time_param_mat = NULL;
   158   template <
int nparam>
   162     if (time_param_mat != NULL) 
cdealloc(time_param_mat);
   167   template <
int nparam>
   179       if (est_time(tp+1)>tp[0]){
   180         under_time += est_time(tp+1)-tp[0];
   182         over_time += tp[0]-est_time(tp+1);
   194     assert(tp[0] >= 0.0);
   197     memcpy(time_param_mat+(nobs%hist_size)*mat_lda, tp, mat_lda*
sizeof(
double));
   214   template <
int nparam>
   226   template <
int nparam>
   244     MPI_Comm_size(cm, &np);
   246     MPI_Comm_rank(cm, &rk);
   251     int nrcol = std::min(nobs,(int64_t)hist_size);
   253     int ncol = std::max(nrcol, nparam);
   260     MPI_Allreduce(&nrcol, &tot_nrcol, 1, MPI_INT, MPI_SUM, cm);
   263     if (tot_nrcol >= 16.*np*nparam){
   269       if (nrcol >= nparam) ncol += nparam;
   271       double * R = (
double*)
alloc(
sizeof(
double)*nparam*nparam);
   272       double * 
b = (
double*)
alloc(
sizeof(
double)*ncol);
   275         std::fill(R, R+nparam*nparam, 0.0);
   276         std::fill(b, b+ncol, 0.0);
   283         double * A = (
double*)
alloc(
sizeof(
double)*nparam*ncol);
   305           for (
int i=0; i<nparam; i++){
   307             for (
int j=0; j<nparam; j++){
   309                 if (coeff_guess[i] != 0.0){
   310                   A[ncol*j+i] = std::min(
REG_LAMBDA,(avg_tot_time/coeff_guess[i])/1000.);
   314               } 
else      A[ncol*j+i] = 0.0;
   325         for (
int i=i_st; i<ncol; i++){
   331             for (
int j=0; j<nparam; j++){
   337             b[i] = time_param_mat[(i-i_st)*mat_lda];
   341             for (
int j=0; j<nparam; j++){
   342               A[i+j*ncol] = time_param_mat[(i-i_st)*mat_lda+j+1];
   354         if (
false && np == 1){
   355           cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info);
   358           work = (
double*)
alloc(
sizeof(
double)*lwork);
   359           iwork = (
int*)
alloc(
sizeof(
int)*liwork);
   360           std::fill(iwork, iwork+liwork, 0);
   361           cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info);
   367           memcpy(coeff_guess, b, nparam*
sizeof(
double));
   384         double * tau = (
double*)
alloc(
sizeof(
double)*nparam);
   388         cdgeqrf(ncol, nparam, A, ncol, tau, &dlwork, -1, &info);
   390         double * work = (
double*)
alloc(
sizeof(
double)*lwork);
   391         cdgeqrf(ncol, nparam, A, ncol, tau, work, lwork, &info);
   392         lda_cpy(
sizeof(
double), nparam, nparam, ncol, nparam, (
const char *)A, (
char*)R);
   393         for (
int i=0; i<nparam; i++){
   394           for (
int j=i+1; j<nparam; j++){
   399         cdormqr(
'L', 
'T', ncol, 1, nparam, A, ncol, tau, b, ncol, &dlwork, -1, &info);
   402         work = (
double*)
alloc(
sizeof(
double)*lwork);
   404         cdormqr(
'L', 
'T', ncol, 1, nparam, A, ncol, tau, b, ncol, work, lwork, &info);
   411       MPI_Comm_split(cm, rk<sub_np, rk, &sub_comm);
   416         double * all_R = (
double*)
alloc(
sizeof(
double)*nparam*nparam*sub_np);
   418         double * all_b = (
double*)
alloc(
sizeof(
double)*nparam*sub_np);
   420         MPI_Allgather(R, nparam*nparam, MPI_DOUBLE, all_R, nparam*nparam, MPI_DOUBLE, sub_comm);
   421         double * Rs = (
double*)
alloc(
sizeof(
double)*nparam*nparam*sub_np);
   422         for (
int i=0; i<sub_np; i++){
   423           lda_cpy(
sizeof(
double), nparam, nparam, nparam, sub_np*nparam, (
const char *)(all_R+i*nparam*nparam), (
char*)(Rs+i*nparam));
   426         MPI_Allgather(b, nparam, MPI_DOUBLE, all_b, nparam, MPI_DOUBLE, sub_comm);
   430         ncol = sub_np*nparam;
   448         cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, &dlwork, -1, &liwork, &info);
   451         work = (
double*)
alloc(
sizeof(
double)*lwork);
   452         iwork = (
int*)
alloc(
sizeof(
int)*liwork);
   453         std::fill(iwork, iwork+liwork, 0);
   454         cdgelsd(ncol, nparam, 1, A, ncol, b, ncol, S, -1, &rank, work, lwork, iwork, &info);
   460         memcpy(coeff_guess, b, nparam*
sizeof(
double));
   474       MPI_Comm_free(&sub_comm);
   476       MPI_Bcast(coeff_guess, nparam, MPI_DOUBLE, 0, cm);
   485     double tot_time_total;
   486     double over_time_total;
   487     double under_time_total;
   488     MPI_Allreduce(&tot_time, &tot_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
   489     MPI_Allreduce(&over_time, &over_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
   490     MPI_Allreduce(&under_time, &under_time_total, 1, MPI_DOUBLE, MPI_SUM, cm);
   498     char * min_obs_env = getenv(
"MIN_OBS");
   500       min_obs = std::stoi(min_obs_env);
   504     double threshold = 0.05;
   505     char * threshold_env = getenv(
"THRESHOLD");
   507       threshold = std::stod(threshold_env);
   511     double under_time_ratio = under_time_total/tot_time_total;
   512     double over_time_ratio = over_time_total/tot_time_total;
   515     if (tot_nrcol >= min_obs  && under_time_ratio < threshold && over_time_ratio < threshold && threshold < threshold){
   517       std::cout<<
"Model "<<name<<
" has been turned off"<<std::endl;
   519     avg_tot_time = tot_time_total/
np;
   520     avg_over_time = over_time_total/
np;
   521     avg_under_time = under_time_total/
np;
   529   template <
int nparam>
   532     for (
int i=0; i<nparam; i++){
   533       d+=param[i]*coeff_guess[i];
   535     return std::max(0.0,d);
   538   template <
int nparam>
   541     printf(
"double %s_init[] = {",name);
   542     for (
int i=0; i<nparam; i++){
   543       if (i>0) printf(
", ");
   544       printf(
"%1.4E", coeff_guess[i]);
   549   template <
int nparam>
   551     printf(
"%s is_tuned = %d (%ld) avg_tot_time = %lf avg_over_time = %lf avg_under_time = %lf\n",name,(
int)is_tuned,nobs,avg_tot_time,avg_over_time,avg_under_time);
   555   template <
int nparam>
   560   template <
int nparam>
   563     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   572     for(
int i =0; i<nparam; i++){
   574       std::sprintf(buffer,
"%1.4E", coeff_guess[i]);
   577       if (i != nparam - 1){
   578         new_coeff_str += 
" ";
   583     std::vector<std::string> file_content;
   584     std::ifstream infile(file_name);
   586     bool found_line = 
false;
   591       while(std::getline(infile,line)){
   592         std::istringstream f(line);
   595         std::getline(f,s,
' ');
   596         if (s == model_name){
   597           line = new_coeff_str;
   601         file_content.push_back(line);
   607       new_coeff_str += 
"\n";
   608       file_content.push_back(new_coeff_str);
   611     ofs.open(file_name, std::ofstream::out | std::ofstream::trunc);
   612     for(
int i=0; i<(int)file_content.size(); i++){
   613       ofs<<file_content[i];
   621   template <
int nparam>
   627     std::vector<std::string> file_content;
   628     std::ifstream infile(file_name);
   630       std::cout<<
"file "<<file_name<<
" does not exist"<<std::endl;
   635     bool found_line = 
false;
   637     bool right_num_coeff = 
true;
   641     while(std::getline(infile,line)){
   642       std::istringstream f(line);
   645       std::getline(f,s,
' ');
   646       if (s == model_name){
   651         for(
int i=0; i<nparam; i++){
   652           if(!std::getline(f,s,
' ')){
   653             right_num_coeff = 
false;
   658           char buf[s.length()+1];
   659           for(
int j=0;j<(int)s.length();j++){
   662           buf[s.length()] = 
'\0';
   663           coeff_guess[i] = std::atof(buf);
   666         if(right_num_coeff && std::getline(f,s,
' ')){
   667           right_num_coeff = 
false;
   674       std::cout<<
"Error! No model found in the file!"<<std::endl;
   676     else if (!right_num_coeff){
   677       std::cout<<
"Error! Number of coefficients in file does not match with the model"<<std::endl;
   679       for(
int i = 0; i < nparam;i++){
   680         coeff_guess[i] = 0.0;
   685   template <
int nparam>
   689     MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
   690     MPI_Comm_size(MPI_COMM_WORLD, &np);
   692         if (rank == my_rank){
   696         ofs.open(path+
"/"+model_name, std::ofstream::out | std::ofstream::app);
   700             for(
int i=0; i<nparam; i++){
   701               ofs<<coeff_guess[i]<<
" ";
   707         int num_records = std::min(nobs, (int64_t)hist_size);
   708         for(
int i=0; i<num_records; i++){
   710            for(
int j=0; j<mat_lda; j++){
   711              ofs<<time_param_mat[i*mat_lda+j]<<
" ";
   718       MPI_Barrier(MPI_COMM_WORLD);
   735   static void cube_params(
double const * param, 
double * lparam, 
int nparam){
   737     memcpy(lparam, param, nparam*
sizeof(
double));
   739     int cu_idx = nparam+nparam*(nparam+1)/2;
   740     for (
int i=0; i<nparam; i++){
   741       for (
int j=0; j<=i; j++){
   743         double sqp = param[i]*param[j];
   744         lparam[sq_idx] = sqp;
   746         for (
int k=0; k<=j; k++){
   748           lparam[cu_idx] = sqp*param[k];
   762   template <
int nparam>
   764     : lmdl(init_guess, name, hist_size)
   767   template <
int nparam>
   770   template <
int nparam>
   775   template <
int nparam>
   777     double ltime_param[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam+1];
   778     ltime_param[0] = time_param[0];
   779     cube_params(time_param+1, ltime_param+1, nparam);
   783   template <
int nparam>
   788   template <
int nparam>
   790     double lparam[nparam*(nparam+1)*(nparam+2)/6+nparam*(nparam+1)/2+nparam];
   791     cube_params(param, lparam, nparam);
   795   template <
int nparam>
   800   template <
int nparam>
   805   template <
int nparam>
   810   template <
int nparam>
   815   template <
int nparam>
   820   template <
int nparam>
 void load_coeff(std::string file_name)
load model coefficients from file 
void load_all_models(std::string file_name)
void cdormqr(char SIDE, char TRANS, int M, int N, int K, double const *A, int LDA, double const *TAU2, double *C, int LDC, double *WORK, int LWORK, int *INFO)
void update_all_models(MPI_Comm comm)
double est_time(double const *param)
estimates model time based on observations 
void observe(double const *time_param)
records observation consisting of execution time and nparam parameter values 
void write_all_models(std::string file_name)
double DDOT(int *n, const double *dX, int *incX, const double *dY, int *incY)
void update(MPI_Comm cm)
updates model based on observations 
void cdgeqrf(int const M, int const N, double *A, int const LDA, double *TAU2, double *WORK, int const LWORK, int *INFO)
void * alloc(int64_t len)
alloc abstraction 
void print_uo()
prints time estimate errors 
double * get_coeff()
return the tuned model coefficients 
void load_coeff(std::string file_name)
load model coefficients from file 
double est_time(double const *param)
estimates model time based on observations 
Cubic performance model which, given measurements, provides a new model guess. 
void observe(double const *time_param)
records observation consisting of execution time and nparam parameter values 
void cdgelsd(int m, int n, int k, double const *A, int lda_A, double *B, int lda_B, double *S, double cond, int *rank, double *work, int lwork, int *iwork, int *info)
Linear performance model which, given measurements, provides a new model guess. 
void write_coeff(std::string file_name)
write model coefficients to file 
void dump_all_models(std::string path)
void update(MPI_Comm cm)
updates model based on observations 
void cdormqr(char SIDE, char TRANS, int M, int N, int K, double const *A, int LDA, double const *TAU2, double *C, int LDC, double *WORK, int LWORK, int *INFO)
void cdgeqrf(int M, int N, double *A, int LDA, double *TAU2, double *WORK, int LWORK, int *INFO)
void write_coeff(std::string file_name)
write model coefficients to file 
bool should_observe(double const *time_param)
decides whether the current instance should be observed 
void print_uo()
prints time estimate errors 
CubicModel(double const *init_guess, char const *name, int hist_size=8192)
constructor 
void dump_data(std::string path)
dump model data to a file 
std::vector< Model * > & get_all_models()
void cdgelsd(int m, int n, int k, double const *A, int lda_A, double *B, int lda_B, double *S, double cond, int *rank, double *work, int lwork, int *iwork, int *info)
int cdealloc(void *ptr)
free abstraction 
void lda_cpy(int el_size, int nrow, int ncol, int lda_A, int lda_B, const char *A, char *B)
Copies submatrix to submatrix (column-major) 
double cddot(int n, const double *dX, int incX, const double *dY, int incY)
void dump_data(std::string path)
dump model data to a file 
bool comp_time_param(const time_param< nparam > &a, const time_param< nparam > &b)
void print()
prints current parameter estimates 
void print()
prints current parameter estimates 
bool should_observe(double const *time_param)
decides whether the current instance should be observed 
double * get_coeff()
return the tuned model coefficients