15   Timer jacobi_spmv(
"jacobi_spmv");
    62   for (
int i=0; i<nsm; i++){
    83     x1[
"i"] = -1.*R[
"ij"]*x[
"j"];
    88     x[
"i"] += omega*x1[
"i"];
    92     r[
"i"] -= A[
"ij"]*x[
"j"];
    95     if (A.
wrld->
rank == 0) printf(
"r norm is %E\n",rnorm);
   103   char tlvl_name[] = {
'l',
'v',
'l',(char)(
'0'+nlevel),
'\0'};
   104   Timer tlvl(tlvl_name);
   108   r[
"i"] -= A[
"ij"]*x[
"j"];
   113   if (A.
wrld->
rank == 0) printf(
"At level %d residual norm was %1.2E initially\n",nlevel,rnorm0);
   121   r[
"i"] -= A[
"ij"]*x[
"j"];
   127     if (A.
wrld->
rank == 0) printf(
"At level %d (coarsest level), residual norm was %1.2E initially\n",nlevel,rnorm0);
   128     if (A.
wrld->
rank == 0) printf(
"At level %d (coarsest level), residual norm was %1.2E after smooth\n",nlevel,rnorm);
   132   int64_t m = P[0].
lens[1];
   135   Timer rstr(
"restriction");
   140   PTr[
"i"] += P[0][
"ji"]*r[
"j"];
   147   vcycle(PTAP[0], zx, PTr, P+1, PTAP+1, m, nlevel-1, nsm+1);
   151   x[
"i"] += P[0][
"ij"]*zx[
"j"]; 
   155   r[
"i"] -= A[
"ij"]*x[
"j"];
   163   r[
"i"] -= A[
"ij"]*x[
"j"];
   165   if (A.
wrld->
rank == 0) printf(
"At level %d, residual norm was %1.2E initially\n",nlevel,rnorm0);
   166   if (x.wrld->rank == 0) printf(
"At level %d, n=%ld residual norm was %1.2E after initial smooth\n",nlevel,N,rnorm);
   167   if (A.
wrld->
rank == 0) printf(
"At level %d, residual norm was %1.2E after coarse recursion\n",nlevel,rnorm2);
   168   if (A.
wrld->
rank == 0) printf(
"At level %d, residual norm was %1.2E after final smooth\n",nlevel,rnorm3);
   174   if (nlevel == 0) 
return;
   176   char slvl_name[] = {
's',
'l',
'v',
'l',(char)(
'0'+nlevel),
'\0'};
   177   Timer slvl(slvl_name);
   179   int64_t m = T[0].
lens[1];
   185   Timer trip(
"triple_matrix_product_to_form_T");
   188   F[
"ik"] = A[
"ij"]*T[0][
"jk"];
   189   P[0][
"ij"] = T[0][
"ij"];
   190   P[0][
"ik"] -= D[
"il"]*F[
"lk"];
   200   Timer trip2(
"triple_matrix_product_to_form_PTAP");
   203   AP[
"lj"] = A[
"lk"]*P[0][
"kj"];
   204   PTAP[0][
"ij"] = P[0][
"li"]*AP[
"lj"];
   208   setup(PTAP[0], T+1, m, nlevel-1, P+1, PTAP+1);
   226   double st_time = MPI_Wtime();
   227   vcycle(A, x_init, b, P, PTAP, N, nlvl, nsm);
   228   double vtime = MPI_Wtime()-st_time;
   234   r2[
"i"] -= A[
"ij"]*x2[
"j"];
   239   r[
"i"] -= A[
"ij"]*x_init[
"j"];
   242   bool pass = rnorm < rnorm_alt;
   246     printf(
"Algebraic multigrid with n %ld nlvl %d took %lf seconds, fine-grid only err = %E, multigrid err = %E\n",N,nlvl,vtime,rnorm_alt,rnorm); 
   249       printf(
"{ algebraic multigrid method } passed \n");
   251       printf(
"{ algebraic multigrid method } failed \n");
   267   Timer tct(
"initialization");
   278     printf(
"Generated matrix with dimension %1.2E and %1.2E nonzeros\n", (
REAL)n3, (
REAL)A.
nnz_tot);
   286   std::pair<REAL,int64_t> * new_vals;
   290   new_vals = (std::pair<REAL,int64_t>*)malloc(
sizeof(std::pair<REAL,int64_t>)*nvals);
   292   for (int64_t i=0; i<nvals; i++){
   293     new_vals[i] = std::pair<REAL,int64_t>(vals[i],
abs((inds[i]%n3) - (inds[i]/n3)));
   296   B.
write(nvals,inds,new_vals);
   302     int64_t x =  d.second % n;
   303     int64_t y = (d.second / n) % n;
   304     int64_t z =  d.second / n  / n;
   306       d.first = d.first/pow((
REAL)(x+y+z),decay_exp/2.);
   314   int tot_ndiv = ndiv*ndiv*ndiv;
   315   for (
int i=0; i<nlvl; i++){
   316     int64_t m2 = m/tot_ndiv;
   318     int64_t mmy = m2/dw.
np;
   319     if (dw.
rank < m2%dw.
np) mmy++;
   322     for (int64_t j=dw.
rank; j<m2; j+=dw.
np){
   323       for (
int k=0; k<tot_ndiv; k++){
   324         pairs[nel] = 
Pair<REAL>(j*m+j*tot_ndiv+k, 1.0);
   328     T[i].
write(nel, pairs);
   339   setup(A, T, n3, nlvl, P, PTAP);
   352   Timer tct(
"initialization");
   358   int64_t my_col = n3/dw.
np;
   359   int64_t my_col_st = dw.
rank*my_col;
   360   if (dw.
rank < n%dw.
np) my_col++;
   361   my_col_st += std::min((
int)dw.
rank, n3%dw.
np);
   362   int64_t my_tot_nnz = my_col*3;
   363   int64_t * inds = (int64_t*)malloc(
sizeof(int64_t)*my_tot_nnz);
   366   int64_t act_tot_nnz = 0;
   367   for (int64_t col=my_col_st; col<my_col_st+my_col; col++){
   369       inds[act_tot_nnz] = col*n3 + col+1;
   370       vals[act_tot_nnz] = -1.;
   373     if (col+n < n3 && (col/n+1)%n != 0){
   374       inds[act_tot_nnz] = col*n3 + col+n;
   375       vals[act_tot_nnz] = -1.;
   378     if (col+n*n < n3 && (col/(n*n)+1)%n != 0){
   379       inds[act_tot_nnz] = col*n3 + col+n*n;
   380       vals[act_tot_nnz] = -1.;
   384   A.
write(act_tot_nnz, inds, vals);
   391     printf(
"Generated matrix with dimension %1.2E and %1.2E nonzeros\n", (
REAL)n3, (
REAL)A.
nnz_tot);
   398   int tot_ndiv = ndiv*ndiv*ndiv;
   399   for (
int i=0; i<nlvl; i++){
   400     int64_t m2 = m/tot_ndiv;
   402     int64_t mmy = m2/dw.
np;
   403     if (dw.
rank < m2%dw.
np) mmy++;
   407     for (int64_t j=dw.
rank; j<m2; j+=dw.
np){
   408       int64_t j1 = j/(nn*nn);
   409       int64_t j2 = (j/nn)%nn;
   411       for (
int k1=0; k1<ndiv; k1++){
   412         for (
int k2=0; k2<ndiv; k2++){
   413           for (
int k3=0; k3<ndiv; k3++){
   415             pairs[nel] = 
Pair<REAL>(j*m+(j1*ndiv+k1)*nn*nn+(j2*ndiv+k2)*nn+j3*ndiv+k3, 1.0/tot_ndiv);
   421     T[i].
write(nel, pairs);
   433   setup(A, T, n3, nlvl, P, PTAP);
   443   char ** itr = std::find(begin, end, option);
   444   if (itr != end && ++itr != end){
   451 int main(
int argc, 
char ** argv){
   452   int rank, 
np, pass, nlvl, ndiv, decay_exp, nsmooth, poi;
   456   int const in_num = argc;
   457   char ** input_str = argv;
   459   MPI_Init(&argc, &argv);
   460   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   461   MPI_Comm_size(MPI_COMM_WORLD, &np);
   464     n = atoi(
getCmdOption(input_str, input_str+in_num, 
"-n"));
   468   if (
getCmdOption(input_str, input_str+in_num, 
"-nlvl")){
   469     nlvl = atoi(
getCmdOption(input_str, input_str+in_num, 
"-nlvl"));
   470     if (nlvl < 0) nlvl = 3;
   473   if (
getCmdOption(input_str, input_str+in_num, 
"-ndiv")){
   474     ndiv = atoi(
getCmdOption(input_str, input_str+in_num, 
"-ndiv"));
   475     if (ndiv < 0) ndiv = 2;
   478   if (
getCmdOption(input_str, input_str+in_num, 
"-nsmooth")){
   479     nsmooth = atoi(
getCmdOption(input_str, input_str+in_num, 
"-nsmooth"));
   480     if (nsmooth < 0) nsmooth = 3;
   483   nsm = (
int*)malloc(
sizeof(
int)*nlvl);
   484   std::fill(nsm, nsm+nlvl, nsmooth);
   486   char str[] = {
'-',
'n',
's',
'm',
'0',
'\0'};
   487   for (
int i=0; i<nlvl; i++){
   490       int insm = atoi(
getCmdOption(input_str, input_str+in_num, str));
   491       if (insm > 0) nsm[i] = insm;
   496     poi = atoi(
getCmdOption(input_str, input_str+in_num, 
"-poi"));
   497     if (poi < 0) poi = 0;
   501   if (
getCmdOption(input_str, input_str+in_num, 
"-decay_exp")){
   502     decay_exp = atoi(
getCmdOption(input_str, input_str+in_num, 
"-decay_exp"));
   503     if (decay_exp < 0) decay_exp = 3;
   504   } 
else decay_exp = 3;
   506   if (
getCmdOption(input_str, input_str+in_num, 
"-sp_frac")){
   507     sp_frac = atof(
getCmdOption(input_str, input_str+in_num, 
"-sp_frac"));
   508     if (sp_frac < 0) sp_frac = .01;
   509   } 
else sp_frac = .01;
   512   int64_t all_lvl_ndiv=1;
   513   for (
int i=0; i<nlvl; i++){ all_lvl_ndiv *= ndiv; }
   515   assert(n%all_lvl_ndiv == 0);
   518     World dw(argc, argv);
   521       printf(
"Running algebraic smoothed multigrid method with %d levels with divisor %d in V-cycle, matrix dimension %ld, %d smooth iterations, decayed based on 3D indexing with decay exponent of %d\n",nlvl,ndiv,n,nsmooth, decay_exp);
   522       printf(
"number of smoothing iterations per level is ");
   523       for (
int i=0; i<nlvl+1; i++){ printf(
"%d ",nsm[i]); }
   539       for (int64_t i=0; i<nloc; i++){
   540         vals[i] = (1./(n1))*(1./(n1))*sin(h*M_PI*(1+(inds[i]/(n*n))))*sin(h*M_PI*(1+((inds[i]/n)%n)))*sin(h*M_PI*(1+(inds[i]%n)));
   542       b.
write(nloc,inds,vals);
   543       for (int64_t i=0; i<nloc; i++){
   544         vals[i] = (1./(3.*M_PI*M_PI))*sin(h*M_PI*(1+(inds[i]/(n*n))))*sin(h*M_PI*(1+((inds[i]/n)%n)))*sin(h*M_PI*(1+(inds[i]%n)));
   548       x_t.
write(nloc,inds,vals);
   549       r[
"i"] = A[
"ij"]*x_t[
"j"];
   552       if (dw.
rank == 0) printf(
"Truncation error norm is %1.2E\n",tnorm);
 Set class defined by a datatype and a min/max function (if it is partially ordered i...
void smooth_jacobi(Matrix< REAL > &A, Vector< REAL > &x, Vector< REAL > &b, int nsm)
Matrix class which encapsulates a 2D tensor. 
int jacobi(int n, World &dw)
int test_alg_multigrid(int64_t N, int nlvl, int *nsm, Matrix< REAL > &A, Vector< REAL > &b, Vector< REAL > &x_init, Matrix< REAL > *P, Matrix< REAL > *PTAP)
computes Multigrid for a 3D regular discretization 
void setup(Matrix< REAL > &A, Matrix< REAL > *T, int N, int nlevel, Matrix< REAL > *P, Matrix< REAL > *PTAP)
local process walltime measurement 
Vector class which encapsulates a 1D tensor. 
void setup_3d_Poisson(int64_t n, int nlvl, int ndiv, Matrix< REAL > &A, Matrix< REAL > *&P, Matrix< REAL > *&PTAP, World &dw)
an instance of the CTF library (world) on a MPI communicator 
bool is_sparse
whether only the non-zero elements of the tensor are stored 
dtype norm2()
computes the frobenius norm of the tensor (needs sqrt()!) 
index-value pair used for tensor data input 
CTF::World * wrld
distributed processor context on which tensor is defined 
void fill_random(dtype rmin, dtype rmax)
fills local unique tensor elements to random values in the range [min,max] works only for dtype in {f...
int rank
rank of local processor 
int main(int argc, char **argv)
int * lens
unpadded tensor edge lengths 
char * getCmdOption(char **begin, char **end, const std::string &option)
int64_t nnz_tot
maximum number of local nonzero elements over all procs 
void print(FILE *fp, dtype cutoff) const 
prints tensor data to file using process 0 (modify print(...) overload in set.h if you would like a d...
epoch during which to measure timers 
void fill_sp_random(dtype rmin, dtype rmax, double frac_sp)
generate roughly frac_sp*dense_tensor_size nonzeros between rmin and rmax, works only for dtype in {f...
void get_local_data(int64_t *npair, int64_t **global_idx, dtype **data, bool nonzeros_only=false, bool unpack_sym=false) const 
Gives the global indices and values associated with the local data. 
void setup_unstructured(int64_t n, int nlvl, REAL sp_frac, int ndiv, int decay_exp, Matrix< REAL > &A, Matrix< REAL > *&P, Matrix< REAL > *&PTAP, World &dw)
std::complex< double > omega(int i, int n)
void vcycle(Matrix< REAL > &A, Vector< REAL > &x, Vector< REAL > &b, Matrix< REAL > *P, Matrix< REAL > *PTAP, int64_t N, int nlevel, int *nsm)
void write(int64_t npair, int64_t const *global_idx, dtype const *data)
writes in values associated with any set of indices The sparse data is defined in coordinate format...
int np
number of processors