Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
recursive_matmul.cxx
/*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <ctf.hpp>
using namespace CTF;
/* Multiplies A (m-by-k) by B (k-by-n) into C (m-by-n) by recursively
 * splitting the largest dimension over a sub-communicator. */
void recursive_matmul(int        n,
                      int        m,
                      int        k,
                      Tensor<> & A,
                      Tensor<> & B,
                      Tensor<> & C){
  int rank, num_pes, cnum_pes, ri, rj, rk, ni, nj, nk, div;
  MPI_Comm pcomm, ccomm;
  pcomm = C.wrld->comm;

  MPI_Comm_rank(pcomm, &rank);
  MPI_Comm_size(pcomm, &num_pes);

  if (num_pes == 1 || m == 1 || n == 1 || k == 1){
    // base case: a single process or a degenerate dimension, do the GEMM directly
    C["ij"] += 1.0*A["ik"]*B["kj"];
  } else {
    // find the smallest divisor of the number of processes
    for (div=2; num_pes%div!=0; div++){}

    cnum_pes = num_pes / div;

    // split the communicator into div groups of cnum_pes processes each
    MPI_Comm_split(pcomm, rank/cnum_pes, rank%cnum_pes, &ccomm);
    World cdw(ccomm);

    ri = 0;
    rj = 0;
    rk = 0;
    ni = 1;
    nj = 1;
    nk = 1;
    // partition the largest of the three dimensions into div pieces;
    // (ri,rj,rk) is this group's coordinate along the split dimension
    if (m >= n && m >= k){
      ni = div;
      ri = rank/cnum_pes;
      assert(m%div == 0);
    } else if (n >= m && n >= k){
      nj = div;
      rj = rank/cnum_pes;
      assert(n%div == 0);
    } else if (k >= m && k >= n){
      nk = div;
      rk = rank/cnum_pes;
      assert(k%div == 0);
    }

    // bounds of this group's blocks of C, A, and B
    int off_ij[2] = {ri * m/ni, rj * n/nj};
    int end_ij[2] = {ri * m/ni + m/ni, rj * n/nj + n/nj};
    int off_ik[2] = {ri * m/ni, rk * k/nk};
    int end_ik[2] = {ri * m/ni + m/ni, rk * k/nk + k/nk};
    int off_kj[2] = {rk * k/nk, rj * n/nj};
    int end_kj[2] = {rk * k/nk + k/nk, rj * n/nj + n/nj};
    // extract the blocks of A and B onto the sub-world and multiply them there
    Tensor<> cA = A.slice(off_ik, end_ik, &cdw);
    Tensor<> cB = B.slice(off_kj, end_kj, &cdw);
    Matrix<> cC(m/ni, n/nj, NS, cdw);

    recursive_matmul(n/nj, m/ni, k/nk, cA, cB, cC);

    // accumulate the sub-result into the corresponding block of C
    int off_00[2] = {0, 0};
    int end_11[2] = {m/ni, n/nj};
    C.slice(off_ij, end_ij, 1.0, cC, off_00, end_11, 1.0);
    MPI_Comm_free(&ccomm);
  }
}
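/* Worked example (illustrative, not part of the original file): with 4
 * processes and m = n = k = 8, the first call finds div = 2, cnum_pes = 2,
 * and MPI_Comm_split groups ranks {0,1} and {2,3}.  Since m is (tied for)
 * largest, ni = 2 and ri = rank/2, so group ri slices rows 4*ri..4*ri+3 of A
 * (off_ik = {4*ri,0}, end_ik = {4*ri+4,8}), all of B, and recurses on an
 * (n,m,k) = (8,4,8) problem over its 2-process sub-world.  At the next level
 * div = 2 and cnum_pes = 1, n is now largest, so each process takes an
 * 8-by-4 column block of its B, reaches the single-process base case, and
 * the results are sliced back up the recursion into the matching blocks of C. */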

/* Builds random m-by-k and k-by-n matrices, multiplies them with
 * recursive_matmul(), and checks the result against a direct contraction. */
int test_recursive_matmul(int     n,
                          int     m,
                          int     k,
                          World & dw){
  int rank, num_pes;
  int64_t i, np;
  double * pairs, err;
  int64_t * indices;

  Matrix<> C(m, n, NS, dw);
  Matrix<> C_ans(m, n, NS, dw);
  Matrix<> A(m, k, NS, dw);
  Matrix<> B(k, n, NS, dw);

  MPI_Comm pcomm = dw.comm;
  MPI_Comm_rank(pcomm, &rank);
  MPI_Comm_size(pcomm, &num_pes);

  // fill A and B with uniform random values in [-0.5, 0.5)
  srand48(13*rank);
  A.get_local_data(&np, &indices, &pairs);
  for (i=0; i<np; i++) pairs[i] = drand48()-.5;
  A.write(np, indices, pairs);
  delete [] pairs;
  free(indices);
  B.get_local_data(&np, &indices, &pairs);
  for (i=0; i<np; i++) pairs[i] = drand48()-.5;
  B.write(np, indices, pairs);
  delete [] pairs;
  free(indices);

  // reference answer via a single contraction
  C_ans["ij"] += 1.0*A["ik"]*B["kj"];

//  C_ans.print(stdout);

  recursive_matmul(n,m,k,A,B,C);

//  C.print(stdout);

  // the Frobenius norm of the difference should be ~0
  C_ans["ij"] -= C["ij"];

  err = C_ans.norm2();

  if (rank == 0){
    if (err<1.E-9)
      printf("{ GEMM with parallel slicing } passed\n");
    else
      printf("{ GEMM with parallel slicing } FAILED, error norm = %E\n",err);
  }
  return err<1.E-9;
}
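/* A minimal sketch (not part of the original file) that factors the random-fill
 * idiom above into a reusable helper; the name fill_random is hypothetical.
 * It assumes the same includes as this file and that srand48() has already
 * been seeded on each rank. */
static void fill_random(Tensor<> & T){
  int64_t   np_loc;   // number of locally owned entries
  int64_t * indices;  // their global indices
  double  * pairs;    // their current values
  // fetch the locally owned entries, overwrite them with uniform values in
  // [-0.5, 0.5), and write them back into the distributed tensor
  T.get_local_data(&np_loc, &indices, &pairs);
  for (int64_t i=0; i<np_loc; i++) pairs[i] = drand48()-.5;
  T.write(np_loc, indices, pairs);
  delete [] pairs;
  free(indices);
}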


#ifndef TEST_SUITE
/* Returns the argument following the given option flag, or 0 if absent. */
char* getCmdOption(char ** begin,
                   char ** end,
                   const std::string & option){
  char ** itr = std::find(begin, end, option);
  if (itr != end && ++itr != end){
    return *itr;
  }
  return 0;
}
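/* Illustrative note (not part of the original file): with a hypothetical
 * command line {"./exe", "-n", "100"}, getCmdOption(argv, argv+argc, "-n")
 * returns the C string "100", while getCmdOption(argv, argv+argc, "-m")
 * returns 0 because that flag is absent. */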

int main(int argc, char ** argv){
  int rank, np, n, m, k;
  int const in_num = argc;
  char ** input_str = argv;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &np);

  // parse -n, -m, -k; fall back to defaults on missing or negative values
  if (getCmdOption(input_str, input_str+in_num, "-n")){
    n = atoi(getCmdOption(input_str, input_str+in_num, "-n"));
    if (n < 0) n = 256;
  } else n = 256;
  if (getCmdOption(input_str, input_str+in_num, "-m")){
    m = atoi(getCmdOption(input_str, input_str+in_num, "-m"));
    if (m < 0) m = 128;
  } else m = 128;
  if (getCmdOption(input_str, input_str+in_num, "-k")){
    k = atoi(getCmdOption(input_str, input_str+in_num, "-k"));
    if (k < 0) k = 512;
  } else k = 512;

  {
    World dw(MPI_COMM_WORLD, argc, argv);
    int pass;
    if (rank == 0){
      printf("Non-symmetric: NS = NS*NS test_recursive_matmul:\n");
    }
    pass = test_recursive_matmul(n, m, k, dw);
    assert(pass);
  }

  MPI_Finalize();
  return 0;
}
#endif
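/* Illustrative usage (hypothetical binary name and process count):
 *   mpirun -np 4 ./recursive_matmul -n 256 -m 128 -k 512
 * Note that at each level of the recursion the dimension being split must be
 * divisible by the chosen divisor of the process count (see the asserts in
 * recursive_matmul), so sizes and process counts should be chosen accordingly. */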