Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
util.h
Go to the documentation of this file.
1 /*Copyright (c) 2011, Edgar Solomonik, all rights reserved.*/
2 
3 #ifndef __UTIL_H__
4 #define __UTIL_H__
5 
6 #include "../interface/common.h"
7 
8 
9 #include <inttypes.h>
10 #include <execinfo.h>
11 #include <signal.h>
12 
13 #ifndef __APPLE__
14 #ifndef OMP_OFF
15 #define USE_OMP
16 #include "omp.h"
17 #endif
18 #endif
19 
20 
21 /*
22 #if (defined(__X86_64__) || defined(__IA64__) || defined(__amd64__) || \
23  defined(__ppc64__) || defined(_ARCH_PPC) || defined(BGQ) || defined(BGP))
24 #define PRId64 "%ld"
25 #define PRIu64 "%lu"
26 #else //if (defined(__i386__))
27 #define PRId64 "%lld"
28 #define PRIu64 "%llu"
29 //#else
30 #endif
31 */
32 #ifdef CTF_DEBUG
33 #define DEBUG CTF_DEBUG
34 #endif
35 
36 #ifdef CTF_VERBOSE
37 #define VERBOSE CTF_VERBOSE
38 #endif
39 
40 #ifdef CTF_PROFILE
41 #define PROFILE CTF_PROFILE
42 #endif
43 
44 #ifdef CTF_PMPI
45 #define PMPI CTF_PMPI
46 #endif
47 
48 namespace CTF_int {
49  //latency time per message
50  #define COST_LATENCY (1.e-6)
51  //memory bandwidth: time per byte
52  #define COST_MEMBW (1.e-9)
53  //network bandwidth: time per byte
54  #define COST_NETWBW (5.e-10)
55  //flop cost: time per flop
56  #define COST_FLOP (2.e-11)
57  //offload bandwidth: time per byte
58  #define COST_OFFLOADBW (5.e-10)
59 }
60 
61 #include "int_timer.h"
62 #include "pmpi.h"
63 #include "fompi_wrapper.h"
64 
65 namespace CTF_int {
66  //max total virtualization factor for mappings
67  #define MAX_NVIRT 256
68  //min total virtualization factor for mappings
69  // (would be useful if explicit blockwise threading was enabled, which is not currently)
70  #ifndef MIN_NVIRT
71  #define MIN_NVIRT 1
72  #endif
73 
74  #ifndef ENABLE_ASSERT
75  #ifdef DEBUG
76  #define ENABLE_ASSERT 1
77  #else
78  #define ENABLE_ASSERT 0
79  #endif
80  #endif
81 
82 
// ASSERT(cond): run-time check controlled by ENABLE_ASSERT (may be predefined by the user).
// When ENABLE_ASSERT is nonzero and cond is false: the process whose rank in MPI_COMM_WORLD
// is 0 prints the file, line and condition text, every failing process calls
// CTF_int::handler(), and the condition is then passed to assert().
// Note that on failure the condition expression is evaluated a second time by assert(),
// so it should be free of side effects.
83  #ifndef ASSERT
84  #if ENABLE_ASSERT
85  #define ASSERT(...) \
86  do { if (!(__VA_ARGS__)){ int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); if (rank == 0){ printf("CTF ERROR: %s:%d, ASSERT(%s) failed\n",__FILE__,__LINE__,#__VA_ARGS__); } CTF_int::handler(); assert(__VA_ARGS__); } } while (0)
87  #else
// Disabled form: the condition is still compiled (so it stays syntactically valid)
// but never evaluated, because it sits behind the short-circuiting "0 &&".
88  #define ASSERT(...) do {} while(0 && (__VA_ARGS__))
89  #endif
90  #endif
91 
92 
93 
94  /* Force redistributions always by setting to 1 */
95  #define REDIST 0
96  //#define VERIFY 0
97  #define VERIFY_REMAP 0
98  #define FOLD_TSR 1
99  #define USE_SYM_SUM
100  #define HOME_CONTRACT
101  #define USE_BLOCK_RESHUFFLE
102 
103  #define MAX_ORD 12
104  #define LOOP_MAX_ORD(F,...) \
105  F(0,__VA_ARGS__) F(1,__VA_ARGS__) F(2,__VA_ARGS__) F(3,__VA_ARGS__) \
106  F(4,__VA_ARGS__) F(5,__VA_ARGS__) F(6,__VA_ARGS__) F(7,__VA_ARGS__) \
107  F(8,__VA_ARGS__) F(9,__VA_ARGS__) F(10,__VA_ARGS__) F(11,__VA_ARGS__)
108 
109  #define ORD_CASE(ord,F,...) \
110  case ord: \
111  F<ord>(__VA_ARGS__); \
112  break;
113 
114  #define ORD_CASE_RET(ord,R,F,...) \
115  case ord: \
116  R = F<ord>(__VA_ARGS__); \
117  break;
118 
119  #define SWITCH_ORD_CALL(F,act_ord,...) \
120  switch (act_ord){ \
121  LOOP_MAX_ORD(ORD_CASE,F,__VA_ARGS__) \
122  default: \
123  ASSERT(0); \
124  break; \
125  }
126 
127  #define SWITCH_ORD_CALL_RET(R,F,act_ord,...) \
128  switch (act_ord){ \
129  LOOP_MAX_ORD(ORD_CASE_RET,R,F,__VA_ARGS__) \
130  default: \
131  ASSERT(0); \
132  break; \
133  }
134 
135 
136  #define CTF_COUNT_FLOPS
137  #ifdef CTF_COUNT_FLOPS
138  #define CTF_FLOPS_ADD(n) CTF_int::flops_add(n)
139  #else
140  #define CTF_FLOPS_ADD(n)
141  #endif
142 
143  //doesn't work with OpenMPI
144  //volatile static int64_t mpi_int64_t = MPI_LONG_LONG_INT;
145  #ifdef _SC_PHYS_PAGES
/**
 * \brief queries the amount of physical memory via sysconf
 * \return number of physical pages times the page size, in bytes;
 *         1 GB if either sysconf query fails
 */
inline
int64_t getTotalSystemMemory()
{
  int64_t const num_pages  = (int64_t)sysconf(_SC_PHYS_PAGES);
  int64_t const page_bytes = (int64_t)sysconf(_SC_PAGE_SIZE);
  // sysconf signals an error (or an unsupported query) by returning -1;
  // multiplying such a value would yield a nonsensical or negative size,
  // so use the same 1 GB assumption as the build without _SC_PHYS_PAGES
  if (num_pages <= 0 || page_bytes <= 0){
    return ((int64_t)1)<<30;
  }
  return num_pages * page_bytes;
}
153  #else
/**
 * \brief fallback used when _SC_PHYS_PAGES is not available:
 *        the physical memory size cannot be queried, so a fixed size is assumed
 * \return 2^30 bytes (1 GB)
 */
inline
int64_t getTotalSystemMemory()
{
  //Assume system memory is 1 GB
  return ((int64_t)1)<<30;
}
160  #endif
161 
162  #define ABORT \
163  do{ \
164  handler(); MPI_Abort(MPI_COMM_WORLD, -1); } while(0)
165 
//proper (non-negative) modulus of 'a' by 'b' for 'a' in the range [-b, inf)
//both arguments are parenthesized so that expression arguments such as
//WRAP(i-1,n+1) expand with the intended precedence
//NOTE: 'b' appears twice in the expansion, so it must not have side effects
#ifndef WRAP
#define WRAP(a,b) (((a) + (b))%(b))
#endif
170 
171  #ifndef ALIGN_BYTES
172  #define ALIGN_BYTES 32
173  #endif
174 
175  #ifndef MIN
176  #define MIN( a, b ) ( ((a) < (b)) ? (a) : (b) )
177  #endif
178 
179  #ifndef MAX
180  #define MAX( a, b ) ( ((a) > (b)) ? (a) : (b) )
181  #endif
182 
183  #ifndef LOC
184  #define LOC \
185  do { printf("debug:%s:%d ",__FILE__,__LINE__); } while(0)
186  #endif
187 
188  #ifndef THROW_ERROR
189  #define THROW_ERROR(...) \
190  do { printf("error:%s:%d ",__FILE__,__LINE__); printf(__VA_ARGS__); printf("\n"); quit(1); } while(0)
191  #endif
192 
193  #ifndef WARN
194  #define WARN(...) \
195  do { printf("warning: "); printf(__VA_ARGS__); printf("\n"); } while(0)
196  #endif
197 
198  #if defined(VERBOSE)
199  #ifndef VPRINTF
200  #define VPRINTF(i,...) \
201  do { if (i<=VERBOSE) { \
202  printf("CTF: "); printf(__VA_ARGS__); } \
203  } while (0)
204  #endif
205  #else
206  #ifndef VPRINTF
207  #define VPRINTF(...) do { } while (0)
208  #endif
209  #endif
210 
211 
// Debug printing helpers.
// With DEBUG defined they print to stdout, each message prefixed by LOC (file:line);
// without DEBUG every helper compiles away and its arguments are not evaluated.
// Each macro is individually overridable by defining it before this point.
#ifdef DEBUG
  // print if the message level lvl does not exceed the DEBUG level
  #ifndef DPRINTF
  #define DPRINTF(lvl,...) \
    do { if ((lvl)<=DEBUG) { LOC; printf(__VA_ARGS__); } } while (0)
  #endif
  // shorthand for a level-5 DPRINTF
  #ifndef DEBUG_PRINTF
  #define DEBUG_PRINTF(...) \
    do { DPRINTF(5,__VA_ARGS__); } while(0)
  #endif
  // print only when myRank equals rank, tagging the message with the rank
  #ifndef RANK_PRINTF
  #define RANK_PRINTF(myRank,rank,...) \
    do { if ((myRank) == (rank)) { LOC; printf("P[%d]: ",(rank)); printf(__VA_ARGS__); } } while(0)
  #endif
  // print "<expression text>=<value>"; the expression text is passed as a %s
  // argument rather than used as the format string, so a '%' inside the
  // expression (e.g. PRINT_INT(a%b)) cannot be misread as a conversion
  #ifndef PRINT_INT
  #define PRINT_INT(var) \
    do { LOC; printf("%s=%d\n",#var,(var)); } while(0)
  #endif
  #ifndef PRINT_DOUBLE
  #define PRINT_DOUBLE(var) \
    do { LOC; printf("%s=%lf\n",#var,(var)); } while(0)
  #endif
#else
  #ifndef DPRINTF
  #define DPRINTF(...) do { } while (0)
  #endif
  #ifndef DEBUG_PRINTF
  #define DEBUG_PRINTF(...) do {} while (0)
  #endif
  #ifndef RANK_PRINTF
  #define RANK_PRINTF(...) do { } while (0)
  #endif
  #ifndef PRINT_INT
  #define PRINT_INT(var)
  #endif
  // defined here as well so that code using PRINT_DOUBLE builds without DEBUG
  #ifndef PRINT_DOUBLE
  #define PRINT_DOUBLE(var)
  #endif
#endif
248 
249 
250  #ifdef DUMPDEBUG
251  #ifndef DUMPDEBUG_PRINTF
252  #define DUMPDEBUG_PRINTF(...) \
253  do { LOC; printf(__VA_ARGS__); } while(0)
254  #endif
255  #else
256  #ifndef DUMPDEBUG_PRINTF
257  #define DUMPDEBUG_PRINTF(...)
258  #endif
259  #endif
260 
261  /*#ifdef TAU
262  #include <stddef.h>
263  #include <Profile/Profiler.h>
264  #define TAU_FSTART(ARG) \
265  TAU_PROFILE_TIMER(timer##ARG, #ARG, "", TAU_USER); \
266  TAU_PROFILE_START(timer##ARG)
267 
268  #define TAU_FSTOP(ARG) \
269  TAU_PROFILE_STOP(timer##ARG)
270 
271  #else*/
272  #ifndef TAU
273  #define TAU_PROFILE(NAME,ARG,USER)
274  #define TAU_PROFILE_TIMER(ARG1, ARG2, ARG3, ARG4)
275  #define TAU_PROFILER_CREATE(ARG1, ARG2, ARG3, ARG4)
276  #define TAU_PROFILE_STOP(ARG)
277  #define TAU_PROFILE_START(ARG)
278  #define TAU_PROFILE_SET_NODE(ARG)
279  #define TAU_PROFILE_SET_CONTEXT(ARG)
280  #define TAU_FSTART(ARG)
281  #define TAU_FSTOP(ARG)
282  #endif
283  #define TIME(STRING) TAU_PROFILE(STRING, " ", TAU_DEFAULT)
284  #define MST_ALIGN_BYTES ALIGN_BYTES
285 
// Pair of raw addresses: one "old" and one "new".
// contract_mst() returns a list of these records.
// NOTE(review): presumably each record describes one buffer that was relocated
// (address before / address after) -- confirm against memcontrol.cxx.
286  struct mem_transfer {
287  void * old_ptr;
288  void * new_ptr;
289  };
290 
// Memory-manager entry points; per the generated index these are defined in memcontrol.cxx
// (free_cond has no definition listed there).
// contract_mst: gets rid of empty space on the stack; returns a list of mem_transfer records
291  std::list<mem_transfer> contract_mst();
// untag_mem: stops tracking memory allocated by CTF, so the user doesn't have to call free
292  int untag_mem(void * ptr);
293  int free_cond(void * ptr);
// mem_create: create instance of memory manager
294  void mem_create();
// mst_create: initializes stack buffer
295  void mst_create(int64_t size);
// mem_exit: exit instance of memory manager
296  void mem_exit(int rank);
297 
298 
299  /*
300  * \brief calculates dimensional indices corresponding to a symmetric-packed index
301  * For each symmetric (SH or AS) group of size sg we have
302  * idx = n*(n-1)*...*(n-sg) / d*(d-1)*...
303  * therefore (idx*sg!)^(1/sg) >= n-sg
304  * or similarly in the SY case ... >= n
305  *
306  * \param[in] order number of dimensions in the tensor
307  * \param[in] lens edge lengths
308  * \param[in] sym symmetry
309  * \param[in] idx index in the global tensor, in packed format
310  * \param[out] idx_arr preallocated to size order, computed to correspond to idx
311  */
312  void calc_idx_arr(int order,
313  int const * lens,
314  int const * sym,
315  int64_t idx,
316  int * idx_arr);
317 
// same as calc_idx_arr except assumes sym only NS or SY
// (takes the same five arguments as calc_idx_arr; idx_arr is the non-const, i.e. output, array)
319  void sy_calc_idx_arr(int order,
320  int const * lens,
321  int const * sym,
322  int64_t idx,
323  int * idx_arr);
324 
331  void factorize(int n, int *nfactor, int **factor);
332 
/**
 * \brief greatest common divisor computed with the Euclidean algorithm
 * \param[in] a first integer
 * \param[in] b second integer
 * \return the last nonzero remainder of the Euclidean sequence started at (a,b);
 *         a itself when b is zero
 */
inline
int gcd(int a, int b){
  // iterative form of the recurrence gcd(a,b) = gcd(b, a mod b)
  while (b != 0){
    int const rem = a%b;
    a = b;
    b = rem;
  }
  return a;
}
338 
339  inline
340  int lcm(int a, int b){
341  return ((int64_t)a)*((int64_t)b)/gcd(a,b);
342  }
343 
/**
 * \brief Copies submatrix to submatrix (column-major)
 * \param[in] el_size size of each element in bytes
 * \param[in] nrow number of rows to copy
 * \param[in] ncol number of columns to copy
 * \param[in] lda_A leading dimension (column stride, in elements) of A
 * \param[in] lda_B leading dimension (column stride, in elements) of B
 * \param[in] A source buffer
 * \param[in,out] B destination buffer
 */
inline
void lda_cpy(int el_size,
             int nrow,
             int ncol,
             int lda_A,
             int lda_B,
             const char * A,
             char * B){
  // all byte counts and offsets are formed in 64-bit arithmetic: the products
  // of int operands overflow for buffers of 2 GB or more
  int64_t const col_bytes = ((int64_t)el_size)*nrow;
  if (lda_A == nrow && lda_B == nrow){
    // both matrices are densely packed, so one bulk copy suffices
    memcpy(B,A,col_bytes*ncol);
  } else {
    int64_t const stride_A = ((int64_t)el_size)*lda_A;
    int64_t const stride_B = ((int64_t)el_size)*lda_B;
    for (int64_t i=0; i<ncol; i++){
      memcpy(B+stride_B*i,A+stride_A*i,col_bytes);
    }
  }
}
371 
385 /* template<typename dtype>
386  void coalesce_bwd(dtype *B,
387  dtype const *B_aux,
388  int k,
389  int n,
390  int kb){
391  int i;
392  for (i=n-1; i>=0; i--){
393  memcpy(B+i*k+kb, B_aux+i*(k-kb), (k-kb)*sizeof(dtype));
394  if (i>0) memcpy(B+i*k, B+i*kb, kb*sizeof(dtype));
395  }
396  }*/
/**
 * \brief interleaves, in place, a contiguous kb-by-n buffer B with a (k-kb)-by-n
 *        buffer B_aux holding the block below it, producing a k-by-n column-major
 *        buffer in B
 * \param[in] el_size size of each element in bytes
 * \param[in,out] B on entry kb-by-n (packed), on exit k-by-n; must have room for k*n elements
 * \param[in] B_aux (k-kb)-by-n block that is placed below the rows of B in every column
 * \param[in] k number of rows of the combined buffer
 * \param[in] n number of columns
 * \param[in] kb number of rows initially held in B
 */
inline
void coalesce_bwd(int el_size,
                  char *B,
                  char const *B_aux,
                  int k,
                  int n,
                  int kb){
  // 64-bit quantities so that byte offsets do not overflow int
  int64_t const esz  = el_size;
  int64_t const kaux = k-kb;
  // columns are processed last-to-first so that data still waiting at the
  // packed positions of earlier columns is never overwritten
  for (int64_t i=n-1; i>=0; i--){
    // lower part of column i comes from B_aux
    memcpy(B+esz*(i*k+kb), B_aux+esz*(i*kaux), kaux*esz);
    if (i>0){
      // upper part of column i moves from packed offset i*kb to offset i*k;
      // the two ranges overlap whenever i*(k-kb) < kb, hence memmove
      memmove(B+esz*i*k, B+esz*i*kb, kb*esz);
    }
  }
}
418 
// permute an array (defined in util.cxx)
// perm is read-only; arr is the array that gets permuted.
// NOTE(review): presumably order is the number of entries in perm and arr -- confirm
426  void permute(int order,
427  int const * perm,
428  int * arr);
429 
// permutes a permutation array (defined in util.cxx)
// perm is read-only; arr is modified.
// NOTE(review): presumably order is the number of entries in perm and arr -- confirm
437  void permute_target(int order,
438  int const * perm,
439  int * arr);
440 
441 
442  void socopy(int64_t m,
443  int64_t n,
444  int64_t lda_a,
445  int64_t lda_b,
446  int64_t const * sizes_a,
447  int64_t *& sizes_b,
448  int64_t *& offsets_b);
449 
450  void spcopy(int64_t m,
451  int64_t n,
452  int64_t lda_a,
453  int64_t lda_b,
454  int64_t const * sizes_a,
455  int64_t const * offsets_a,
456  char const * a,
457  int64_t const * sizes_b,
458  int64_t const * offsets_b,
459  char * b);
460 
461 
462  int64_t fact(int64_t n);
463  int64_t choose(int64_t n, int64_t k);
464  void get_choice(int64_t n, int64_t k, int64_t ch, int * chs);
465  int64_t chchoose(int64_t n, int64_t k);
466 }
467 #endif
468 
void calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
Definition: util.cxx:72
void permute(int order, int const *perm, int *arr)
permute an array
Definition: util.cxx:205
void get_choice(int64_t n, int64_t k, int64_t ch, int *chs)
Definition: util.cxx:289
std::list< mem_transfer > contract_mst()
gets rid of empty space on the stack
Definition: memcontrol.cxx:125
def rank(self)
Definition: core.pyx:312
void mst_create(int64_t size)
initializes stack buffer
Definition: memcontrol.cxx:170
int gcd(int a, int b)
Definition: util.h:334
void mem_exit(int rank)
exit instance of memory manager
Definition: memcontrol.cxx:207
int untag_mem(void *ptr)
stops tracking memory allocated by CTF, so user doesn't have to call free
Definition: memcontrol.cxx:376
int64_t fact(int64_t n)
Definition: util.cxx:277
void coalesce_bwd(int el_size, char *B, char const *B_aux, int k, int n, int kb)
we receive a contiguous buffer kb-by-n B and (k-kb)-by-n B_aux which is the block below...
Definition: util.h:398
void sy_calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
same as above except assumes sym only NS or SY
Definition: util.cxx:121
void permute_target(int order, int const *perm, int *arr)
permutes a permutation array
Definition: util.cxx:222
void socopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t *&sizes_b, int64_t *&offsets_b)
Definition: util.cxx:240
void mem_create()
create instance of memory manager
Definition: memcontrol.cxx:187
void lda_cpy(int el_size, int nrow, int ncol, int lda_A, int lda_B, const char *A, char *B)
Copies submatrix to submatrix (column-major)
Definition: util.h:355
int lcm(int a, int b)
Definition: util.h:340
int free_cond(void *ptr)
int64_t choose(int64_t n, int64_t k)
Definition: util.cxx:285
void factorize(int n, int *nfactor, int **factor)
computes the size of a tensor in packed symmetric layout
Definition: util.cxx:170
int64_t getTotalSystemMemory()
Definition: util.h:155
int64_t chchoose(int64_t n, int64_t k)
Definition: util.cxx:305
void spcopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t const *offsets_a, char const *a, int64_t const *sizes_b, int64_t const *offsets_b, char *b)
Definition: util.cxx:260