6 #include "../interface/common.h" 33 #define DEBUG CTF_DEBUG 37 #define VERBOSE CTF_VERBOSE 41 #define PROFILE CTF_PROFILE 50 #define COST_LATENCY (1.e-6) 52 #define COST_MEMBW (1.e-9) 54 #define COST_NETWBW (5.e-10) 56 #define COST_FLOP (2.e-11) 58 #define COST_OFFLOADBW (5.e-10) 76 #define ENABLE_ASSERT 1 78 #define ENABLE_ASSERT 0 86 do { if (!(__VA_ARGS__)){ int rank; MPI_Comm_rank(MPI_COMM_WORLD,&rank); if (rank == 0){ printf("CTF ERROR: %s:%d, ASSERT(%s) failed\n",__FILE__,__LINE__,#__VA_ARGS__); } CTF_int::handler(); assert(__VA_ARGS__); } } while (0) 88 #define ASSERT(...) do {} while(0 && (__VA_ARGS__)) 97 #define VERIFY_REMAP 0 100 #define HOME_CONTRACT 101 #define USE_BLOCK_RESHUFFLE 104 #define LOOP_MAX_ORD(F,...) \ 105 F(0,__VA_ARGS__) F(1,__VA_ARGS__) F(2,__VA_ARGS__) F(3,__VA_ARGS__) \ 106 F(4,__VA_ARGS__) F(5,__VA_ARGS__) F(6,__VA_ARGS__) F(7,__VA_ARGS__) \ 107 F(8,__VA_ARGS__) F(9,__VA_ARGS__) F(10,__VA_ARGS__) F(11,__VA_ARGS__) 109 #define ORD_CASE(ord,F,...) \ 111 F<ord>(__VA_ARGS__); \ 114 #define ORD_CASE_RET(ord,R,F,...) \ 116 R = F<ord>(__VA_ARGS__); \ 119 #define SWITCH_ORD_CALL(F,act_ord,...) \ 121 LOOP_MAX_ORD(ORD_CASE,F,__VA_ARGS__) \ 127 #define SWITCH_ORD_CALL_RET(R,F,act_ord,...) \ 129 LOOP_MAX_ORD(ORD_CASE_RET,R,F,__VA_ARGS__) \ 136 #define CTF_COUNT_FLOPS 137 #ifdef CTF_COUNT_FLOPS 138 #define CTF_FLOPS_ADD(n) CTF_int::flops_add(n) 140 #define CTF_FLOPS_ADD(n) 145 #ifdef _SC_PHYS_PAGES 149 int64_t pages = (int64_t)sysconf(_SC_PHYS_PAGES);
// Page size in bytes as reported by sysconf(_SC_PAGE_SIZE).
// NOTE(review): sysconf returns -1 on failure and neither this call nor the
// _SC_PHYS_PAGES call is checked in the visible lines — confirm whether a
// failure needs handling.
150 int64_t page_size = (int64_t)sysconf(_SC_PAGE_SIZE);
// Total physical memory = number of pages * bytes per page, formed in 64-bit.
151 return pages * page_size;
// Alternative return value of 1<<30 bytes (1 GiB). Presumably the fallback
// used when _SC_PHYS_PAGES is not defined; the guarding preprocessor lines
// are not visible here — verify.
158 return ((int64_t)1)<<30;
164 handler(); MPI_Abort(MPI_COMM_WORLD, -1); } while(0) 168 #define WRAP(a,b) ((a + b)%b) 172 #define ALIGN_BYTES 32 176 #define MIN( a, b ) ( ((a) < (b)) ? (a) : (b) ) 180 #define MAX( a, b ) ( ((a) > (b)) ? (a) : (b) ) 185 do { printf("debug:%s:%d ",__FILE__,__LINE__); } while(0) 189 #define THROW_ERROR(...) \ 190 do { printf("error:%s:%d ",__FILE__,__LINE__); printf(__VA_ARGS__); printf("\n"); quit(1); } while(0) 195 do { printf("warning: "); printf(__VA_ARGS__); printf("\n"); } while(0) 200 #define VPRINTF(i,...) \ 201 do { if (i<=VERBOSE) { \ 202 printf("CTF: "); printf(__VA_ARGS__); } \ 207 #define VPRINTF(...) do { } while (0) 214 #define DPRINTF(__icx,...) \ 215 do { if (__icx<=DEBUG) { LOC; printf(__VA_ARGS__); } } while (0) 218 #define DEBUG_PRINTF(...) \ 219 do { DPRINTF(5,__VA_ARGS__); } while(0) 222 #define RANK_PRINTF(myRank,rank,...) \ 223 do { if (myRank == rank) { LOC; printf("P[%d]: ",rank); printf(__VA_ARGS__); } } while(0) 226 #define PRINT_INT(var) \ 227 do { LOC; printf(#var); printf("=%d\n",var); } while(0) 230 #define PRINT_DOUBLE(var) \ 231 do { LOC; printf(#var); printf("=%lf\n",var); } while(0) 235 #define DPRINTF(...) do { } while (0) 238 #define DEBUG_PRINTF(...) do {} while (0) 241 #define RANK_PRINTF(...) do { } while (0) 245 #define PRINT_INT(var) 251 #ifndef DUMPDEBUG_PRINTF 252 #define DUMPDEBUG_PRINTF(...) \ 253 do { LOC; printf(__VA_ARGS__); } while(0) 256 #ifndef DUMPDEBUG_PRINTF 257 #define DUMPDEBUG_PRINTF(...) 273 #define TAU_PROFILE(NAME,ARG,USER) 274 #define TAU_PROFILE_TIMER(ARG1, ARG2, ARG3, ARG4) 275 #define TAU_PROFILER_CREATE(ARG1, ARG2, ARG3, ARG4) 276 #define TAU_PROFILE_STOP(ARG) 277 #define TAU_PROFILE_START(ARG) 278 #define TAU_PROFILE_SET_NODE(ARG) 279 #define TAU_PROFILE_SET_CONTEXT(ARG) 280 #define TAU_FSTART(ARG) 281 #define TAU_FSTOP(ARG) 283 #define TIME(STRING) TAU_PROFILE(STRING, " ", TAU_DEFAULT) 284 #define MST_ALIGN_BYTES ALIGN_BYTES 331 void factorize(
int n,
int *nfactor,
int **factor);
// Returns a*b/gcd(a,b); assuming gcd yields the greatest common divisor, this
// is the least common multiple of a and b. Both operands are cast to int64_t
// before the multiplication so the product is formed in 64-bit arithmetic.
341 return ((int64_t)a)*((int64_t)b)/
gcd(a,b);
// Fast path: when both leading dimensions equal nrow, the columns of A and of
// B are back to back in memory, so the whole nrow-by-ncol block is copied with
// a single memcpy.
362 if (lda_A == nrow && lda_B == nrow){
// NOTE(review): if el_size, nrow and ncol are all int (as in the listed
// signature of lda_cpy), this byte count is evaluated in int and can overflow
// for blocks of 2 GiB or more — confirm and widen to int64_t if so.
363 memcpy(B,A,el_size*nrow*ncol);
// General path: copy one column at a time. Column i starts at element offset
// lda_A*i in A and lda_B*i in B; nrow elements of el_size bytes are copied.
366 for (i=0; i<ncol; i++){
// NOTE(review): the offsets el_size*lda_B*i and el_size*lda_A*i are likewise
// computed in int if the operands are int — verify for large matrices.
367 memcpy(B+el_size*lda_B*i,A+el_size*lda_A*i,nrow*el_size);
// Visit the n columns from the last one down to the first. Presumably this
// order is what lets B be expanded in place from a column stride of kb to a
// column stride of k without overwriting columns that have not been moved yet
// — verify against the full function.
405 for (i=n-1; i>=0; i--){
// Place the (k-kb) elements of column i of B_aux directly after the first kb
// elements of column i in the k-strided layout of B.
406 memcpy(B+el_size*(i*k+kb), B_aux+el_size*(i*(k-kb)), (k-kb)*el_size);
// Move the kb elements of column i of B from element offset i*kb (stride kb)
// to element offset i*k (stride k) in one memcpy. The condition guarding this
// statement is not visible here; memcpy requires that the two ranges do not
// overlap.
409 memcpy(B+el_size*i*k, B+el_size*i*kb, kb*el_size);
// Same move done in pieces of at most (k-kb) elements each.
// NOTE(review): the step is k-kb, so j never advances if k == kb — confirm
// this loop cannot be reached in that case.
411 for (
int j=0; j<kb; j+=k-kb){
// NOTE(review): the source offset is written as i*(kb+j), i.e. i*kb + i*j,
// while the destination uses i*k + j. The two patterns only agree when j == 0
// or i == 1 — confirm whether (i*kb+j) was intended.
412 memcpy(B+el_size*(i*k+j), B+el_size*i*(kb+j), std::min(k-kb,kb-j)*el_size);
446 int64_t
const * sizes_a,
448 int64_t *& offsets_b);
454 int64_t
const * sizes_a,
455 int64_t
const * offsets_a,
457 int64_t
const * sizes_b,
458 int64_t
const * offsets_b,
// Declaration only; the definition is not visible here.
// Presumably the factorial of n — verify against the definition.
462 int64_t
fact(int64_t n);
// Declaration only. Presumably the binomial coefficient "n choose k" —
// verify against the definition.
463 int64_t
choose(int64_t n, int64_t k);
// Declaration only. chs is the only non-const pointer argument, so it is
// presumably the output; how ch selects a choice is not visible here — TODO
// confirm.
464 void get_choice(int64_t n, int64_t k, int64_t ch,
int * chs);
// Declaration only. Same signature as choose; the exact quantity computed is
// not visible here — TODO confirm.
465 int64_t
chchoose(int64_t n, int64_t k);
void calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
void permute(int order, int const *perm, int *arr)
permutes the array arr according to perm
void get_choice(int64_t n, int64_t k, int64_t ch, int *chs)
std::list< mem_transfer > contract_mst()
gets rid of empty space on the stack
void mst_create(int64_t size)
initializes stack buffer
void mem_exit(int rank)
exit instance of memory manager
int untag_mem(void *ptr)
stops tracking memory allocated by CTF, so user doesn't have to call free
void coalesce_bwd(int el_size, char *B, char const *B_aux, int k, int n, int kb)
Receives a contiguous kb-by-n buffer B and a (k-kb)-by-n buffer B_aux, which is the block below B, and coalesces the two into a single k-by-n column-major buffer stored in B.
void sy_calc_idx_arr(int order, int const *lens, int const *sym, int64_t idx, int *idx_arr)
same as calc_idx_arr, except that it assumes sym contains only NS or SY
void permute_target(int order, int const *perm, int *arr)
permutes a permutation array
void socopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t *&sizes_b, int64_t *&offsets_b)
void mem_create()
create instance of memory manager
void lda_cpy(int el_size, int nrow, int ncol, int lda_A, int lda_B, const char *A, char *B)
Copies submatrix to submatrix (column-major)
int64_t choose(int64_t n, int64_t k)
void factorize(int n, int *nfactor, int **factor)
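computes the factorization of n, returning the number of factors through nfactor and the array of factors through factor (the brief attached here, "computes the size of a tensor in packed symmetric layout", describes a different routine)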
int64_t getTotalSystemMemory()
int64_t chchoose(int64_t n, int64_t k)
void spcopy(int64_t m, int64_t n, int64_t lda_a, int64_t lda_b, int64_t const *sizes_a, int64_t const *offsets_a, char const *a, int64_t const *sizes_b, int64_t const *offsets_b, char *b)