3 #include "../shared/util.h"     8 #include "../tensor/untyped_tensor.h"     9 #include "../shared/model.h"    72     printf(
"ctr_virt:\n");
    73     printf(
"blk_sz_A = %ld, blk_sz_B = %ld, blk_sz_C = %ld\n",
    76       printf(
"virt_dim[%d] = %d\n", i, 
virt_dim[i]);
  ...
    int * idx_arr, * tidx_arr, * lda_A, * lda_B, * lda_C, * beta_arr;
    int * ilda_A, * ilda_B, * ilda_C;
    int64_t i, off_A, off_B, off_C;
    int nb_A, nb_B, nb_C, alloced, ret;
  ...
  #define SET_LDA_X(__X)                                                \
    nb_##__X = 1;                                                       \
    for (i=0; i<order_##__X; i++){                                      \
      lda_##__X[i] = nb_##__X;                                          \
      nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]];                   \
    }                                                                   \
    memset(ilda_##__X, 0, num_dim*sizeof(int));                         \
    for (i=0; i<order_##__X; i++){                                      \
      ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];                     \
    }
  ...
    memset(beta_arr, 0, nb_C*sizeof(int));
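For orientation, here is a small self-contained sketch of the same stride bookkeeping for a single operand: lda gives each operand dimension's stride within that operand's grid of virtual blocks, and ilda scatters those strides onto the global index space so a global block index can be translated into a linear offset. The helper name compute_strides and the toy sizes are illustrative, not part of the source.

#include <cstring>
#include <vector>
#include <cstdio>

// Illustrative only: mirrors the SET_LDA_X bookkeeping for one operand.
// lda[d]  = stride of operand dimension d in the operand's virtual block grid
// ilda[g] = stride contributed by global index g (0 if the operand lacks it)
static void compute_strides(int order, const int *idx_map, const int *virt_dim,
                            int num_dim, std::vector<int> &lda,
                            std::vector<int> &ilda, int &nb) {
  lda.assign(order, 0);
  ilda.assign(num_dim, 0);
  nb = 1;
  for (int i = 0; i < order; i++) {
    lda[i] = nb;                      // column-major style stride
    nb *= virt_dim[idx_map[i]];       // total number of virtual blocks
  }
  for (int i = 0; i < order; i++)
    ilda[idx_map[i]] += lda[i];
}

int main() {
  // Toy contraction C["ij"] = A["ik"]*B["kj"] with global indices i=0, j=1, k=2
  int virt_dim[3] = {2, 3, 4};        // virtualization factor per global index
  int idx_map_A[2] = {0, 2};          // A is indexed by (i, k)
  std::vector<int> lda, ilda;
  int nb;
  compute_strides(2, idx_map_A, virt_dim, 3, lda, ilda, nb);
  printf("nb_A = %d, lda = {%d, %d}, ilda = {%d, %d, %d}\n",
         nb, lda[0], lda[1], ilda[0], ilda[1], ilda[2]);   // nb_A = 8
  return 0;
}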
  ...
  #pragma omp parallel private(off_A,off_B,off_C,tidx_arr,i)
    {
      int tid, ntd, start_off, end_off;
      ...
      tid = omp_get_thread_num();
      ...
        tidx_arr = idx_arr + tid*num_dim;
        memset(tidx_arr, 0, num_dim*sizeof(int));
        ...
        start_off = (nb_C/ntd)*tid;
        ...
          end_off = start_off + nb_C/ntd + 1;
        ...
          start_off += nb_C%ntd;
          end_off = start_off + nb_C/ntd;
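The conditionals elided above implement the usual balanced split of the nb_C output blocks over ntd threads, with the first nb_C % ntd threads taking one extra block. A standalone sketch of that partition (the function name block_range and the example sizes are invented for illustration):

#include <cstdio>

// Illustrative: give thread `tid` of `ntd` a contiguous [start, end) range of
// `nb` blocks, handing one extra block to each of the first nb % ntd threads.
static void block_range(int nb, int ntd, int tid, int &start_off, int &end_off) {
  int q = nb / ntd, r = nb % ntd;
  start_off = q * tid;
  if (tid < r) {
    start_off += tid;              // earlier threads already took one extra each
    end_off = start_off + q + 1;
  } else {
    start_off += r;                // all r extras lie before this thread
    end_off = start_off + q;
  }
}

int main() {
  // 10 blocks over 4 threads -> ranges [0,3) [3,6) [6,8) [8,10)
  for (int tid = 0; tid < 4; tid++) {
    int s, e;
    block_range(10, 4, tid, s, e);
    printf("thread %d: [%d, %d)\n", tid, s, e);
  }
  return 0;
}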
        ...
        off_A = 0, off_B = 0, off_C = 0;
        ...
          if (off_C >= start_off && off_C < end_off) {
            if (beta_arr[off_C]>0)
              ...
          }
          for (i=0; i<num_dim; i++){
            off_A -= ilda_A[i]*tidx_arr[i];
            off_B -= ilda_B[i]*tidx_arr[i];
            off_C -= ilda_C[i]*tidx_arr[i];
            ...
            off_A += ilda_A[i]*tidx_arr[i];
            off_B += ilda_B[i]*tidx_arr[i];
            off_C += ilda_C[i]*tidx_arr[i];
            if (tidx_arr[i] != 0) break;
          }
          if (i==num_dim) break;
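This loop walks the dense grid of virtual blocks like an odometer: the elided lines advance tidx_arr[i] and wrap it at virt_dim[i], while the linear offsets off_A, off_B, off_C are kept current incrementally by subtracting the old contribution of index i and adding the new one. A compact standalone illustration of that traversal (the 3-index grid and strides below are made up):

#include <cstdio>
#include <vector>

int main() {
  // Odometer walk over a 2 x 3 x 2 grid of virtual blocks, keeping a linear
  // offset up to date incrementally (as ctr_virt::run does for off_A/B/C).
  const int num_dim = 3;
  int virt_dim[num_dim] = {2, 3, 2};
  int ilda[num_dim]     = {1, 2, 6};   // strides of each grid dimension
  std::vector<int> tidx(num_dim, 0);
  int off = 0;
  for (;;) {
    printf("block (%d,%d,%d) -> offset %d\n", tidx[0], tidx[1], tidx[2], off);
    int i;
    for (i = 0; i < num_dim; i++) {
      off -= ilda[i] * tidx[i];        // remove old contribution of index i
      tidx[i]++;
      if (tidx[i] >= virt_dim[i]) tidx[i] = 0;
      off += ilda[i] * tidx[i];        // add new contribution of index i
      if (tidx[i] != 0) break;         // no carry: stop advancing
    }
    if (i == num_dim) break;           // carried past the last digit: done
  }
  return 0;
}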
  ...
                           iparam const *      inner_params,
                           int *               virt_blk_len_A,
                           int *               virt_blk_len_B,
                           int *               virt_blk_len_C,
  ...
    int * new_sym_A, * new_sym_B, * new_sym_C;
    ...
    memcpy(new_sym_A, c->A->sym, sizeof(int)*c->A->order);
    memcpy(new_sym_B, c->B->sym, sizeof(int)*c->B->order);
    memcpy(new_sym_C, c->C->sym, sizeof(int)*c->C->order);
  ...
    } else if (is_inner == 1) {
      ...
        DPRINTF(3,"Folded tensor l=%ld n=%ld m=%ld k=%ld\n", inner_params->l, inner_params->n,
          inner_params->m, inner_params->k);
      ...
      this->inner_params.sz_C = vrt_sz_C;
      ...
      for (i=0; i<itsr->order; i++){
        ...
        for (k=0; k<c->A->order; k++){
          if (c->A->sym[k] == NS) j--;
          ...
        }
        while (k>0 && c->A->sym[k-1] != NS){
          ...
        }
        ...
        virt_blk_len_A[k] = 1;
        ...
      }
      for (i=0; i<itsr->order; i++){
        ...
        for (k=0; k<c->B->order; k++){
          if (c->B->sym[k] == NS) j--;
          ...
        }
        while (k>0 && c->B->sym[k-1] != NS){
          ...
        }
        ...
        virt_blk_len_B[k] = 1;
        ...
      }
      for (i=0; i<itsr->order; i++){
        ...
        for (k=0; k<c->C->order; k++){
          if (c->C->sym[k] == NS) j--;
          ...
        }
        while (k>0 && c->C->sym[k-1] != NS){
          ...
        }
        ...
        virt_blk_len_C[k] = 1;
        ...
      }
  ...
    this->sym_A      = new_sym_A;
    ...
    this->sym_B      = new_sym_B;
    ...
    this->sym_C      = new_sym_C;
  ...
    printf("seq_tsr_ctr:\n");
    for (i=0; i<order_A; i++){
      printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
    }
    for (i=0; i<order_B; i++){
      printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
    }
    for (i=0; i<order_C; i++){
      printf("edge_len_C[%d]=%d\n",i,edge_len_C[i]);
    }
    printf("is inner = %d\n", is_inner);
    if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld\n",
      ...
  ...
    return size_A+size_B+size_C;
  ...
    int idx_max, * rev_idx_map;
    inv_idx(order_A, idx_map_A, order_B, idx_map_B, order_C, idx_map_C,
            &idx_max, &rev_idx_map);
    ...
    for (int i=0; i<idx_max; i++){
      if (rev_idx_map[3*i+0] != -1) flops*=edge_len_A[rev_idx_map[3*i+0]];
      else if (rev_idx_map[3*i+1] != -1) flops*=edge_len_B[rev_idx_map[3*i+1]];
      else if (rev_idx_map[3*i+2] != -1) flops*=edge_len_C[rev_idx_map[3*i+2]];
    }
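The rule here is that the flop estimate is the product of the edge lengths of all distinct contraction indices, each index contributing its length exactly once no matter how many operands carry it. A toy check of that rule for C["ij"] = A["ik"]*B["kj"]; the lengths and the factor of 2 for a multiply-add are assumptions of the example, not values from the source:

#include <cstdint>
#include <cstdio>

int main() {
  // C["ij"] = A["ik"]*B["kj"]: global indices i=0, j=1, k=2.
  // rev_idx_map[3*g + {0,1,2}] = position of global index g in A, B, C (or -1).
  int rev_idx_map[9] = {
    0, -1,  0,   // i: dim 0 of A, dim 0 of C
   -1,  1,  1,   // j: dim 1 of B, dim 1 of C
    1,  0, -1    // k: dim 1 of A, dim 0 of B
  };
  int edge_len_A[2] = {4, 6};   // (i, k)
  int edge_len_B[2] = {6, 5};   // (k, j)
  int edge_len_C[2] = {4, 5};   // (i, j)
  int64_t flops = 2;            // assumed: one multiply + one add per index tuple
  for (int g = 0; g < 3; g++) {
    if      (rev_idx_map[3*g+0] != -1) flops *= edge_len_A[rev_idx_map[3*g+0]];
    else if (rev_idx_map[3*g+1] != -1) flops *= edge_len_B[rev_idx_map[3*g+1]];
    else if (rev_idx_map[3*g+2] != -1) flops *= edge_len_C[rev_idx_map[3*g+2]];
  }
  printf("estimated flops = %lld\n", (long long)flops);  // 2*4*5*6 = 240
  return 0;
}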
    ...
      return seq_tsr_ctr_mdl_cst.est_time(ps);
    ...
          return seq_tsr_ctr_mdl_cst_off.est_time(ps);
        ...
          return seq_tsr_ctr_mdl_cst_inr.est_time(ps);
        ...
          return seq_tsr_ctr_mdl_off.est_time(ps);
        ...
          return seq_tsr_ctr_mdl_inr.est_time(ps);
    ...
      return seq_tsr_ctr_mdl_ref.est_time(ps);
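Each seq_tsr_ctr_mdl_* object is a LinModel<3>, i.e. a three-parameter linear cost model. The sketch below is a conceptual stand-in showing what such an est_time call amounts to, not CTF's LinModel implementation; the coefficient and parameter values are invented:

#include <cstdio>

// Conceptual stand-in for a 3-parameter linear cost model: predicted time is a
// dot product of learned coefficients with the runtime parameters (here, e.g.,
// a constant term, bytes moved, and flops). Coefficient values are invented.
struct ToyLinModel3 {
  double coeff[3];
  double est_time(const double *param) const {
    return coeff[0]*param[0] + coeff[1]*param[1] + coeff[2]*param[2];
  }
};

int main() {
  ToyLinModel3 mdl = {{1e-6, 5e-10, 2e-11}};   // latency, per-byte, per-flop
  double ps[3] = {1.0, 3.2e6, 2.4e8};          // [1, bytes, flops] for one call
  printf("predicted time = %.6f s\n", mdl.est_time(ps));
  return 0;
}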
  ...
      double st_time = MPI_Wtime();
      ...
      double exe_time = MPI_Wtime()-st_time;
      ...
      seq_tsr_ctr_mdl_cst.observe(tps);
    ...
      double st_time = MPI_Wtime();
      ...
      double exe_time = MPI_Wtime()-st_time;
      ...
          seq_tsr_ctr_mdl_cst_off.observe(tps);
        ...
          seq_tsr_ctr_mdl_cst_inr.observe(tps);
        ...
          seq_tsr_ctr_mdl_off.observe(tps);
        ...
          seq_tsr_ctr_mdl_inr.observe(tps);
    ...
      double st_time = MPI_Wtime();
      ...
      double exe_time = MPI_Wtime()-st_time;
      ...
      seq_tsr_ctr_mdl_ref.observe(tps);
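All three branches share one measurement pattern: time the sequential kernel with MPI_Wtime and pass the elapsed time together with the model parameters to observe(), which records the sample for refitting. A hedged sketch of that pattern; the kernel, the model type, and the exact layout of tps are placeholders rather than CTF's definitions:

#include <mpi.h>
#include <cstdio>

// Placeholder pieces: a fake kernel and a fake model with a CTF-like observe().
static void fake_kernel() { volatile double s = 0; for (int i = 0; i < 1000000; i++) s += i; }
struct ToyModel {
  // time_param[0] is the measured time, the remaining entries are the parameters.
  void observe(const double *time_param) {
    printf("observed t=%.6f s with params (%.0f, %.0f, %.0f)\n",
           time_param[0], time_param[1], time_param[2], time_param[3]);
  }
};

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);
  ToyModel mdl;
  double ps[3] = {1.0, 3.2e6, 2.4e8};          // illustrative parameter vector
  double st_time = MPI_Wtime();
  fake_kernel();                               // stands in for the contraction kernel
  double exe_time = MPI_Wtime() - st_time;
  double tps[4] = {exe_time, ps[0], ps[1], ps[2]};
  mdl.observe(tps);                            // model is refit from accumulated samples
  MPI_Finalize();
  return 0;
}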
  ...
    for (i=0; i<order_A; i++)
      if (idx_A[i] > dim_max) dim_max = idx_A[i];
    for (i=0; i<order_B; i++)
      if (idx_B[i] > dim_max) dim_max = idx_B[i];
    for (i=0; i<order_C; i++)
      if (idx_C[i] > dim_max) dim_max = idx_C[i];
    dim_max++;
    *order_tot = dim_max;
    ...
    std::fill((*idx_arr), (*idx_arr)+3*dim_max, -1);
    for (i=0; i<order_A; i++)
      (*idx_arr)[3*idx_A[i]] = i;
    for (i=0; i<order_B; i++)
      (*idx_arr)[3*idx_B[i]+1] = i;
    for (i=0; i<order_C; i++)
      (*idx_arr)[3*idx_C[i]+2] = i;
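inv_idx builds the reverse map used throughout this file: for each distinct index value it records the dimension at which that index appears in A, B, and C, or -1 if absent. A standalone illustration of the layout it produces for C["ij"] = A["ik"]*B["kj"] (re-implemented locally for the example rather than calling into CTF):

#include <cstdio>
#include <vector>

int main() {
  // Global indices: i=0, j=1, k=2. idx_arr[3*g + {0,1,2}] ends up holding the
  // dimension at which global index g appears in A/B/C, or -1 if it does not.
  int idx_A[2] = {0, 2}, idx_B[2] = {2, 1}, idx_C[2] = {0, 1};
  int dim_max = -1;
  for (int i = 0; i < 2; i++) { if (idx_A[i] > dim_max) dim_max = idx_A[i]; }
  for (int i = 0; i < 2; i++) { if (idx_B[i] > dim_max) dim_max = idx_B[i]; }
  for (int i = 0; i < 2; i++) { if (idx_C[i] > dim_max) dim_max = idx_C[i]; }
  dim_max++;                                     // count of distinct indices
  std::vector<int> idx_arr(3 * dim_max, -1);
  for (int i = 0; i < 2; i++) idx_arr[3*idx_A[i] + 0] = i;
  for (int i = 0; i < 2; i++) idx_arr[3*idx_B[i] + 1] = i;
  for (int i = 0; i < 2; i++) idx_arr[3*idx_C[i] + 2] = i;
  for (int g = 0; g < dim_max; g++)
    printf("index %d: A dim %2d, B dim %2d, C dim %2d\n",
           g, idx_arr[3*g], idx_arr[3*g+1], idx_arr[3*g+2]);
  return 0;
}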
 
CTF_int::CommData cdt
communicator data for MPI comm defining this world
int * sym
symmetries among tensor dimensions
double est_time(double const *param)
estimates model time based on observations
void observe(double const *time_param)
records an observation consisting of the execution time and nparam parameter values
double seq_tsr_ctr_mdl_ref_init[]
virtual int64_t mem_rec()
double seq_tsr_ctr_mdl_inr_init[]
int * inner_ordering
ordering of the dimensions according to which the tensor is folded
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
inverts the index map
void * alloc(int64_t len)
alloc abstraction
~ctr_virt()
deallocates ctr_virt object
seq_tsr_ctr(ctr *other)
clones ctr object
LinModel< 3 > seq_tsr_ctr_mdl_cst(seq_tsr_ctr_mdl_cst_init,"seq_tsr_ctr_mdl_cst")
bivar_function const * func
function to execute on elements
int order
number of tensor dimensions
bool is_custom
whether there is an elementwise custom function
CTF::World * wrld
distributed processor context on which tensor is defined
ctr_virt(ctr *other)
copies ctr_virt object
class for executing distributed contraction of tensors
int * idx_B
indices of right operand
void run(char *A, char *B, char *C)
iterates over the dense virtualization block grid and contracts
Linear performance models which, given measurements, provide a new model guess.
double seq_tsr_ctr_mdl_cst_inr_init[]
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
int sym_seq_ctr_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, bivar_function const *func)
performs symmetric contraction with a custom elementwise function
double seq_tsr_ctr_mdl_off_init[]
double est_time_rec(int nlyr)
int sym_seq_ctr_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C)
performs symmetric contraction with the reference (unblocked) kernel
LinModel< 3 > seq_tsr_ctr_mdl_cst_inr(seq_tsr_ctr_mdl_cst_inr_init,"seq_tsr_ctr_mdl_cst_inr")
tensor * rec_tsr
representation of the folded tensor (shares data pointer)
LinModel< 3 > seq_tsr_ctr_mdl_ref(seq_tsr_ctr_mdl_ref_init,"seq_tsr_ctr_mdl_ref")
LinModel< 3 > seq_tsr_ctr_mdl_off(seq_tsr_ctr_mdl_off_init,"seq_tsr_ctr_mdl_off")
virtual double est_time_rec(int nlyr)
bool should_observe(double const *time_param)
decides whether the current instance should be observed
int * idx_C
indices of output
int sym_seq_ctr_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, iparam const *prm, bivar_function const *func)
performs symmetric contraction with blocked gemm
double seq_tsr_ctr_mdl_cst_off_init[]
virtual void run(char *A, char *B, char *C)
int el_size
size of each element of algstrct in bytes
void run(char *A, char *B, char *C)
wraps the user's sequential function signature
int cdealloc(void *ptr)
free abstraction
internal distributed tensor class
LinModel< 3 > seq_tsr_ctr_mdl_cst_off(seq_tsr_ctr_mdl_cst_off_init,"seq_tsr_ctr_mdl_cst_off")
double seq_tsr_ctr_mdl_cst_init[]
LinModel< 3 > seq_tsr_ctr_mdl_inr(seq_tsr_ctr_mdl_inr_init,"seq_tsr_ctr_mdl_inr")
char const * alpha
scaling factor applied to A*B
int * idx_A
indices of left operand
virtual char const * mulid() const
identity element for multiplication, i.e. 1
double est_time_fp(int nlyr)
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (non-hollow) packed symmetric layout
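On the last entry: in SY (non-hollow) packed layout a group of g mutually symmetric dimensions of length n stores one element per index multiset, i.e. C(n+g-1, g) elements, and independent symmetric groups multiply. A small illustration of that counting rule (this shows the combinatorial idea only, not CTF's sy_packed_size implementation):

#include <cstdint>
#include <cstdio>

// Number of index multisets for g symmetric dimensions of length n: C(n+g-1, g).
static int64_t multiset_count(int64_t n, int64_t g) {
  int64_t num = 1, den = 1;
  for (int64_t i = 0; i < g; i++) { num *= n + i; den *= i + 1; }
  return num / den;
}

int main() {
  // Example: a 3-index tensor with the first two dimensions symmetric (SY) and
  // the third non-symmetric, all of length 4: C(5,2) * 4 = 10 * 4 = 40 elements.
  printf("packed size = %lld\n", (long long)(multiset_count(4, 2) * 4));
  return 0;
}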