3 #include "../shared/util.h" 8 #include "../tensor/untyped_tensor.h" 9 #include "../shared/model.h" 72 printf(
"ctr_virt:\n");
73 printf(
"blk_sz_A = %ld, blk_sz_B = %ld, blk_sz_C = %ld\n",
76 printf(
"virt_dim[%d] = %d\n", i,
virt_dim[i]);
// ctr_virt::run(): scratch arrays for walking the virtual block grid
int * idx_arr, * tidx_arr, * lda_A, * lda_B, * lda_C, * beta_arr;
int * ilda_A, * ilda_B, * ilda_C;
int64_t i, off_A, off_B, off_C;
int nb_A, nb_B, nb_C, alloced, ret;
// Build, per tensor X, the stride of each of its dimensions within the
// virtual block grid (lda_X) and the stride of each global index (ilda_X).
#define SET_LDA_X(__X)                                  \
  nb_##__X = 1;                                         \
  for (i=0; i<order_##__X; i++){                        \
    lda_##__X[i] = nb_##__X;                            \
    nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]];     \
  }                                                     \
  memset(ilda_##__X, 0, num_dim*sizeof(int));           \
  for (i=0; i<order_##__X; i++){                        \
    ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];       \
  }
SET_LDA_X(A); SET_LDA_X(B); SET_LDA_X(C);
#undef SET_LDA_X

// beta_arr records, per output block, whether it has already been written to
memset(beta_arr, 0, nb_C*sizeof(int));
#pragma omp parallel private(off_A,off_B,off_C,tidx_arr,i)
{
  int tid, ntd, start_off, end_off;
  tid = omp_get_thread_num();
  ntd = omp_get_num_threads();

  // each thread keeps its own index tuple into the virtual block grid
  tidx_arr = idx_arr + tid*num_dim;
  memset(tidx_arr, 0, num_dim*sizeof(int));

  // split the nb_C output blocks evenly; the first nb_C%ntd threads get one extra
  start_off = (nb_C/ntd)*tid;
  if (tid < nb_C%ntd){
    start_off += tid;
    end_off = start_off + nb_C/ntd + 1;
  } else {
    start_off += nb_C%ntd;
    end_off = start_off + nb_C/ntd;
  }
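  // Worked example (illustrative numbers, not from the source): with nb_C = 10
  // output blocks and ntd = 4 threads, the [start_off, end_off) ranges come out
  // as [0,3), [3,6), [6,8), [8,10), so the first nb_C%ntd threads each take one
  // extra block.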
  off_A = 0, off_B = 0, off_C = 0;
  for (;;){
    if (off_C >= start_off && off_C < end_off) {
      if (beta_arr[off_C]>0){
        /* output block seen before: the elided recursive call accumulates
           into it instead of scaling by the user-provided beta */
      }
      /* ... recursive contraction of this block elided in this excerpt ... */
    }

    // advance the index tuple over the virtual block grid like an odometer,
    // updating the flattened offsets into A, B, and C incrementally
    for (i=0; i<num_dim; i++){
      off_A -= ilda_A[i]*tidx_arr[i];
      off_B -= ilda_B[i]*tidx_arr[i];
      off_C -= ilda_C[i]*tidx_arr[i];
      tidx_arr[i]++;
      if (tidx_arr[i] >= virt_dim[i]) tidx_arr[i] = 0;
      off_A += ilda_A[i]*tidx_arr[i];
      off_B += ilda_B[i]*tidx_arr[i];
      off_C += ilda_C[i]*tidx_arr[i];
      if (tidx_arr[i] != 0)
        break;
    }
    if (i==num_dim) break;
  }
}
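// Illustrative trace (values assumed, not from the source): with num_dim = 2 and
// virt_dim = {2,3}, tidx_arr visits (0,0), (1,0), (0,1), (1,1), (0,2), (1,2), and
// the outer loop exits once the tuple wraps back to all zeros (i == num_dim).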
// seq_tsr_ctr constructor, parameter list (excerpt)
                        iparam const * inner_params,
                        int *          virt_blk_len_A,
                        int *          virt_blk_len_B,
                        int *          virt_blk_len_C,
// copy the symmetry descriptors of A, B, and C so they can be modified locally
int * new_sym_A, * new_sym_B, * new_sym_C;
alloc_ptr(sizeof(int)*c->A->order, (void**)&new_sym_A);
memcpy(new_sym_A, c->A->sym, sizeof(int)*c->A->order);
alloc_ptr(sizeof(int)*c->B->order, (void**)&new_sym_B);
memcpy(new_sym_B, c->B->sym, sizeof(int)*c->B->order);
alloc_ptr(sizeof(int)*c->C->order, (void**)&new_sym_C);
memcpy(new_sym_C, c->C->sym, sizeof(int)*c->C->order);
} else if (is_inner == 1) {
  // the contraction has been folded into an inner matrix multiply with
  // dimensions l, n, m, k
  DPRINTF(3,"Folded tensor l=%ld n=%ld m=%ld k=%ld\n", inner_params->l, inner_params->n,
          inner_params->m, inner_params->k);

  this->inner_params.sz_C = vrt_sz_C;
259 for (i=0; i<itsr->
order; i++){
261 for (k=0; k<c->
A->
order; k++){
262 if (c->
A->
sym[k] ==
NS) j--;
266 while (k>0 && c->
A->
sym[k-1] !=
NS){
272 virt_blk_len_A[k] = 1;
277 for (i=0; i<itsr->
order; i++){
279 for (k=0; k<c->
B->
order; k++){
280 if (c->
B->
sym[k] ==
NS) j--;
284 while (k>0 && c->
B->
sym[k-1] !=
NS){
290 virt_blk_len_B[k] = 1;
295 for (i=0; i<itsr->
order; i++){
297 for (k=0; k<c->
C->
order; k++){
298 if (c->
C->
sym[k] ==
NS) j--;
302 while (k>0 && c->
C->
sym[k-1] !=
NS){
308 virt_blk_len_C[k] = 1;
this->sym_A = new_sym_A;
this->sym_B = new_sym_B;
this->sym_C = new_sym_C;
// seq_tsr_ctr::print()
printf("seq_tsr_ctr:\n");
for (i=0; i<order_A; i++){
  printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
}
for (i=0; i<order_B; i++){
  printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
}
for (i=0; i<order_C; i++){
  printf("edge_len_C[%d]=%d\n",i,edge_len_C[i]);
}
printf("is inner = %d\n", is_inner);
if (is_inner) printf("inner n = %ld m= %ld k = %ld l = %ld\n",
                     inner_params.n, inner_params.m, inner_params.k, inner_params.l);
return size_A+size_B+size_C;
// estimate the flop count: invert the index maps, then multiply together the
// edge length of every distinct contraction index
int idx_max, * rev_idx_map;
inv_idx(order_A, idx_map_A,
        order_B, idx_map_B,
        order_C, idx_map_C,
        &idx_max, &rev_idx_map);
for (int i=0; i<idx_max; i++){
  if      (rev_idx_map[3*i+0] != -1) flops*=edge_len_A[rev_idx_map[3*i+0]];
  else if (rev_idx_map[3*i+1] != -1) flops*=edge_len_B[rev_idx_map[3*i+1]];
  else if (rev_idx_map[3*i+2] != -1) flops*=edge_len_C[rev_idx_map[3*i+2]];
}
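// Illustrative example (not from the source): for a matrix multiply
// C["ij"] = A["ik"]*B["kj"], index i contributes edge_len_A[0], j contributes
// edge_len_B[1], and k contributes edge_len_A[1], so the product is the familiar
// m*n*k (the constant factor in front of flops is not shown in this excerpt).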
// est_time_fp(): pick the linear performance model matching this kernel variant
// (custom elementwise function vs. algebraic multiply-add, offloaded vs. folded
// inner GEMM vs. reference); the selection logic and the parameter vector ps are
// built earlier in the function and are elided here.
return seq_tsr_ctr_mdl_cst.est_time(ps);      // custom function, unfolded
return seq_tsr_ctr_mdl_cst_off.est_time(ps);  // custom function, offloaded
return seq_tsr_ctr_mdl_cst_inr.est_time(ps);  // custom function, folded inner GEMM
return seq_tsr_ctr_mdl_off.est_time(ps);      // offloaded GEMM
return seq_tsr_ctr_mdl_inr.est_time(ps);      // folded inner GEMM
return seq_tsr_ctr_mdl_ref.est_time(ps);      // reference (unblocked) kernel
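// Note: the LinModel<3> objects above are fit to past observations; assuming the
// usual linear form, an estimate is roughly c0*1 + c1*membw + c2*flops, with ps
// holding the three parameter values for this contraction.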
// seq_tsr_ctr::run(): each kernel variant is timed with MPI_Wtime() and the
// measurement is fed back into the matching linear performance model.

// branch using the custom elementwise function (sym_seq_ctr_cust)
double st_time = MPI_Wtime();
/* ... kernel invocation elided ... */
double exe_time = MPI_Wtime()-st_time;
seq_tsr_ctr_mdl_cst.observe(tps);

// branch using the folded inner GEMM / offload kernels (sym_seq_ctr_inr); one of
// these models is updated, depending on is_custom and whether the work is offloaded
double st_time = MPI_Wtime();
/* ... kernel invocation elided ... */
double exe_time = MPI_Wtime()-st_time;
seq_tsr_ctr_mdl_cst_off.observe(tps);
seq_tsr_ctr_mdl_cst_inr.observe(tps);
seq_tsr_ctr_mdl_off.observe(tps);
seq_tsr_ctr_mdl_inr.observe(tps);

// branch using the reference kernel (sym_seq_ctr_ref)
double st_time = MPI_Wtime();
/* ... kernel invocation elided ... */
double exe_time = MPI_Wtime()-st_time;
seq_tsr_ctr_mdl_ref.observe(tps);
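// tps is not shown in this excerpt; given that LinModel::observe() records the
// execution time followed by the model parameter values, it presumably packs
// exe_time together with the same parameters used by est_time_fp() above.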
// inv_idx(): build the inverse index map; idx_arr[3*idx + 0/1/2] holds the
// position (or -1) of contraction index idx in A, B, and C respectively.
for (i=0; i<order_A; i++){ if (idx_A[i] > dim_max) dim_max = idx_A[i]; }
for (i=0; i<order_B; i++){ if (idx_B[i] > dim_max) dim_max = idx_B[i]; }
for (i=0; i<order_C; i++){ if (idx_C[i] > dim_max) dim_max = idx_C[i]; }
dim_max++;
*order_tot = dim_max;
*idx_arr = (int*)alloc(sizeof(int)*3*dim_max);
std::fill((*idx_arr), (*idx_arr)+3*dim_max, -1);
for (i=0; i<order_A; i++){ (*idx_arr)[3*idx_A[i]  ] = i; }
for (i=0; i<order_B; i++){ (*idx_arr)[3*idx_B[i]+1] = i; }
for (i=0; i<order_C; i++){ (*idx_arr)[3*idx_C[i]+2] = i; }
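To make the inverse map concrete, here is a small, hypothetical usage for a
matrix-multiply-like contraction; the integer index labels are arbitrary and
only the layout of the returned array is the point.

// Hypothetical example of the map produced by inv_idx (labels: i=0, j=1, k=2)
int idx_A[] = {0, 2};                 // A["ik"]
int idx_B[] = {2, 1};                 // B["kj"]
int idx_C[] = {0, 1};                 // C["ij"]
int order_tot, * rev;
inv_idx(2, idx_A, 2, idx_B, 2, idx_C, &order_tot, &rev);
// order_tot == 3, and rev[3*idx + {0,1,2}] gives the position of each index
// in A, B, and C (or -1 if absent):
//   i (0): { 0, -1,  0}
//   j (1): {-1,  1,  1}
//   k (2): { 1,  0, -1}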
Referenced declarations:

CTF_int::CommData cdt
communicator data for MPI comm defining this world
int * sym
symmetries among tensor dimensions
double est_time(double const *param)
estimates model time based on observations
void observe(double const *time_param)
records an observation consisting of the execution time and nparam parameter values
double seq_tsr_ctr_mdl_ref_init[]
virtual int64_t mem_rec()
double seq_tsr_ctr_mdl_inr_init[]
int * inner_ordering
ordering of the dimensions according to which the tensor is folded
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
invert index map
void * alloc(int64_t len)
alloc abstraction
~ctr_virt()
deallocates ctr_virt object
seq_tsr_ctr(ctr *other)
clones ctr object
LinModel< 3 > seq_tsr_ctr_mdl_cst(seq_tsr_ctr_mdl_cst_init,"seq_tsr_ctr_mdl_cst")
bivar_function const * func
function to execute on elements
int order
number of tensor dimensions
bool is_custom
whether there is an elementwise custom function
CTF::World * wrld
distributed processor context on which tensor is defined
ctr_virt(ctr *other)
copies ctr_virt object
class for executing distributed contraction of tensors
int * idx_B
indices of right operand
void run(char *A, char *B, char *C)
iterates over the dense virtualization block grid and contracts
Linear performance model which, given measurements, provides a new model guess.
double seq_tsr_ctr_mdl_cst_inr_init[]
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
int sym_seq_ctr_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, bivar_function const *func)
performs symmetric contraction with custom elementwise function
double seq_tsr_ctr_mdl_off_init[]
double est_time_rec(int nlyr)
int sym_seq_ctr_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C)
performs symmetric contraction with reference (unblocked) kernel
LinModel< 3 > seq_tsr_ctr_mdl_cst_inr(seq_tsr_ctr_mdl_cst_inr_init,"seq_tsr_ctr_mdl_cst_inr")
tensor * rec_tsr
representation of folded tensor (shares data pointer)
LinModel< 3 > seq_tsr_ctr_mdl_ref(seq_tsr_ctr_mdl_ref_init,"seq_tsr_ctr_mdl_ref")
LinModel< 3 > seq_tsr_ctr_mdl_off(seq_tsr_ctr_mdl_off_init,"seq_tsr_ctr_mdl_off")
double est_time_rec(int nlyr)
virtual double est_time_rec(int nlyr)
bool should_observe(double const *time_param)
decides whether the current instance should be observed
int * idx_C
indices of output
int sym_seq_ctr_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, char const *beta, char *C, algstrct const *sr_C, int order_C, int const *edge_len_C, int const *sym_C, int const *idx_map_C, iparam const *prm, bivar_function const *func)
performs symmetric contraction with blocked gemm
bivar_function const * func
double seq_tsr_ctr_mdl_cst_off_init[]
virtual void run(char *A, char *B, char *C)
int el_size
size of each element of algstrct in bytes
void run(char *A, char *B, char *C)
wraps user sequential function signature
int cdealloc(void *ptr)
free abstraction
internal distributed tensor class
LinModel< 3 > seq_tsr_ctr_mdl_cst_off(seq_tsr_ctr_mdl_cst_off_init,"seq_tsr_ctr_mdl_cst_off")
double seq_tsr_ctr_mdl_cst_init[]
LinModel< 3 > seq_tsr_ctr_mdl_inr(seq_tsr_ctr_mdl_inr_init,"seq_tsr_ctr_mdl_inr")
char const * alpha
scaling of A*B
int * idx_A
indices of left operand
virtual char const * mulid() const
identity element for multiplication, i.e. 1
double est_time_fp(int nlyr)
int64_t sy_packed_size(int order, const int *len, const int *sym)
computes the size of a tensor in SY (NOT HOLLOW) packed symmetric layout
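As a worked example of the SY packed layout (a sketch based on the standard
symmetric-packing count, not taken from the source): a group of g mutually
symmetric dimensions of edge length n, with the diagonal kept, stores
binomial(n+g-1, g) elements, so an order-2 SY tensor with n = 4 stores
4*5/2 = 10 values instead of 16. The helper below is hypothetical and only
illustrates that count for a single symmetric group.

#include <cstdint>

// Hypothetical helper: packed element count of one SY group of g dimensions
// of edge length n, i.e. binomial(n+g-1, g); the division is exact at every step.
int64_t sy_group_size(int n, int g){
  int64_t sz = 1;
  for (int i = 0; i < g; i++)
    sz = sz * (n + i) / (i + 1);
  return sz;
}
// sy_group_size(4, 2) == 10, matching the 4*5/2 count above.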