#include "../shared/util.h"
#include "../sparse_formats/coo.h"
#include "../sparse_formats/csr.h"
#include "../tensor/untyped_tensor.h"
// ...
                  iparam const *    inner_params_,
// ...
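// Excerpted routines from three sparse-contraction classes: seq_tsr_spctr
// (the sequential kernel wrapper and its performance models), spctr_virt
// (iteration over the virtual block grid), and spctr_pin_keys (conversion of
// sparse pairs between global and local keys).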
  int * new_sym_A, * new_sym_B, * new_sym_C;
// ...
  memcpy(new_sym_A, c->A->sym, sizeof(int)*c->A->order);
// ...
  memcpy(new_sym_B, c->B->sym, sizeof(int)*c->B->order);
// ...
  memcpy(new_sym_C, c->C->sym, sizeof(int)*c->C->order);
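  // Private copies of each operand's symmetry array are taken so they can be
  // adjusted for the folded (inner) dimensions without modifying the tensors
  // themselves; they are installed as this->sym_A/B/C further below.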
  DPRINTF(2, "Folded tensor n=%ld m=%ld k=%ld\n",
          inner_params_->n, inner_params_->m, inner_params_->k);
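  // For each dimension of the folded (inner) tensor, the loops below locate
  // the corresponding group of dimensions of the original operand (symmetric
  // dimensions are grouped, hence the backward scan while sym[k-1] != NS) and
  // force its virtual block length to 1, since those dimensions are presumably
  // handled by the folded inner kernel. The same pattern is repeated for A, B
  // and C.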
  for (i=0; i<itsr->order; i++){
// ...
    for (k=0; k<c->A->order; k++){
      if (c->A->sym[k] == NS) j--;
// ...
    }
// ...
    while (k>0 && c->A->sym[k-1] != NS){
// ...
    }
// ...
      virt_blk_len_A[k] = 1;
// ...
  }
  for (i=0; i<itsr->order; i++){
// ...
    for (k=0; k<c->B->order; k++){
      if (c->B->sym[k] == NS) j--;
// ...
    }
// ...
    while (k>0 && c->B->sym[k-1] != NS){
// ...
    }
// ...
      virt_blk_len_B[k] = 1;
// ...
  }

  for (i=0; i<itsr->order; i++){
// ...
    for (k=0; k<c->C->order; k++){
      if (c->C->sym[k] == NS) j--;
// ...
    }
// ...
    while (k>0 && c->C->sym[k-1] != NS){
// ...
    }
// ...
      virt_blk_len_C[k] = 1;
// ...
  }
// ...
  this->sym_A = new_sym_A;
// ...
  this->sym_B = new_sym_B;
// ...
  this->sym_C = new_sym_C;
  printf("seq_tsr_spctr:\n");
// ...
    printf("edge_len_A[%d]=%d\n", i, edge_len_A[i]);
// ...
    printf("edge_len_B[%d]=%d\n", i, edge_len_B[i]);
// ...
    printf("edge_len_C[%d]=%d\n", i, edge_len_C[i]);
// ...
  printf("kernel type is %d\n", krnl_type);
  if (krnl_type>0) printf("inner n = %ld m= %ld k = %ld sz_C=%ld\n",
// ...
  int idx_max, * rev_idx_map;
// ...
          &idx_max, &rev_idx_map);
// ...
  for (int i=0; i<idx_max; i++){
    if      (rev_idx_map[3*i+0] != -1) flops *= edge_len_A[rev_idx_map[3*i+0]];
    else if (rev_idx_map[3*i+1] != -1) flops *= edge_len_B[rev_idx_map[3*i+1]];
    else if (rev_idx_map[3*i+2] != -1) flops *= edge_len_C[rev_idx_map[3*i+2]];
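  // rev_idx_map (presumably filled by inv_idx just above) stores, for each of
  // the idx_max distinct contraction indices, its position in A, B and C (or
  // -1 if the index does not appear in that operand), three entries per index.
  // The flop estimate is the product of the edge length of every distinct
  // index, read from whichever operand carries it.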
  return size_A+size_B+size_C;
// ...
  double ps[] = {1.0,
                 (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C),
                 est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)};
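  // The time estimate is a linear model over three features: a constant term,
  // the estimated memory traffic (est_membw), and the estimated flop count
  // (est_fp), each taking the operands' nonzero fractions into account. ps is
  // the feature vector handed to LinModel<3>::est_time below.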
      return seq_tsr_spctr_cst_off_k0.est_time(ps);
// ...
      return seq_tsr_spctr_cst_k0.est_time(ps);
// ...
      return seq_tsr_spctr_off_k0.est_time(ps);
// ...
      return seq_tsr_spctr_k0.est_time(ps);
// ...
      return seq_tsr_spctr_cst_off_k1.est_time(ps);
// ...
      return seq_tsr_spctr_cst_k1.est_time(ps);
// ...
      return seq_tsr_spctr_off_k1.est_time(ps);
// ...
      return seq_tsr_spctr_k1.est_time(ps);
// ...
      return seq_tsr_spctr_cst_off_k2.est_time(ps);
// ...
      return seq_tsr_spctr_cst_k2.est_time(ps);
// ...
      return seq_tsr_spctr_off_k2.est_time(ps);
// ...
      return seq_tsr_spctr_k2.est_time(ps);
// ...
      return seq_tsr_spctr_cst_k3.est_time(ps);
// ...
      return seq_tsr_spctr_k3.est_time(ps);
// ...
      return seq_tsr_spctr_cst_k4.est_time(ps);
// ...
      return seq_tsr_spctr_k4.est_time(ps);
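  // One LinModel<3> instance exists per kernel configuration; the branches
  // elided above appear to select it by krnl_type (k0..k4) and by whether an
  // elementwise custom function (cst) and/or offload (off) is in use; only the
  // custom/non-custom variants appear for kernel types 3 and 4.
  //
  // Minimal sketch (not the CTF implementation, names hypothetical) of what a
  // three-parameter linear time model evaluates: a dot product of learned
  // coefficients with the feature vector ps = {1, membw, flops}.
static inline double example_lin_est_time(double const coeff[3], double const ps[3]){
  return coeff[0]*ps[0] + coeff[1]*ps[1] + coeff[2]*ps[2];
}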
  return est_time_fp(nlyr, nnz_frac_A, nnz_frac_B, nnz_frac_C);
// ...
                        char *          B,
                        int             nblk_B,
                        int64_t const * size_blk_B,
                        char *          C,
                        int             nblk_C,
                        int64_t *       size_blk_C,
// ...
  double st_time = MPI_Wtime();
// ...
  double nnz_frac_A = 1.0, nnz_frac_B = 1.0, nnz_frac_C = 1.0;
  double tps_[] = {0.0, 1.0,
                   (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C),
                   est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)};
// ...
  double exe_time = MPI_Wtime() - st_time;
  double tps[] = {exe_time, 1.0,
                  (double)est_membw(nnz_frac_A, nnz_frac_B, nnz_frac_C),
                  est_fp(nnz_frac_A, nnz_frac_B, nnz_frac_C)};
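  // tps_ carries the model features before execution (its leading time entry
  // is 0.0, since nothing has been measured yet), while tps is the observation
  // recorded afterwards: {measured time, 1.0, estimated memory traffic,
  // estimated flop count}. The observe(tps) calls below feed this sample back
  // into whichever linear model matches the kernel configuration.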
      seq_tsr_spctr_cst_k0.observe(tps);
// ...
      seq_tsr_spctr_cst_off_k1.observe(tps);
// ...
      seq_tsr_spctr_cst_k1.observe(tps);
// ...
      seq_tsr_spctr_off_k1.observe(tps);
// ...
      seq_tsr_spctr_cst_off_k2.observe(tps);
// ...
      seq_tsr_spctr_cst_k2.observe(tps);
// ...
      seq_tsr_spctr_off_k2.observe(tps);
// ...
      seq_tsr_spctr_cst_k3.observe(tps);
// ...
      seq_tsr_spctr_cst_k4.observe(tps);
  printf("spctr_virt:\n");
  printf("blk_sz_A = %ld, blk_sz_B = %ld, blk_sz_C = %ld\n",
// ...
    printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
                     char *          B,
                     int             nblk_B,
                     int64_t const * size_blk_B,
                     char *          C,
                     int             nblk_C,
                     int64_t *       size_blk_C,
// ...
  int * idx_arr, * tidx_arr, * lda_A, * lda_B, * lda_C, * beta_arr;
  int * ilda_A, * ilda_B, * ilda_C;
  int64_t i, off_A, off_B, off_C;
  int nb_A, nb_B, nb_C, alloced, ret;
#define SET_LDA_X(__X)                                  \
  /* ... */                                             \
  for (i=0; i<order_##__X; i++){                        \
    lda_##__X[i] = nb_##__X;                            \
    nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]];     \
  }                                                     \
  memset(ilda_##__X, 0, num_dim*sizeof(int));           \
  for (i=0; i<order_##__X; i++){                        \
    ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];       \
  }
// ...
  memset(beta_arr, 0, nb_C*sizeof(int));
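// SET_LDA_X builds, for tensor __X, the stride of each of its dimensions in
// the virtual block grid (lda) and accumulates those strides per contraction
// index (ilda), so that an index tuple can be turned into a linear block
// offset for A, B and C; presumably it is instantiated as SET_LDA_X(A),
// SET_LDA_X(B) and SET_LDA_X(C) in the elided code.
//
// Minimal sketch (not CTF code, names hypothetical) of how such per-index
// strides convert an index tuple into a linear block offset, which is what the
// ilda_* arrays are used for in the block loop below:
static inline int64_t example_block_offset(int num_dim, int const * idx, int const * ilda){
  int64_t off = 0;
  for (int d=0; d<num_dim; d++) off += (int64_t)idx[d]*ilda[d];
  return off;
}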
  int64_t * sp_offsets_A = NULL;
// ...
    sp_offsets_A = (int64_t*)alloc(sizeof(int64_t)*nb_A);
// ...
    for (int i=1; i<nb_A; i++){
      sp_offsets_A[i] = sp_offsets_A[i-1]+size_blk_A[i-1];
// ...
  int64_t * sp_offsets_B = NULL;
// ...
    sp_offsets_B = (int64_t*)alloc(sizeof(int64_t)*nb_B);
// ...
    for (int i=1; i<nb_B; i++){
      sp_offsets_B[i] = sp_offsets_B[i-1]+size_blk_B[i-1];
// ...
  int64_t * sp_offsets_C = NULL;
  int64_t * new_sp_szs_C = NULL;
  char ** buckets_C = NULL;
// ...
    sp_offsets_C = (int64_t*)alloc(sizeof(int64_t)*nb_C);
    new_sp_szs_C = size_blk_C;
// ...
    buckets_C = (char**)alloc(sizeof(char*)*nb_C);
    for (int i=0; i<nb_C; i++){
// ...
        sp_offsets_C[i] = sp_offsets_C[i-1]+size_blk_C[i-1];
      buckets_C[i] = C + sp_offsets_C[i];
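  // The serialized sparse operands are laid out block after block, so the byte
  // offset of block i is the prefix sum of the preceding blocks' sizes
  // (size_blk_*). buckets_C records a pointer to each output block inside C so
  // that individual blocks can later be swapped out when the kernel produces a
  // new sparse block.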
    int tid, ntd, start_off, end_off;
// ...
    tid = omp_get_thread_num();
// ...
    tidx_arr = idx_arr + tid*num_dim;
    memset(tidx_arr, 0, num_dim*sizeof(int));
// ...
    start_off = (nb_C/ntd)*tid;
// ...
      end_off = start_off + nb_C/ntd + 1;
// ...
      start_off += nb_C%ntd;
      end_off = start_off + nb_C/ntd;
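    // Each OpenMP thread gets its own index counter (tidx_arr) and a
    // contiguous range [start_off, end_off) of the nb_C output blocks; the
    // elided branch appears to give the first nb_C%ntd threads one extra
    // block.
    //
    // Illustrative sketch only (not CTF code, names hypothetical): a common
    // way to produce the same kind of contiguous partition of n blocks over t
    // threads, where the first n%t threads each take one extra block.
static inline void example_partition(int n, int t, int tid, int & start, int & end){
  int base = n / t;   // minimum blocks per thread
  int rem  = n % t;   // leftover blocks handed to the first threads
  start = tid * base + (tid < rem ? tid : rem);
  end   = start + base + (tid < rem ? 1 : 0);
}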
    off_A = 0, off_B = 0, off_C = 0;
// ...
      if (off_C >= start_off && off_C < end_off) {
// ...
        char * rec_A, * rec_B, * rec_C;
// ...
          rec_A = A + sp_offsets_A[off_A];
// ...
          rec_B = B + sp_offsets_B[off_B];
// ...
          rec_C = buckets_C[off_C];
// ...
        if (beta_arr[off_C]>0)
// ...
        bool do_dealloc = beta_arr[off_C] > 0;
// ...
        char * pass_C = is_sparse_C ? buckets_C[off_C] : rec_C;
        tid_rec_ctr->run(rec_A, 1, size_blk_A+off_A,
                         rec_B, 1, size_blk_B+off_B,
                         rec_C, 1, new_sp_szs_C+off_C,
// ...
        if (do_dealloc) cdealloc(buckets_C[off_C]);
        buckets_C[off_C] = pass_C;
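        // For a sparse output, the recursive kernel may allocate a fresh block
        // for its result; pass_C is presumably also handed to run() as the
        // (elided) output-block argument. Once the call returns, the previous
        // bucket is freed if it had been separately allocated (beta_arr>0 for
        // this block) and the bucket pointer is updated to the new block.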
// ...
        off_A -= ilda_A[i]*tidx_arr[i];
        off_B -= ilda_B[i]*tidx_arr[i];
        off_C -= ilda_C[i]*tidx_arr[i];
// ...
        off_A += ilda_A[i]*tidx_arr[i];
        off_B += ilda_B[i]*tidx_arr[i];
        off_C += ilda_C[i]*tidx_arr[i];
        if (tidx_arr[i] != 0) break;
// ...
      if (i==num_dim) break;
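      // The loop above advances tidx_arr like an odometer over the virtual
      // index space (the increment-and-wrap of tidx_arr[i] is elided): each
      // dimension's old contribution is subtracted from off_A/off_B/off_C, the
      // digit is advanced, the new contribution is added back, and iteration
      // stops at the first digit that did not wrap. Once every digit has
      // wrapped (i==num_dim), all virtual blocks have been visited.
      //
      // Minimal sketch (not CTF code, names hypothetical) of the generic
      // odometer increment over a dense multi-index space:
static inline bool example_odometer_step(int ndim, int * idx, int const * bound){
  int i;
  for (i = 0; i < ndim; i++){
    idx[i]++;                      // advance this digit
    if (idx[i] < bound[i]) break;  // no carry, done
    idx[i] = 0;                    // carry into the next digit
  }
  return i < ndim;                 // false once every digit has wrapped
}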
    int64_t new_size_C = 0;
    for (int i=0; i<nb_C; i++){
      new_size_C += new_sp_szs_C[i];
    }
// ...
    new_C = (char*)alloc(new_size_C);
// ...
    for (int i=0; i<nb_C; i++){
      memcpy(new_C+pfx, buckets_C[i], new_sp_szs_C[i]);
      pfx += new_sp_szs_C[i];
      if (beta_arr[i] > 0) cdealloc(buckets_C[i]);
    }
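    // The per-block sparse results are finally concatenated into a single
    // contiguous output buffer new_C, with pfx tracking the running byte
    // offset; blocks that the kernel allocated along the way are freed once
    // copied.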
  printf("spctr_pin_keys:\n");
// ...
      printf("transforming global keys of A to local keys\n");
// ...
      printf("transforming global keys of B to local keys\n");
// ...
      printf("transforming global keys of C to local keys\n");
// ...
  int64_t mem_usage = 0;
// ...
  for (int i=0; i<order; i++){
// ...
  return 2.*pin_keys_mdl.est_time(ps);
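  // The pinning cost is estimated with a two-parameter linear model
  // (pin_keys_mdl); the factor of 2 presumably accounts for the two passes:
  // converting keys to local form before the contraction and back to global
  // form (depin) afterwards.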
                         char *          B,
                         int             nblk_B,
                         int64_t const * size_blk_B,
                         char *          C,
                         int             nblk_C,
                         int64_t *       size_blk_C,
// ...
  for (int i=0; i<nblk_A; i++){
// ...
  for (int i=0; i<nblk_B; i++){
// ...
  for (int i=0; i<nblk_C; i++){
// ...
  double st_time = MPI_Wtime();
  char * nA, * nB, * nC;
  nA = A; nB = B; nC = C;
// ...
  double exe_time = MPI_Wtime()-st_time;
  double tps[] = {exe_time, 1.0, (double)nnz};
// ...
               nB, nblk_B, size_blk_B,
               nC, nblk_C, size_blk_C,
// ...
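  // The pinning pass is timed, with {time, 1.0, nnz} presumably recorded via
  // pin_keys_mdl.observe, after which the (possibly re-keyed) operands
  // nA/nB/nC are handed to the recursive contraction; the call's middle
  // arguments appear above.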
  st_time = MPI_Wtime();
  int64_t new_nnz_C=0;
  for (int i=0; i<nblk_C; i++){
// ...
  depin(sr_C, order, lens, divisor, nblk_C, virt_dim, phys_rank,
        new_C, new_nnz_C, size_blk_C, new_C, true);
  double exe_time = MPI_Wtime()-st_time;
  double tps[] = {exe_time, 1.0, (double)nnz};
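  // After the recursive contraction, depin converts the local keys of the
  // sparse output back to global keys (updating new_nnz_C and the per-block
  // sizes, with padding checked), and this de-pinning phase is timed the same
  // way as the pinning phase above.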