3 #include "../shared/util.h" 6 #include "../interface/fun_term.h" 7 #include "../interface/idx_tensor.h" 31 if (buffer != NULL)
cdealloc(buffer);
// tsum_virt::print(): report the virtual blocking parameters
printf("tsum_virt:\n");
printf("blk_sz_A = %ld, blk_sz_B = %ld\n", blk_sz_A, blk_sz_B);
for (i=0; i<num_dim; i++){
  printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
}
// tsum_virt::run(): set up per-index strides over the virtual block grid,
// then iterate over every virtual block and recursively sum it
int * idx_arr, * lda_A, * lda_B, * beta_arr;
int * ilda_A, * ilda_B;
int64_t i, off_A, off_B;
int nb_A, nb_B, alloced, ret;
...
idx_arr = (int*)this->buffer;
...
// lda_X: stride of each index of X in the block grid;
// ilda_X: stride contributed by each global index to X's block offset
#define SET_LDA_X(__X)                                      \
  for (i=0; i<order_##__X; i++){                            \
    lda_##__X[i] = nb_##__X;                                \
    nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]];         \
  }                                                         \
  memset(ilda_##__X, 0, num_dim*sizeof(int));               \
  for (i=0; i<order_##__X; i++){                            \
    ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];           \
  }
...
beta_arr = (int*)alloc(sizeof(int)*nb_B);
memset(idx_arr, 0, num_dim*sizeof(int));
memset(beta_arr, 0, nb_B*sizeof(int));
off_A = 0, off_B = 0;
...
// a block of B sees the user's beta only on its first visit; later visits
// accumulate (beta_arr records which blocks have already been written)
if (beta_arr[off_B]>0)
  ...
...
// odometer-style advance of the block multi-index idx_arr, updating the
// linearized block offsets of A and B incrementally
off_A -= ilda_A[i]*idx_arr[i];
off_B -= ilda_B[i]*idx_arr[i];
...
off_A += ilda_A[i]*idx_arr[i];
off_B += ilda_B[i]*idx_arr[i];
if (idx_arr[i] != 0) break;
...
if (i==num_dim) break;
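To make the traversal above concrete, here is a minimal, self-contained sketch (not CTF's actual code; the grid extents and strides are made up) of the odometer-style loop that walks the virtual block grid while incrementally maintaining the linearized block offsets of A and B:

#include <cstdio>
#include <vector>

int main(){
  int num_dim = 3;
  std::vector<int> virt_dim = {2, 3, 2};   // virtual grid extent per index
  std::vector<int> ilda_A   = {1, 2, 0};   // A's stride per index (0: index absent from A)
  std::vector<int> ilda_B   = {1, 0, 2};   // B's stride per index
  std::vector<int> idx_arr(num_dim, 0);    // current block multi-index
  long off_A = 0, off_B = 0;               // linearized block offsets
  for (;;){
    printf("block (%d,%d,%d) -> off_A = %ld, off_B = %ld\n",
           idx_arr[0], idx_arr[1], idx_arr[2], off_A, off_B);
    // odometer increment, updating the offsets incrementally
    int i;
    for (i=0; i<num_dim; i++){
      off_A -= ilda_A[i]*idx_arr[i];
      off_B -= ilda_B[i]*idx_arr[i];
      idx_arr[i] = (idx_arr[i] + 1) % virt_dim[i];
      off_A += ilda_A[i]*idx_arr[i];
      off_B += ilda_B[i]*idx_arr[i];
      if (idx_arr[i] != 0) break;   // no carry into the next index
    }
    if (i == num_dim) break;        // carried out of the last index: done
  }
  return 0;
}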
// tsum_replicate::print(): report the replication communicators for A and B
printf("tsum_replicate: \n");
printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
        cdt_A, size_A, ncdt_A);
for (i=0; i<ncdt_A; i++){
  printf("cdt_A[%d] length = %d\n", i, cdt_A[i]->np);
}
printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
        cdt_B, size_B, ncdt_B);
for (i=0; i<ncdt_B; i++){
  printf("cdt_B[%d] length = %d\n", i, cdt_B[i]->np);
}
// tsum_replicate constructor: scan phys_mapped (two flags per physical
// processor-grid dimension, one for A and one for B) and set up replication
// communicators for dimensions along which exactly one operand is mapped
tsum_replicate(summation const * s,
               int const *       phys_mapped,
               int64_t           blk_sz_A,
               int64_t           blk_sz_B)
  : tsum(s) {
  ...
  // first pass: count the communicators needed
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
      ...
    }
    if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
      ...
    }
  }
  ...
  // second pass: record the corresponding dimension communicators
  for (i=0; i<nphys_dim; i++){
    if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
      ...
    }
    if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
      ...
    }
  }
  ...
}

// from tsum_replicate::run()
char * buf = this->A;
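The following standalone sketch illustrates the phys_mapped scan in the excerpt above (the flag values are made up; the counters mirror ncdt_A/ncdt_B). Entry 2*i+0 is the mapping flag of A along physical dimension i and entry 2*i+1 is B's; dimensions where exactly one operand is mapped are the ones that need replication communicators:

#include <cstdio>

int main(){
  const int nphys_dim = 4;
  //                               dim0  dim1  dim2  dim3
  int phys_mapped[2*nphys_dim] = { 1,1,  0,1,  1,0,  0,0 };
  int ncdt_A = 0, ncdt_B = 0;
  for (int i=0; i<nphys_dim; i++){
    // A unmapped, B mapped along dimension i
    if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1) ncdt_A++;
    // B unmapped, A mapped along dimension i
    if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1) ncdt_B++;
  }
  printf("ncdt_A = %d, ncdt_B = %d\n", ncdt_A, ncdt_B);  // prints 1 and 1 here
  return 0;
}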
// seq_tsr_sum::print(): report the local sequential kernel's parameters
printf("seq_tsr_sum:\n");
for (i=0; i<order_A; i++){
  printf("edge_len_A[%d]=%d\n", i, edge_len_A[i]);
}
for (i=0; i<order_B; i++){
  printf("edge_len_B[%d]=%d\n", i, edge_len_B[i]);
}
printf("is inner = %d\n", is_inner);
printf("map_pfx = %ld\n", map_pfx);
a Term is an abstract object representing an expression of tensors
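As a usage-level illustration of what such Term expressions look like, here is a minimal sketch assuming the standard ctf.hpp high-level interface (the matrix size and scaling factor are made up):

#include <mpi.h>
#include <ctf.hpp>
using namespace CTF;

int main(int argc, char ** argv){
  MPI_Init(&argc, &argv);
  {
    World dw(MPI_COMM_WORLD);
    int n = 4;
    Matrix<> A(n, n, NS, dw);   // dense n x n matrices of doubles
    Matrix<> B(n, n, NS, dw);
    A.fill_random(0.0, 1.0);
    // Each indexed tensor such as A["ji"] is a Term; assigning or
    // accumulating terms builds an expression that is then executed.
    B["ij"] += 0.5 * A["ji"];   // accumulate the scaled transpose of A into B
  }
  MPI_Finalize();
  return 0;
}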
bool is_custom
whether there is an elementwise custom function
void execute(CTF::Idx_Tensor output) const
evaluates the expression, which just scales by default
virtual void execute(CTF::Idx_Tensor output) const =0
evaluates the expression, which just scales by default
int * sym
symmetries among tensor dimensions
int * idx_A
indices of left operand
int sym_seq_sum_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B)
performs symmetric summation with unblocked reference kernel
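The signature above works on generic algstrct element types; the following simplified standalone sketch (plain doubles, a fixed index map A["ij"] -> B["i"]; not the actual CTF implementation) shows what the unblocked reference kernel computes, namely B[i] = beta*B[i] + alpha*sum_j A[i,j]:

#include <cstdio>
#include <vector>

void seq_sum_ref_like(double alpha, const std::vector<double>& A,
                      double beta,  std::vector<double>& B,
                      int ni, int nj){
  for (int i=0; i<ni; i++) B[i] *= beta;   // scale existing B
  for (int i=0; i<ni; i++)
    for (int j=0; j<nj; j++)
      B[i] += alpha * A[i*nj + j];         // dense loop accumulating A into B
}

int main(){
  int ni = 2, nj = 3;
  std::vector<double> A = {1,2,3, 4,5,6};
  std::vector<double> B = {10, 20};
  seq_sum_ref_like(0.5, A, 1.0, B, ni, nj);
  printf("B = [%g, %g]\n", B[0], B[1]);    // [13, 27.5]
  return 0;
}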
Unifun_Term operator()(Term const &A) const
evaluate B=f(A)
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm
void * alloc(int64_t len)
alloc abstraction
void run()
wraps user sequential function signature
virtual char const * addid() const
identity element for addition i.e. 0
int64_t mem_fp()
returns the number of bytes of buffer space needed
int order
number of tensor dimensions
int64_t mem_fp()
returns the number of bytes of buffer space needed
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
performs replication along a dimension, generating 2.5D algorithms
virtual Term * clone(std::map< tensor *, tensor * > *remap=NULL) const =0
derived classes must implement this copy function to retrieve a pointer
tsum_replicate(tsum *other)
char const * alpha
scaling of A
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
int64_t mem_fp()
returns the number of bytes of buffer space needed
algstrct * sr
algstrct on which tensor elements and operations are defined
tsum_virt(tsum *other)
iterates over the dense virtualization block grid and sums
virtual MPI_Op addmop() const
MPI addition operation for reductions.
univar_function const * func
int * idx_B
indices of output
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
virtual std::vector< char > get_uniq_inds() const =0
find list of unique indices that are involved in this term
seq_tsr_sum(tsum *other)
copies sum object
int sym_seq_sum_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, univar_function const *func)
performs symmetric summation with custom elementwise function
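Roughly, the custom variant differs from the reference kernel in that each contribution from A is passed through the user-supplied elementwise function before being accumulated into B. A simplified standalone sketch (plain doubles, matching index maps; not the actual CTF kernel):

#include <cstdio>
#include <vector>
#include <functional>

void seq_sum_cust_like(const std::vector<double>& A, std::vector<double>& B,
                       const std::function<double(double)>& f){
  for (size_t i=0; i<B.size(); i++)
    B[i] += f(A[i]);   // e.g. B["ij"] += f(A["ij"]) with matching index maps
}

int main(){
  std::vector<double> A = {1, 2, 3};
  std::vector<double> B = {0, 0, 0};
  seq_sum_cust_like(A, B, [](double a){ return a*a; });   // square each element
  printf("B = [%g, %g, %g]\n", B[0], B[1], B[2]);          // [1, 4, 9]
  return 0;
}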
int el_size
size of each element of algstrct in bytes
char const * beta
scaling of existing B
int cdealloc(void *ptr)
free abstraction
char * data
tensor data, either the data or the key-value pairs should exist at any given time ...
topology * topo
topology to which the tensor is mapped
int sym_seq_sum_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, int inr_stride)
performs symmetric summation with blocked daxpy
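In the blocked variant the innermost update operates on whole inner blocks of length inr_stride, i.e. a daxpy per block rather than a scalar multiply-add. A simplified standalone sketch (plain doubles; not the actual CTF kernel):

#include <cstdio>
#include <vector>

void blocked_axpy_like(double alpha, const double* A_blk,
                       double* B_blk, int inr_stride){
  for (int k=0; k<inr_stride; k++)
    B_blk[k] += alpha * A_blk[k];   // daxpy over one inner block
}

int main(){
  int inr_stride = 4;
  std::vector<double> A = {1, 2, 3, 4};
  std::vector<double> B = {10, 10, 10, 10};
  blocked_axpy_like(2.0, A.data(), B.data(), inr_stride);
  printf("B = [%g, %g, %g, %g]\n", B[0], B[1], B[2], B[3]);   // [12, 14, 16, 18]
  return 0;
}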
class for executing distributed summation of tensors
virtual char const * mulid() const
identity element for multiplication i.e. 1
virtual MPI_Datatype mdtype() const
MPI datatype.