3 #include "../shared/util.h"     6 #include "../interface/fun_term.h"     7 #include "../interface/idx_tensor.h"    31     if (buffer != NULL) 
cdealloc(buffer); 
    72     printf(
"tsum_virt:\n");
    73     printf(
"blk_sz_A = %ld, blk_sz_B = %ld\n",
    76       printf(
"virt_dim[%d] = %d\n", i, 
virt_dim[i]);
    86     int * idx_arr, * lda_A, * lda_B, * beta_arr;
    87     int * ilda_A, * ilda_B;
    88     int64_t i, off_A, off_B;
    89     int nb_A, nb_B, alloced, ret; 
    94       idx_arr = (
int*)this->
buffer;
   107   #define SET_LDA_X(__X)                              \   110     for (i=0; i<order_##__X; i++){                    \   111       lda_##__X[i] = nb_##__X;                        \   112       nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]]; \   114     memset(ilda_##__X, 0, num_dim*sizeof(int));       \   115     for (i=0; i<order_##__X; i++){                    \   116       ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];   \   124     beta_arr = (
int*)
alloc(
sizeof(
int)*nb_B);
   126     memset(idx_arr, 0, num_dim*
sizeof(
int));
   127     memset(beta_arr, 0, nb_B*
sizeof(
int));
   128     off_A = 0, off_B = 0;
   135       if (beta_arr[off_B]>0)
   144         off_A -= ilda_A[i]*idx_arr[i];
   145         off_B -= ilda_B[i]*idx_arr[i];
   149         off_A += ilda_A[i]*idx_arr[i];
   150         off_B += ilda_B[i]*idx_arr[i];
   151         if (idx_arr[i] != 0) 
break;
   153       if (i==num_dim) 
break;
   164     printf(
"tsum_replicate: \n");
   165     printf(
"cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
   166             cdt_A, size_A, ncdt_A);
   167     for (i=0; i<ncdt_A; i++){
   168       printf(
"cdt_A[%d] length = %d\n",i,cdt_A[i]->
np);
   170     printf(
"cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
   171             cdt_B, size_B, ncdt_B);
   172     for (i=0; i<ncdt_B; i++){
   173       printf(
"cdt_B[%d] length = %d\n",i,cdt_B[i]->
np);
   204                                  int const *                 phys_mapped,
   206                                  int64_t blk_sz_B) : 
tsum(s) {
   215     for (i=0; i<nphys_dim; i++){
   216       if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
   219       if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
   229     for (i=0; i<nphys_dim; i++){
   230       if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
   234       if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
   253     char * buf = this->
A;
   324     printf(
"seq_tsr_sum:\n");
   326       printf(
"edge_len_A[%d]=%d\n",i,
edge_len_A[i]);
   329       printf(
"edge_len_B[%d]=%d\n",i,
edge_len_B[i]);
   331     printf(
"is inner = %d\n", 
is_inner);
   333     printf(
"map_pfx = %ld\n", 
map_pfx);
 a term is an abstract object representing some expression of tensors 
bool is_custom
whether there is an elementwise custom function 
void execute(CTF::Idx_Tensor output) const 
evaluates the expression, which just scales by default 
virtual void execute(CTF::Idx_Tensor output) const  =0
evaluates the expression, which just scales by default 
int * sym
symmetries among tensor dimensions 
int * idx_A
indices of left operand 
int sym_seq_sum_ref(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B)
performs symmetric summation with unblocked reference kernel 
Unifun_Term operator()(Term const &A) const 
evaluate B=f(A) 
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm 
void * alloc(int64_t len)
alloc abstraction 
void run()
wraps user sequential function signature 
virtual char const * addid() const 
identity element for addition i.e. 0 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
int order
number of tensor dimensions 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
virtual void set(char *a, char const *b, int64_t n) const 
sets n elements of array a to value b 
performs replication along a dimension, generates 2.5D algs 
virtual Term * clone(std::map< tensor *, tensor * > *remap=NULL) const  =0
base classes must implement this copy function to retrieve pointer 
tsum_replicate(tsum *other)
char const * alpha
scaling of A 
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
algstrct * sr
algstrct on which tensor elements and operations are defined 
tsum_virt(tsum *other)
iterates over the dense virtualization block grid and sums 
virtual MPI_Op addmop() const 
MPI addition operation for reductions. 
univar_function const * func
int * idx_B
indices of output 
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm 
virtual std::vector< char > get_uniq_inds() const  =0
find list of unique indices that are involved in this term 
seq_tsr_sum(tsum *other)
copies sum object 
int sym_seq_sum_cust(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, univar_function const *func)
performs symmetric summation with custom elementwise function 
int el_size
size of each element of algstrct in bytes 
char const * beta
scaling of existing B 
int cdealloc(void *ptr)
free abstraction 
char * data
tensor data, either the data or the key-value pairs should exist at any given time ...
topology * topo
topology to which the tensor is mapped 
int sym_seq_sum_inr(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, int const *idx_map_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, int const *idx_map_B, int inr_stride)
performs symmetric summation with blocked daxpy 
class for execution distributed summation of tensors 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
virtual MPI_Datatype mdtype() const 
MPI datatype.