#include "../shared/util.h"
#include "../interface/fun_term.h"
#include "../interface/idx_tensor.h"
...
    printf("tspsum_virt:\n");
    printf("blk_sz_A = %ld, blk_sz_B = %ld\n",
           blk_sz_A, blk_sz_B);
    for (i=0; i<num_dim; i++){
      printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
    }
    int * idx_arr, * lda_A, * lda_B, * beta_arr;
    int * ilda_A, * ilda_B;
    int64_t i, off_A, off_B;
    int nb_A, nb_B, alloced, ret;
    ...
    if (this->buffer != NULL){
      alloced = 0;
      idx_arr = (int*)this->buffer;
    } else {
      alloced = 1;
      ...
    }
    ...
  #define SET_LDA_X(__X)                              \
  do {                                                \
    nb_##__X = 1;                                     \
    for (i=0; i<order_##__X; i++){                    \
      lda_##__X[i] = nb_##__X;                        \
      nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]]; \
    }                                                 \
    memset(ilda_##__X, 0, num_dim*sizeof(int));       \
    for (i=0; i<order_##__X; i++){                    \
      ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];   \
    }                                                 \
  } while (0)
    SET_LDA_X(A);
    SET_LDA_X(B);
  #undef SET_LDA_X
    ...
    beta_arr = (int*)alloc(sizeof(int)*nb_B);
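The SET_LDA_X macro computes, for operand X, a stride lda_X over X's own virtual dimensions and the projection ilda_X of those strides onto the shared index space, so one index tuple can address the blocks of both operands. Below is a minimal standalone sketch of the same computation; set_lda is a hypothetical helper, with virt_dim, idx_map, and num_dim mirroring the listing.

    #include <cstring>

    // Sketch of what SET_LDA_X computes: per-dimension block strides (lda)
    // for one operand, and their accumulation (ilda) indexed by the shared
    // index space.
    void set_lda(int order, int num_dim, int const * idx_map,
                 int const * virt_dim, int * lda, int * ilda, int & nb){
      nb = 1;
      for (int i=0; i<order; i++){
        lda[i] = nb;                    // stride of the i-th operand dimension
        nb    *= virt_dim[idx_map[i]];  // running count of virtual blocks
      }
      std::memset(ilda, 0, num_dim*sizeof(int));
      for (int i=0; i<order; i++){
        ilda[idx_map[i]] += lda[i];     // project strides onto shared indices
      }
    }

For idx_map = {0,1} and virt_dim = {2,3} this yields lda = {1,2}, nb = 6, and ilda = {1,2}; a shared index absent from the operand keeps stride 0, so incrementing it leaves that operand's block offset unchanged.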
    int64_t * sp_offsets_A = NULL;
    if (is_sparse_A){
      sp_offsets_A = (int64_t*)alloc(sizeof(int64_t)*nb_A);
      sp_offsets_A[0] = 0;
      for (int i=1; i<nb_A; i++){
        sp_offsets_A[i] = sp_offsets_A[i-1]+nnz_blk_A[i-1];
      }
    }
    int64_t * sp_offsets_B = NULL;
    int64_t * new_sp_szs_B = NULL;
    char ** buckets_B = NULL;
    if (is_sparse_B){
      sp_offsets_B = (int64_t*)alloc(sizeof(int64_t)*nb_B);
      ...
      buckets_B = (char**)alloc(sizeof(char*)*nb_B);
      for (int i=0; i<nb_B; i++){
        if (i == 0)
          sp_offsets_B[i] = 0;
        else
          sp_offsets_B[i] = sp_offsets_B[i-1]+nnz_blk_B[i-1];
        ...
      }
    }
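Both sparse operands get per-block starting offsets by prefix-summing the per-block nonzero counts, so the pairs of block i start at sp_offsets[i] within the packed pair array. The same prefix sum in isolation, as a hypothetical helper using std::vector in place of CTF's alloc:

    #include <cstdint>
    #include <vector>

    // Sketch: convert per-block nonzero counts into starting offsets.
    std::vector<int64_t> block_offsets(std::vector<int64_t> const & nnz_blk){
      if (nnz_blk.empty()) return {};
      std::vector<int64_t> off(nnz_blk.size());
      off[0] = 0;
      for (size_t i=1; i<nnz_blk.size(); i++)
        off[i] = off[i-1] + nnz_blk[i-1];  // counts {4,2,5} -> offsets {0,4,6}
      return off;
    }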
    memset(idx_arr, 0, num_dim*sizeof(int));
    memset(beta_arr, 0, nb_B*sizeof(int));
    off_A = 0, off_B = 0;
    for (;;){
      ...
      if (beta_arr[off_B]>0)
        rec_tsum->beta = sr_B->mulid();
      else
        rec_tsum->beta = this->beta;
      beta_arr[off_B]++;
      ...
      for (i=0; i<num_dim; i++){
        off_A -= ilda_A[i]*idx_arr[i];
        off_B -= ilda_B[i]*idx_arr[i];
        idx_arr[i]++;
        if (idx_arr[i] >= virt_dim[i])
          idx_arr[i] = 0;
        off_A += ilda_A[i]*idx_arr[i];
        off_B += ilda_B[i]*idx_arr[i];
        if (idx_arr[i] != 0) break;
      }
      if (i==num_dim) break;
    }
    if (is_sparse_B){
      ...
      for (int i=0; i<nb_B; i++){
        ... /* accumulate the new per-block sizes of B */
      }
      ...
      for (int i=0; i<nb_B; i++){
        ... /* copy the per-block buckets back into a contiguous new_B */
      }
      ...
    }
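The main loop above walks the virtual block grid like an odometer: to advance, it removes digit i's contribution from both linearized offsets, increments the digit (wrapping at virt_dim[i]), adds the new contribution back, and carries into the next digit only on wrap-around; once every digit wraps, all block pairs have been visited. A self-contained sketch of the traversal, with made-up strides (the names mirror the listing):

    #include <cstdio>

    // Sketch: odometer-style traversal of a virtual block grid, maintaining
    // linearized block offsets off_A/off_B incrementally via per-index
    // strides (ilda_A/ilda_B).
    int main(){
      const int num_dim = 2;
      int virt_dim[num_dim] = {2, 3};
      int ilda_A[num_dim]   = {1, 2};  // illustrative strides for A's blocks
      int ilda_B[num_dim]   = {1, 2};  // illustrative strides for B's blocks
      int idx_arr[num_dim]  = {0, 0};
      int off_A = 0, off_B = 0;
      for (;;){
        printf("block (%d,%d) -> off_A=%d off_B=%d\n",
               idx_arr[0], idx_arr[1], off_A, off_B);
        int i;
        for (i=0; i<num_dim; i++){
          off_A -= ilda_A[i]*idx_arr[i];          // remove old contribution
          off_B -= ilda_B[i]*idx_arr[i];
          idx_arr[i] = (idx_arr[i]+1) % virt_dim[i];
          off_A += ilda_A[i]*idx_arr[i];          // add new contribution
          off_B += ilda_B[i]*idx_arr[i];
          if (idx_arr[i] != 0) break;             // no carry: done advancing
        }
        if (i == num_dim) break;                  // every digit wrapped
      }
      return 0;
    }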
    printf("tspsum_replicate: \n");
    printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
            cdt_A, size_A, ncdt_A);
    for (i=0; i<ncdt_A; i++){
      printf("cdt_A[%d] length = %d\n",i,cdt_A[i]->np);
    }
    printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
            cdt_B, size_B, ncdt_B);
    for (i=0; i<ncdt_B; i++){
      printf("cdt_B[%d] length = %d\n",i,cdt_B[i]->np);
    }
                                     int const *       phys_mapped,
    ...
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
        ncdt_A++;
      }
      if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
        ncdt_B++;
      }
    }
    ...
    for (i=0; i<nphys_dim; i++){
      if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
        ... /* collect the communicator along which A is replicated */
      }
      if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
        ... /* collect the communicator along which B is replicated */
      }
    }
    ...
    char * buf = this->A;
    ...
    if (need_free) MPI_Type_free(&md);
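tspsum_replicate broadcasts the input operand along each processor-grid communicator over which it is replicated and, when the replicated operand is the output, reduces the partial results back; this replicate-compute-reduce pattern is what generates the 2.5D algorithms. A minimal sketch of the pattern in plain MPI rather than CTF's CommData wrappers; replicated_sum and rep_comm are illustrative:

    #include <cstdint>
    #include <mpi.h>

    // Sketch: replicate A across a communicator by broadcast, then sum the
    // partial contributions to B with an in-place allreduce.
    void replicated_sum(double * A, int64_t size_A,
                        double * B, int64_t size_B,
                        MPI_Comm rep_comm){
      // Every rank in rep_comm receives the root's copy of A.
      MPI_Bcast(A, (int)size_A, MPI_DOUBLE, 0, rep_comm);
      // ... each rank computes its partial contribution into B ...
      // Partial B results are summed across the replication group.
      MPI_Allreduce(MPI_IN_PLACE, B, (int)size_B, MPI_DOUBLE, MPI_SUM,
                    rep_comm);
    }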
      for (int i=0; i<s->B->order; i++){
        ...
        for (int j=0; j<s->A->order; j++){
          ...
        }
      }
    ...
    printf("seq_tsr_spsum:\n");
    for (i=0; i<order_A; i++){
      printf("edge_len_A[%d]=%d\n",i,edge_len_A[i]);
    }
    for (i=0; i<order_B; i++){
      printf("edge_len_B[%d]=%d\n",i,edge_len_B[i]);
    }
    printf("is inner = %d\n", is_inner);
    printf("map_pfx = %ld\n", map_pfx);
    int map_idx_rev[s->B->order];
    ...
    int64_t lda_B[s->B->order];
    ...
    for (int o=1; o<s->B->order; o++){
      ...
        lda_B[o] = lda_B[o-1]*s->B->lens[o];
      ...
    }
    ...
    for (int oB=0; oB<s->B->order; oB++){
      ...
      for (int oA=0; oA<s->A->order; oA++){
        ...
      }
      ...
        for (int ooB=0; ooB<oB; ooB++){
          ...
        }
      ...
    }
    printf("tspsum_map:\n");
    ...
      for (int midx=0; midx<nmap_idx; midx++){
        ...
      }
    ...
    for (int midx=0; midx<nmap_idx; midx++){
      ...
    }
    ...
  #ifdef USE_OMP
    #pragma omp parallel for
  #endif
    for (int64_t i=0; i<nnz_A; i++){
      for (int64_t r=0; r<tot_rep; r++){
        this->sr_A->copy(pi_new[i*tot_rep+r].ptr, pi[i].ptr);
      }
    }
  #ifdef USE_OMP
    #pragma omp parallel for
  #endif
    for (int64_t i=0; i<nnz_A; i++){
      int64_t phase = 1;
      for (int midx=0; midx<nmap_idx; midx++){
        int64_t stride=phase;
        phase *= map_idx_len[midx];
        for (int64_t r=0; r<tot_rep/phase; r++){
          for (int64_t m=0; m<phase/stride; m++){
            for (int64_t s=0; s<stride; s++){
              ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
            }
          }
        }
      }
    }
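tspsum_map duplicates every pair of A tot_rep times and then shifts the key of each copy by a multiple of map_idx_lda[midx], which places the copies along the index being newly mapped. The key arithmetic in isolation for a single mapped index; Pair, map_len, and map_lda are illustrative (CTF stores a pair as a 64-bit key followed by the element):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Sketch: replicate pairs along one mapped index of length map_len whose
    // stride in the output key space is map_lda.
    struct Pair { int64_t k; double d; };

    std::vector<Pair> map_replicate(std::vector<Pair> const & in,
                                    int64_t map_len, int64_t map_lda){
      std::vector<Pair> out;
      out.reserve(in.size()*map_len);
      for (Pair const & p : in)
        for (int64_t m=0; m<map_len; m++)
          out.push_back({p.k + m*map_lda, p.d}); // shift key along new index
      return out;
    }

    int main(){
      std::vector<Pair> in = {{0, 1.0}, {3, 2.0}};
      for (Pair const & p : map_replicate(in, 2, 4))
        printf("key=%lld val=%g\n", (long long)p.k, p.d); // keys 0,4,3,7
      return 0;
    }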
    memcpy(p, o->p, sizeof(int)*order);
    ...
    int const * idx_X, * idx_Y;
    ...
    memcpy(lens_new, lens, sizeof(int)*this->order);
    ...
      for (int i=0; i<this->order; i++){
        p[i] = -1;
        for (int j=0; j<Y->order; j++){
          if (idx_X[i] == idx_Y[j]){
            ...
          }
        }
      }
      ...
      int lesser[this->order];
      for (int i=0; i<this->order; i++){
        lesser[i] = 0;
        if (p[i] != -1){
          for (int j=0; j<this->order; j++){
            if (i!=j && p[j] != -1 && p[j] < p[i]) lesser[i]++;
          }
        }
      }
      ...
      for (int i=0; i<this->order; i++){
        ...
      }
      ...
      for (int i=0; i<this->order; i++){
        bool mapped = false;
        for (int j=0; j<Y->order; j++){
          if (idx_X[i] == idx_Y[j]){
            mapped = true;
          }
        }
        if (mapped) nmap_idx++;
      }
      ...
      for (int i=0; i<this->order; i++){
        ...
        for (int j=0; j<Y->order; j++){
          if (idx_X[i] == idx_Y[j]){
            ...
          }
        }
        ...
        if (p[i] == nm) nm++;
      }
    ...
    for (int i=0; i<this->order; i++){
      if (p[i] != i) skip = false;
    }
    ...
    printf("tspsum_permute:\n");
    if (A_or_B) printf("permuting A\n");
    else        printf("permuting B\n");
    for (int i=0; i<order; i++){
      printf("p[%d] = %d ",i,p[i]);
    }
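The lesser[] pass compacts a partial target assignment into a dense permutation: each assigned entry's final position is the number of other assigned entries that must precede it, so relative order is preserved while gaps (entries with p[i] == -1) drop out. A compact sketch of that rank computation; compact_perm is a hypothetical helper:

    #include <vector>

    // Sketch: compact a partial assignment p (-1 marks unassigned entries)
    // into dense ranks that preserve the relative order of assigned entries.
    std::vector<int> compact_perm(std::vector<int> const & p){
      int n = (int)p.size();
      std::vector<int> rank(n, 0);
      for (int i=0; i<n; i++){
        if (p[i] == -1){ rank[i] = -1; continue; }
        for (int j=0; j<n; j++)
          if (i != j && p[j] != -1 && p[j] < p[i])
            rank[i]++;              // count assigned entries that come first
      }
      return rank;                  // e.g. p = {5,-1,2} -> rank = {1,-1,0}
    }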
      int64_t new_lda_A[order];
      memset(new_lda_A, 0, order*sizeof(int64_t));
      ...
      for (int i=0; i<order; i++){
        for (int j=0; j<order; j++){
          ...
        }
      }
      ...
      int64_t old_lda_B[order];
      ...
      for (int i=0; i<order; i++){
        ...
      }
      ...
      int64_t new_lda_B[order];
      std::fill(new_lda_B, new_lda_B+order, 0);
      for (int i=0; i<order; i++){
        new_lda_B[i] = old_lda_B[p[i]];
      }
      ...
      for (int i=0; i<order; i++){
        ...
      }
      ...
      int64_t new_lda_B[order];
      int64_t old_lda_B[order];
      ...
      for (int i=0; i<order; i++){
        ...
      }
      ...
      for (int i=0; i<order; i++){
        new_lda_B[i] = old_lda_B[inv_p[i]];
      }
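run() never materializes a dense tensor to permute it; instead it decodes each key's mixed-radix digits using the old strides and re-encodes them with permuted strides, which is the role of the new_lda_B[i] = old_lda_B[p[i]] lines above. A self-contained sketch of that key rewrite, assuming p[i] gives the new position of old dimension i; permute_key is illustrative:

    #include <cstdint>
    #include <vector>

    // Sketch: move dimension i of a tensor to position p[i] by decoding a
    // linearized key into digits and re-encoding with the permuted strides.
    int64_t permute_key(int64_t key, std::vector<int> const & lens,
                        std::vector<int> const & p){
      int order = (int)lens.size();
      std::vector<int64_t> old_lda(order), digit(order);
      old_lda[0] = 1;
      for (int i=1; i<order; i++) old_lda[i] = old_lda[i-1]*lens[i-1];
      for (int i=order-1; i>=0; i--){        // decode mixed-radix digits
        digit[i] = key / old_lda[i];
        key     %= old_lda[i];
      }
      std::vector<int> new_lens(order);
      for (int i=0; i<order; i++) new_lens[p[i]] = lens[i];
      std::vector<int64_t> new_lda(order);
      new_lda[0] = 1;
      for (int i=1; i<order; i++) new_lda[i] = new_lda[i-1]*new_lens[i-1];
      int64_t new_key = 0;
      for (int i=0; i<order; i++) new_key += digit[i]*new_lda[p[i]];
      return new_key;
    }

For lens = {2,3} and p = {1,0}, key 3 (element (1,1)) maps to key 4, its position in the transposed 3-by-2 layout.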
  void inv_idx(int         order_A,
               int const * idx_A,
               int         order_B,
               int const * idx_B,
               int *       order_tot,
               int **      idx_arr){
    int i, dim_max;
    dim_max = -1;
    for (i=0; i<order_A; i++){
      if (idx_A[i] > dim_max) dim_max = idx_A[i];
    }
    for (i=0; i<order_B; i++){
      if (idx_B[i] > dim_max) dim_max = idx_B[i];
    }
    dim_max++;
    *order_tot = dim_max;
    *idx_arr = (int*)alloc(sizeof(int)*2*dim_max);
    std::fill((*idx_arr), (*idx_arr)+2*dim_max, -1);
    for (i=0; i<order_A; i++){
      (*idx_arr)[2*idx_A[i]] = i;
    }
    for (i=0; i<order_B; i++){
      (*idx_arr)[2*idx_B[i]+1] = i;
    }
  }
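inv_idx builds the inverse of the two index maps: slot 2*k of the result holds the position of shared index k in A and slot 2*k+1 its position in B, with -1 where the index does not appear. For example, idx_A = {0,2} and idx_B = {1,2} give order_tot = 3 and idx_arr = {0,-1, -1,0, 1,1}. The same logic with std::vector in place of alloc:

    #include <cstdio>
    #include <vector>

    // Sketch of inv_idx: for each shared index k, record its position in A
    // at slot 2*k and its position in B at slot 2*k+1 (-1 if absent).
    void inv_idx_sketch(std::vector<int> const & idx_A,
                        std::vector<int> const & idx_B,
                        int & order_tot, std::vector<int> & idx_arr){
      int dim_max = -1;
      for (int x : idx_A) if (x > dim_max) dim_max = x;
      for (int x : idx_B) if (x > dim_max) dim_max = x;
      dim_max++;
      order_tot = dim_max;
      idx_arr.assign(2*dim_max, -1);
      for (int i=0; i<(int)idx_A.size(); i++) idx_arr[2*idx_A[i]]   = i;
      for (int i=0; i<(int)idx_B.size(); i++) idx_arr[2*idx_B[i]+1] = i;
    }

    int main(){
      int order_tot; std::vector<int> idx_arr;
      inv_idx_sketch({0,2}, {1,2}, order_tot, idx_arr);
      for (int k=0; k<order_tot; k++)
        printf("idx %d: pos in A = %d, pos in B = %d\n",
               k, idx_arr[2*k], idx_arr[2*k+1]);
      return 0;
    }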
    printf("tspsum_pin_keys:\n");
    if (A_or_B) printf("transforming global keys of A to local keys\n");
    else        printf("transforming global keys of B to local keys\n");
    ...
    return 3*order*sizeof(int);
    ...
    for (int i=0; i<order; i++){
      ...
    }
    ...
      depin(sr_A, order, lens, divisor, nvirt_A, virt_dim, phys_rank,
            A, nnz_A, (int64_t*)nnz_blk_A, A, false);
    ...
      char * old_B = new_B;
      depin(sr_B, order, lens, divisor, nvirt_B, virt_dim, phys_rank,
            new_B, new_nnz_B, nnz_blk_B, new_B, true);
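pin rewrites each global key as a local key dimension by dimension: the global digit is reduced by the processor's rank along that dimension and divided by the physical phase, then re-encoded against the smaller local lengths; depin, called above, inverts the transformation and restores global keys. A simplified single-key sketch under an assumed digit layout; pin_key is illustrative, whereas CTF's pin/depin operate on whole pair arrays:

    #include <cstdint>
    #include <vector>

    // Sketch: "pin" a global key to a local key, one mixed-radix digit per
    // dimension. Global digit g of dimension d, with phase divisor[d] and
    // processor rank phys_rank[d] on that dimension, becomes the local digit
    // (g - phys_rank[d]) / divisor[d].
    int64_t pin_key(int64_t key, std::vector<int> const & lens,
                    std::vector<int> const & divisor,
                    std::vector<int> const & phys_rank){
      int order = (int)lens.size();
      int64_t new_key = 0, new_lda = 1;
      for (int d=0; d<order; d++){
        int64_t g = key % lens[d];                     // global digit
        key /= lens[d];
        int64_t loc = (g - phys_rank[d]) / divisor[d]; // local digit
        new_key += loc * new_lda;
        new_lda *= (lens[d] + divisor[d] - 1) / divisor[d]; // local length
      }
      return new_key;
    }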
 
void dnA_spB_seq_sum(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, char const *beta, char const *B, int64_t size_B, char *&new_B, int64_t &new_size_B, algstrct const *sr_B, univar_function const *func)
performs summation between a dense tensor A and a sparse tensor B; assumes B contains key-value pairs sorted by key...
int calc_phys_rank(topology const *topo) const 
compute the physical rank of a mapping 
bool is_custom
whether there is an elementwise custom function 
int64_t * nnz_blk
nonzero elements in each block owned locally 
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype &dt)
gives a datatype for an arbitrary datum_size; errors if it exceeds 32 bits 
int * sym
symmetries among tensor dimensions 
int * idx_A
indices of left operand 
virtual int pair_size() const 
gets the pair size: el_size plus the key size 
int calc_phase() const 
compute the phase of a mapping 
virtual char * pair_alloc(int64_t n) const 
allocate space for n (int64_t,dtype) pairs, necessary for object types 
tspsum_replicate(tspsum *other)
int * pad_edge_len
padded tensor edge lengths 
tspsum_virt(tspsum *other)
iterates over the dense virtualization block grid and sums 
int calc_phys_phase() const 
compute the physical phase of a mapping 
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int *order_tot, int **idx_arr)
invert index map 
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm 
void run()
wraps user sequential function signature 
void spA_dnB_seq_sum(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, univar_function const *func)
performs summation between a sparse tensor A and a dense tensor B; assumes A contains key-value pairs sorted by key...
virtual void copy(char *a, char const *b) const 
copies element b to element a 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
void pin(int64_t n, int order, int const *lens, int const *divisor, PairIterator pi_new)
pins keys of n pairs 
void * alloc(int64_t len)
alloc abstraction 
void permute(int64_t n, int order, int const *old_lens, int64_t const *new_lda, PairIterator wA)
permutes keys of n pairs 
virtual void set_nnz_blk_A(int64_t const *nnbA)
virtual char const * addid() const 
identity element for addition i.e. 0 
MPI_Datatype pmdtype
MPI datatype for pairs. 
void sort(int64_t n)
sorts set of pairs using std::sort 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
bool is_sparse
whether only the non-zero elements of the tensor are stored 
int order
number of tensor dimensions 
virtual void set(char *a, char const *b, int64_t n) const 
sets n elements of array a to value b 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
int * lens
unpadded tensor edge lengths 
void depin(algstrct const *sr, int order, int const *lens, int const *divisor, int nvirt, int const *virt_dim, int const *phys_rank, char *X, int64_t &new_nnz_B, int64_t *nnz_blk, char *&new_B, bool check_padding)
depins keys of n pairs 
seq_tsr_spsum(tspsum *other)
copies sum object 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction 
virtual void pair_dealloc(char *ptr) const 
deallocate given pointer containing contiguous array of pairs 
virtual MPI_Op addmop() const 
MPI addition operation for reductions. 
mapping * edge_map
mappings of each tensor dimension onto topology dimensions 
int * idx_B
indices of output 
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm 
int64_t mem_fp()
returns the number of bytes of buffer space needed 
univar_function const * func
int64_t calc_nvirt() const 
calculate virtualization factor of tensor; returns the virtualization factor 
tspsum_pin_keys(tspsum *other)
int64_t mem_fp()
returns the number of bytes of buffer space needed 
int64_t nnz_loc
number of local nonzero elements 
int el_size
size of each element of algstrct in bytes 
virtual void copy_pairs(char *a, char const *b, int64_t n) const 
copies n pairs from array b to array a 
int cdealloc(void *ptr)
free abstraction 
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
internal distributed tensor class 
tspsum_map(tspsum *other)
tspsum_permute(tspsum *other)
topology * topo
topology to which the tensor is mapped 
performs replication along a dimension, generates 2.5D algorithms 
class for executing distributed summation of tensors 
virtual char const * mulid() const 
identity element for multiplication i.e. 1 
void spA_spB_seq_sum(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, char const *beta, char *B, int64_t size_B, char *&new_B, int64_t &new_size_B, algstrct const *sr_B, univar_function const *func, int64_t map_pfx)
performs summation between two sparse tensors; assumes A and B contain key-value pairs sorted by key...
virtual MPI_Datatype mdtype() const 
MPI datatype.