#include "../shared/util.h"
#include "../interface/fun_term.h"
#include "../interface/idx_tensor.h"

printf("tspsum_virt:\n");
printf("blk_sz_A = %ld, blk_sz_B = %ld\n", blk_sz_A, blk_sz_B);
printf("virt_dim[%d] = %d\n", i, virt_dim[i]);
int * idx_arr, * lda_A, * lda_B, * beta_arr;
int * ilda_A, * ilda_B;
int64_t i, off_A, off_B;
int nb_A, nb_B, alloced, ret;

if (this->buffer != NULL){
  idx_arr = (int*)this->buffer;
}
#define SET_LDA_X(__X)                                  \
  for (i=0; i<order_##__X; i++){                        \
    lda_##__X[i] = nb_##__X;                            \
    nb_##__X = nb_##__X*virt_dim[idx_map_##__X[i]];     \
  }                                                     \
  memset(ilda_##__X, 0, num_dim*sizeof(int));           \
  for (i=0; i<order_##__X; i++){                        \
    ilda_##__X[idx_map_##__X[i]] += lda_##__X[i];       \
  }

beta_arr = (int*)alloc(sizeof(int)*nb_B);
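The SET_LDA_X macro builds, for each operand, the stride (lda) of every tensor dimension inside that operand's virtual block grid, and accumulates those strides into per-global-index strides (ilda) used to step linear block offsets. A minimal standalone sketch of the same computation; the parameter names mirror the surrounding code but the function itself is illustrative:

#include <cstring>

// Sketch: lda[i] is the stride of operand dimension i in the operand's own
// block grid; ilda[g] is the combined stride of global index g, so stepping
// global index g by one moves the operand's linear block offset by ilda[g].
void set_lda(int order, int num_dim, int const * idx_map,
             int const * virt_dim, int * lda, int * ilda, int & nb){
  nb = 1;
  for (int i=0; i<order; i++){
    lda[i] = nb;                    // stride of dim i = product of preceding dims
    nb    *= virt_dim[idx_map[i]];  // nb ends up as the total number of blocks
  }
  memset(ilda, 0, num_dim*sizeof(int));
  for (int i=0; i<order; i++){
    ilda[idx_map[i]] += lda[i];     // indices mapped twice accumulate both strides
  }
}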
int64_t * sp_offsets_A = NULL;
sp_offsets_A = (int64_t*)alloc(sizeof(int64_t)*nb_A);
sp_offsets_A[0] = 0;
for (int i=1; i<nb_A; i++){
  sp_offsets_A[i] = sp_offsets_A[i-1]+nnz_blk_A[i-1];
}
int64_t * sp_offsets_B = NULL;
int64_t * new_sp_szs_B = NULL;
char ** buckets_B = NULL;
sp_offsets_B = (int64_t*)alloc(sizeof(int64_t)*nb_B);
buckets_B = (char**)alloc(sizeof(char*)*nb_B);
for (int i=0; i<nb_B; i++){
  if (i==0) sp_offsets_B[0] = 0;
  else      sp_offsets_B[i] = sp_offsets_B[i-1]+nnz_blk_B[i-1];
}
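These prefix sums turn the per-block nonzero counts (nnz_blk) into the starting offset of each virtual block's pairs within the flat sparse buffer, from which a bucket pointer per block follows. A sketch under the assumption that pairs are stored contiguously block by block (make_buckets and its parameter names are illustrative):

#include <cstdint>

// Sketch: derive per-block starting offsets and bucket pointers from
// per-block nonzero counts, assuming pairs are laid out block-by-block.
void make_buckets(int nb, int64_t const * nnz_blk, char * base,
                  int64_t pair_size, int64_t * sp_offsets, char ** buckets){
  for (int i=0; i<nb; i++){
    sp_offsets[i] = (i == 0) ? 0 : sp_offsets[i-1] + nnz_blk[i-1];
    buckets[i]    = base + sp_offsets[i]*pair_size;  // first pair of block i
  }
}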
memset(idx_arr, 0, num_dim*sizeof(int));
memset(beta_arr, 0, nb_B*sizeof(int));
off_A = 0, off_B = 0;

if (beta_arr[off_B]>0)

for (i=0; i<num_dim; i++){
  off_A -= ilda_A[i]*idx_arr[i];
  off_B -= ilda_B[i]*idx_arr[i];
  idx_arr[i]++;
  if (idx_arr[i] >= virt_dim[i])
    idx_arr[i] = 0;
  off_A += ilda_A[i]*idx_arr[i];
  off_B += ilda_B[i]*idx_arr[i];
  if (idx_arr[i] != 0) break;
}
if (i==num_dim) break;
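This is an odometer traversal of the virtual block grid: rather than recomputing each block's linear offset from scratch, every carry step subtracts the old contribution ilda[i]*idx_arr[i] and adds the new one, so advancing to the next block costs O(1) amortized. A self-contained sketch of the pattern (the printf stands in for the per-block summation):

#include <cstdio>
#include <vector>

// Sketch: odometer iteration over a grid of virt_dim[0] x ... x virt_dim[n-1]
// blocks, maintaining a linear offset incrementally via per-index strides.
void odometer(int num_dim, int const * virt_dim, int const * ilda){
  std::vector<int> idx(num_dim, 0);
  long off = 0;
  for (;;){
    printf("visiting block at offset %ld\n", off);   // stand-in for the block sum
    int i;
    for (i=0; i<num_dim; i++){                       // propagate the carry
      off -= (long)ilda[i]*idx[i];                   // remove old contribution
      idx[i]++;
      if (idx[i] >= virt_dim[i]) idx[i] = 0;         // wrap and carry to next digit
      off += (long)ilda[i]*idx[i];                   // add new contribution
      if (idx[i] != 0) break;                        // no carry: stop
    }
    if (i == num_dim) break;                         // wrapped every digit: done
  }
}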
for (int i=0; i<nb_B; i++){

for (int i=0; i<nb_B; i++){
printf("tspsum_replicate: \n");
printf("cdt_A = %p, size_A = %ld, ncdt_A = %d\n",
       cdt_A, size_A, ncdt_A);
for (i=0; i<ncdt_A; i++){
  printf("cdt_A[%d] length = %d\n", i, cdt_A[i]->np);
}
printf("cdt_B = %p, size_B = %ld, ncdt_B = %d\n",
       cdt_B, size_B, ncdt_B);
for (i=0; i<ncdt_B; i++){
  printf("cdt_B[%d] length = %d\n", i, cdt_B[i]->np);
}
int const * phys_mapped,

for (i=0; i<nphys_dim; i++){
  if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
  if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){

for (i=0; i<nphys_dim; i++){
  if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1){
  if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1){
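phys_mapped stores two flags per physical processor-grid dimension i: slot 2*i+0 says whether A has a tensor dimension mapped onto it, slot 2*i+1 the same for B. A grid dimension on which exactly one operand is mapped is where replication happens (broadcast of the un-mapped operand, or reduction of partial sums), which is how the 2.5D variants arise. A sketch of the classification; the output array names are assumptions for illustration:

// Sketch: classify physical grid dimensions for replication.
void find_repl_dims(int nphys_dim, int const * phys_mapped,
                    int * bcast_dims_A, int * red_dims_B,
                    int & ncdt_A, int & ncdt_B){
  ncdt_A = 0; ncdt_B = 0;
  for (int i=0; i<nphys_dim; i++){
    if (phys_mapped[2*i+0] == 0 && phys_mapped[2*i+1] == 1)
      bcast_dims_A[ncdt_A++] = i;  // only B mapped: A's data is replicated here
    if (phys_mapped[2*i+1] == 0 && phys_mapped[2*i+0] == 1)
      red_dims_B[ncdt_B++] = i;    // only A mapped: B's partial sums reduced here
  }
}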
char * buf = this->A;

if (need_free) MPI_Type_free(&md);
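The need_free flag pairs with get_mpi_dt (documented below), which hands back an MPI datatype matching the datum size and reports whether the caller must free it. A minimal sketch of that contract, assuming (as the need_free usage above suggests) that the returned flag means "caller must MPI_Type_free"; the body is an illustration, not the library's implementation:

#include <mpi.h>
#include <climits>
#include <cassert>

// Sketch of the get_mpi_dt contract: produce a datatype of datum_size bytes;
// errors out if the transfer would exceed the 32-bit MPI count.
bool get_mpi_dt_sketch(int64_t count, int64_t datum_size, MPI_Datatype & dt){
  assert(count <= INT_MAX);
  if (datum_size == sizeof(char))  { dt = MPI_CHAR;   return false; }
  if (datum_size == sizeof(double)){ dt = MPI_DOUBLE; return false; }
  MPI_Type_contiguous((int)datum_size, MPI_CHAR, &dt);  // arbitrary-size datum
  MPI_Type_commit(&dt);
  return true;  // caller frees with MPI_Type_free(&dt)
}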
for (int i=0; i<s->B->order; i++){
  for (int j=0; j<s->A->order; j++){
printf("seq_tsr_spsum:\n");
printf("edge_len_A[%d]=%d\n", i, edge_len_A[i]);
printf("edge_len_B[%d]=%d\n", i, edge_len_B[i]);
printf("is inner = %d\n", is_inner);
printf("map_pfx = %ld\n", map_pfx);
int map_idx_rev[s->B->order];

int64_t lda_B[s->B->order];
lda_B[0] = 1;
for (int o=1; o<s->B->order; o++){
  lda_B[o] = lda_B[o-1]*s->B->lens[o];
}
for (int oB=0; oB<s->B->order; oB++){
  for (int oA=0; oA<s->A->order; oA++){

for (int ooB=0; ooB<oB; ooB++){
printf("tspsum_map:\n");

for (int midx=0; midx<nmap_idx; midx++){

for (int midx=0; midx<nmap_idx; midx++){
#pragma omp parallel for
for (int64_t i=0; i<nnz_A; i++){
  for (int64_t r=0; r<tot_rep; r++){
    this->sr_A->copy(pi_new[i*tot_rep+r].ptr, pi[i].ptr);
  }
}

#pragma omp parallel for
for (int64_t i=0; i<nnz_A; i++){
  int64_t phase = 1;
  for (int midx=0; midx<nmap_idx; midx++){
    int64_t stride=phase;
    phase *= map_idx_len[midx];
    for (int64_t r=0; r<tot_rep/phase; r++){
      for (int64_t m=0; m<map_idx_len[midx]; m++){
        for (int64_t s=0; s<stride; s++){
          ((int64_t*)(pi_new[i*tot_rep + r*phase + m*stride + s].ptr))[0] += m*map_idx_lda[midx];
        }
      }
    }
  }
}
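Each pair of A is first copied tot_rep times; the second loop then shifts every replica's key so that the copies enumerate all positions along the mapped indices (indices of B with no counterpart in A). The arithmetic is mixed-radix: digit m of radix map_idx_len[midx] contributes m*map_idx_lda[midx] to the key. For instance, with one mapped index of length 3 and stride lda, key k becomes the three keys k, k+lda, k+2*lda. A compact sketch of the same enumeration for a single pair (expand_key is a hypothetical helper):

#include <cstdint>
#include <vector>

// Sketch: expand one key into prod(len) keys, one per combination of the
// mapped indices; digit m of radix len[midx] contributes m*lda[midx].
std::vector<int64_t> expand_key(int64_t key, int nmap_idx,
                                int const * len, int64_t const * lda){
  std::vector<int64_t> keys(1, key);
  for (int midx=0; midx<nmap_idx; midx++){
    std::vector<int64_t> next;
    for (int64_t k : keys)
      for (int m=0; m<len[midx]; m++)
        next.push_back(k + m*lda[midx]);   // place digit m along this index
    keys.swap(next);
  }
  return keys;
}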
memcpy(p, o->p, sizeof(int)*order);
int const * idx_X, * idx_Y;

memcpy(lens_new, lens, sizeof(int)*this->order);

for (int i=0; i<this->order; i++){
  for (int j=0; j<Y->order; j++){
    if (idx_X[i] == idx_Y[j]){
int lesser[this->order];
for (int i=0; i<this->order; i++){
  lesser[i] = 0;
  for (int j=0; j<this->order; j++){
    if (i!=j && p[j] != -1 && p[j] < p[i]) lesser[i]++;
  }
}
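The lesser array implements a rank computation: each assigned target position p[i] is replaced by the number of assigned positions smaller than it, compacting a possibly gappy, -1-marked assignment into a dense permutation. A standalone sketch of the idea (compact_perm is a hypothetical name; distinct assigned values are assumed):

#include <vector>

// Sketch: compact a partial assignment p (-1 = unassigned) into a dense
// permutation by ranking each assigned entry against the others.
std::vector<int> compact_perm(std::vector<int> const & p){
  int n = (int)p.size();
  std::vector<int> lesser(n, 0);
  for (int i=0; i<n; i++)
    for (int j=0; j<n; j++)
      if (i!=j && p[j] != -1 && p[j] < p[i]) lesser[i]++;
  std::vector<int> q(n);
  for (int i=0; i<n; i++) q[i] = (p[i] == -1) ? -1 : lesser[i];
  return q;
}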
for (int i=0; i<this->order; i++){

for (int i=0; i<this->order; i++){
  for (int j=0; j<Y->order; j++){
    if (idx_X[i] == idx_Y[j]){

if (mapped) nmap_idx++;

for (int i=0; i<this->order; i++){
  for (int j=0; j<Y->order; j++){
    if (idx_X[i] == idx_Y[j]){

if (p[i] == nm) nm++;

for (int i=0; i<this->order; i++){
  if (p[i] != i) skip = false;
printf("tspsum_permute:\n");
if (A_or_B) printf("permuting A\n");
else printf("permuting B\n");
for (int i=0; i<order; i++){
  printf("p[%d] = %d ", i, p[i]);
}
int64_t new_lda_A[order];
memset(new_lda_A, 0, order*sizeof(int64_t));
for (int i=0; i<order; i++){
  for (int j=0; j<order; j++){
int64_t old_lda_B[order];
for (int i=0; i<order; i++){

int64_t new_lda_B[order];
std::fill(new_lda_B, new_lda_B+order, 0);
for (int i=0; i<order; i++){
  new_lda_B[i] = old_lda_B[p[i]];
for (int i=0; i<order; i++){

int64_t new_lda_B[order];
int64_t old_lda_B[order];
for (int i=0; i<order; i++){

for (int i=0; i<order; i++){
  new_lda_B[i] = old_lda_B[inv_p[i]];
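These stride tables feed the permute() helper documented below: a pair's linearized key is decomposed coordinate by coordinate using the old edge lengths and recombined with the permuted strides. A minimal sketch of that per-key transformation (permute_key is a hypothetical name):

#include <cstdint>

// Sketch: remap one linearized key under a dimension permutation by peeling
// off each old coordinate and re-placing it at its new stride.
int64_t permute_key(int64_t key, int order,
                    int const * old_lens, int64_t const * new_lda){
  int64_t new_key = 0;
  for (int d=0; d<order; d++){
    new_key += (key % old_lens[d]) * new_lda[d];  // coordinate d at its new stride
    key /= old_lens[d];
  }
  return new_key;
}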
for (i=0; i<order_A; i++){
  if (idx_A[i] > dim_max) dim_max = idx_A[i];
}
for (i=0; i<order_B; i++){
  if (idx_B[i] > dim_max) dim_max = idx_B[i];
}
dim_max++;
*order_tot = dim_max;
*idx_arr = (int*)alloc(sizeof(int)*2*dim_max);
std::fill((*idx_arr), (*idx_arr)+2*dim_max, -1);
for (i=0; i<order_A; i++){
  (*idx_arr)[2*idx_A[i]] = i;
}
for (i=0; i<order_B; i++){
  (*idx_arr)[2*idx_B[i]+1] = i;
}
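inv_idx inverts the operand-to-global index maps: for every global index g, idx_arr[2*g] holds its position among A's dimensions (or -1 if absent) and idx_arr[2*g+1] its position among B's. A worked example with hypothetical maps for a sum of the shape B["jk"] += A["ij"]:

// idx_A = {0,1} (i,j), idx_B = {1,2} (j,k)
// max index is 2, so order_tot = 3 and idx_arr has 6 entries:
// idx_arr = { 0, -1,    // i: dim 0 of A, not in B
//             1,  0,    // j: dim 1 of A, dim 0 of B
//            -1,  1 };  // k: not in A, dim 1 of B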
printf("tspsum_pin_keys:\n");
if (A_or_B) printf("transforming global keys of A to local keys\n");
else printf("transforming global keys of B to local keys\n");

return 3*order*sizeof(int);
for (int i=0; i<order; i++){
depin(sr_A, order, lens, divisor, nvirt_A, virt_dim, phys_rank,
      A, nnz_A, (int64_t*)nnz_blk_A, A, false);

char * old_B = new_B;
depin(sr_B, order, lens, divisor, nvirt_B, virt_dim, phys_rank,
      new_B, new_nnz_B, nnz_blk_B, new_B, true);
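pin and depin (documented below) translate between global keys and processor-local keys under the cyclic distribution given by divisor (the phase of each dimension) and phys_rank (this processor's grid coordinate). A minimal sketch of the pinning direction for one key, assuming a plain cyclic layout; the real routines also account for virtualization via virt_dim, and pin_key here is a hypothetical helper:

#include <cstdint>

// Sketch: convert one global key to a local key under a cyclic distribution.
// Global coordinate g in dimension d lives locally at g / divisor[d]; local
// dimension d has ceil(lens[d]/divisor[d]) slots.
int64_t pin_key(int64_t glb_key, int order,
                int const * lens, int const * divisor){
  int64_t loc_key = 0, loc_lda = 1;
  for (int d=0; d<order; d++){
    int64_t g = glb_key % lens[d];
    glb_key  /= lens[d];
    loc_key  += (g / divisor[d]) * loc_lda;
    loc_lda  *= (lens[d] + divisor[d] - 1) / divisor[d];  // local edge length
  }
  return loc_key;
}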
void dnA_spB_seq_sum(char const *alpha, char const *A, algstrct const *sr_A, int order_A, int const *edge_len_A, int const *sym_A, char const *beta, char const *B, int64_t size_B, char *&new_B, int64_t &new_size_B, algstrct const *sr_B, univar_function const *func)
performs summation of a dense tensor A into a sparse tensor B; assumes B contains key-value pairs sorted by key...
int calc_phys_rank(topology const *topo) const
compute the physical rank of a mapping
bool is_custom
whether there is an elementwise custom function
int64_t * nnz_blk
number of nonzero elements in each locally owned block
bool get_mpi_dt(int64_t count, int64_t datum_size, MPI_Datatype &dt)
gives a datatype for an arbitrary datum_size; errors if the size exceeds 32 bits
int * sym
symmetries among tensor dimensions
int * idx_A
indices of left operand
virtual int pair_size() const
gets the pair size: el_size plus the key size
int calc_phase() const
compute the phase of a mapping
virtual char * pair_alloc(int64_t n) const
allocate space for n (int64_t,dtype) pairs, necessary for object types
tspsum_replicate(tspsum *other)
int * pad_edge_len
padded tensor edge lengths
tspsum_virt(tspsum *other)
iterates over the dense virtualization block grid and sums each block
int calc_phys_phase() const
compute the physical phase of a mapping
void inv_idx(int order_A, int const *idx_A, int order_B, int const *idx_B, int order_C, int const *idx_C, int *order_tot, int **idx_arr)
invert index map
void allred(void *inbuf, void *outbuf, int64_t count, MPI_Datatype mdtype, MPI_Op op)
allreduce, same interface as MPI_Allreduce, but excluding the comm
void run()
wraps user sequential function signature
void spA_dnB_seq_sum(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, char const *beta, char *B, algstrct const *sr_B, int order_B, int const *edge_len_B, int const *sym_B, univar_function const *func)
performs summation of a sparse tensor A into a dense tensor B; assumes A contains key-value pairs sorted by key...
virtual void copy(char *a, char const *b) const
copies element b to element a
int64_t mem_fp()
returns the number of bytes of buffer space needed
void pin(int64_t n, int order, int const *lens, int const *divisor, PairIterator pi_new)
pins keys of n pairs
void * alloc(int64_t len)
alloc abstraction
void permute(int64_t n, int order, int const *old_lens, int64_t const *new_lda, PairIterator wA)
permutes keys of n pairs
virtual void set_nnz_blk_A(int64_t const *nnbA)
virtual char const * addid() const
identity element for addition i.e. 0
void sort(int64_t n)
sorts set of pairs using std::sort
int64_t mem_fp()
returns the number of bytes of buffer space needed
bool is_sparse
whether only the non-zero elements of the tensor are stored
int order
number of tensor dimensions
virtual void set(char *a, char const *b, int64_t n) const
sets n elements of array a to value b
int64_t mem_fp()
returns the number of bytes of buffer space needed
int * lens
unpadded tensor edge lengths
void depin(algstrct const *sr, int order, int const *lens, int const *divisor, int nvirt, int const *virt_dim, int const *phys_rank, char *X, int64_t &new_nnz_B, int64_t *nnz_blk, char *&new_B, bool check_padding)
depins keys of n pairs
seq_tsr_spsum(tspsum *other)
copies sum object
int64_t mem_fp()
returns the number of bytes of buffer space needed
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
virtual void pair_dealloc(char *ptr) const
deallocate given pointer containing contiguous array of pairs
virtual MPI_Op addmop() const
MPI addition operation for reductions.
mapping * edge_map
mappings of each tensor dimension onto topology dimensions
int * idx_B
indices of output
void bcast(void *buf, int64_t count, MPI_Datatype mdtype, int root)
broadcast, same interface as MPI_Bcast, but excluding the comm
int64_t mem_fp()
returns the number of bytes of buffer space needed
univar_function const * func
int64_t calc_nvirt() const
calculate and return the virtualization factor of the tensor
tspsum_pin_keys(tspsum *other)
int64_t mem_fp()
returns the number of bytes of buffer space needed
int64_t nnz_loc
number of local nonzero elements
int el_size
size of each element of algstrct in bytes
virtual void copy_pairs(char *a, char const *b, int64_t n) const
copies n pairs from array b to array a
int cdealloc(void *ptr)
free abstraction
algstrct (algebraic structure) defines the elementwise operations computed in each tensor contraction...
internal distributed tensor class
tspsum_map(tspsum *other)
tspsum_permute(tspsum *other)
topology * topo
topology to which the tensor is mapped
performs replication along a dimension, generates 2.5D algorithms
class for executing distributed summation of tensors
virtual char const * mulid() const
identity element for multiplication i.e. 1
void spA_spB_seq_sum(char const *alpha, char const *A, int64_t size_A, algstrct const *sr_A, char const *beta, char *B, int64_t size_B, char *&new_B, int64_t &new_size_B, algstrct const *sr_B, univar_function const *func, int64_t map_pfx)
performs summation between two sparse tensors; assumes A and B contain key-value pairs sorted by key (a merge sketch follows this list)...
virtual MPI_Datatype mdtype() const
MPI datatype.
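Since spA_spB_seq_sum consumes two key-sorted pair lists, its core is a two-pointer merge: pairs with matching keys are combined, unmatched pairs are copied through, and the output stays sorted. A self-contained sketch of that pattern over (key, double) pairs; the real routine operates on opaque algstrct elements and applies alpha, beta, and func:

#include <cstdint>
#include <vector>

struct Pair { int64_t k; double d; };

// Sketch: merge two key-sorted sparse pair lists, summing values on key match.
std::vector<Pair> sp_sp_sum(std::vector<Pair> const & A,
                            std::vector<Pair> const & B){
  std::vector<Pair> C;
  size_t i = 0, j = 0;
  while (i < A.size() && j < B.size()){
    if      (A[i].k < B[j].k) C.push_back(A[i++]);
    else if (B[j].k < A[i].k) C.push_back(B[j++]);
    else { C.push_back({A[i].k, A[i].d + B[j].d}); i++; j++; }  // keys match
  }
  while (i < A.size()) C.push_back(A[i++]);
  while (j < B.size()) C.push_back(B[j++]);
  return C;
}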