Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
scaling.cxx
#include "scaling.h"
#include "strp_tsr.h"
#include "../mapping/mapping.h"
#include "../mapping/distribution.h"
#include "../tensor/untyped_tensor.h"
#include "../shared/util.h"
#include "../shared/memcontrol.h"
#include "sym_seq_scl.h"

namespace CTF_int {

  using namespace CTF;

13 
15  int const * idx_map_,
16  char const * alpha_){
17  A = A_;
18  alpha = alpha_;
19  is_custom = 0;
20 
21  idx_map = (int*)alloc(sizeof(int)*A->order);
22  memcpy(idx_map, idx_map_, sizeof(int)*A->order);
23  }

  scaling::scaling(tensor *     A_,
                   char const * cidx_map,
                   char const * alpha_){
    A         = A_;
    alpha     = alpha_;
    is_custom = 0;

    conv_idx(A->order, cidx_map, &idx_map);
  }

  scaling::scaling(tensor *             A_,
                   int const *          idx_map_,
                   char const *         alpha_,
                   endomorphism const * func_){
    A         = A_;
    alpha     = alpha_;
    func      = func_;
    is_custom = 1;

    idx_map = (int*)alloc(sizeof(int)*A->order);
    memcpy(idx_map, idx_map_, sizeof(int)*A->order);
  }

  scaling::scaling(tensor *             A_,
                   char const *         cidx_map,
                   char const *         alpha_,
                   endomorphism const * func_){
    A         = A_;
    alpha     = alpha_;
    func      = func_;
    is_custom = 1;

    conv_idx(A->order, cidx_map, &idx_map);
  }

  scaling::~scaling(){
    cdealloc(idx_map);
  }
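
  // execute() performs the dense scaling: if the tensor's current mapping
  // cannot represent repeated indices in idx_map, it is remapped onto the best
  // fitting topology and redistributed; diagonals implied by repeated indices
  // are stripped, the work is split across virtual sub-blocks, and a
  // sequential kernel (seq_tsr_scl) scales each local block.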
  int scaling::execute(){
    int st, is_top, order_tot, iA, ret, itopo, btopo;
    int64_t blk_sz, vrt_sz;
    distribution * old_dst = NULL;
    int * virt_dim, * idx_arr;
    int * virt_blk_len, * blk_len;
    int64_t nvirt;
    int64_t memuse, bmemuse;
    mapping * map;
    tensor * tsr, * ntsr;
    strp_tsr * str;
    scl * hscl = NULL, ** rec_scl = NULL;

    is_top = 1;
    tsr = A;
    if (tsr->has_zero_edge_len){
      return SUCCESS;
    }

    #if DEBUG>=2
    if (tsr->wrld->cdt.rank == 0){
      printf("Scaling tensor %s.\n", tsr->name);
      printf("The index mapping is");
      for (int i=0; i<tsr->order; i++){
        printf(" %d",idx_map[i]);
      }
      printf("\n");
      printf("Old mapping for tensor %s\n",tsr->name);
    }
    tsr->print_map(stdout);
    #endif

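    // Sparse tensors are handled separately: their local nonzero pairs are
    // scaled in place by sp_scl() (defined below), with no remapping needed.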
    if (A->is_sparse){
      sp_scl();
      return SUCCESS;
    }

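    // With HOME_CONTRACT enabled, work on a shallow copy (ntsr) that aliases
    // the home buffer, so the tensor can be remapped and redistributed freely
    // and migrated back to its home distribution afterwards.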
    #ifdef HOME_CONTRACT
    int was_home = tsr->is_home;
    if (was_home){
      ntsr = new tensor(tsr, 0, 0);
      ntsr->data = tsr->data;
      ntsr->home_buffer = tsr->home_buffer;
      ntsr->is_home = 1;
      ntsr->has_home = 1;
      ntsr->is_mapped = 1;
      ntsr->topo = tsr->topo;
      copy_mapping(tsr->order, tsr->edge_map, ntsr->edge_map);
      ntsr->set_padding();
      if (tsr->is_sparse){
        CTF_int::alloc_ptr(ntsr->calc_nvirt()*sizeof(int64_t), (void**)&ntsr->nnz_blk);
        ntsr->set_new_nnz_glb(tsr->nnz_blk);
      }
    } else ntsr = tsr;
    #else
    ntsr = tsr;
    #endif
    ntsr->unfold();
    ntsr->set_padding();
    inv_idx(ntsr->order, idx_map,
            &order_tot, &idx_arr);

    CTF_int::alloc_ptr(sizeof(int)*ntsr->order, (void**)&blk_len);
    CTF_int::alloc_ptr(sizeof(int)*ntsr->order, (void**)&virt_blk_len);
    CTF_int::alloc_ptr(sizeof(int)*order_tot, (void**)&virt_dim);

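    // If the current mapping cannot express the repeated indices in idx_map,
    // search the candidate topologies (distributed round-robin over ranks) for
    // a self-consistent mapping that fits in memory, pick the one with the
    // smallest footprint via get_best_topo(), and redistribute the data to it.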
    btopo = -1;
    if (!check_self_mapping(ntsr, idx_map)){
      old_dst = new distribution(ntsr);
      bmemuse = INT64_MAX;
      for (itopo=tsr->wrld->cdt.rank; itopo<(int)tsr->wrld->topovec.size(); itopo+=tsr->wrld->cdt.np){
        ntsr->clear_mapping();
        ntsr->set_padding();
        ntsr->topo = tsr->wrld->topovec[itopo];
        ntsr->is_mapped = true;
        ret = map_self_indices(ntsr, idx_map);
        if (ret!=SUCCESS) continue;
        ret = ntsr->map_tensor_rem(ntsr->topo->order,
                                   ntsr->topo->dim_comm, 0);
        if (ret!=SUCCESS) continue;
        ret = map_self_indices(ntsr, idx_map);
        if (ret!=SUCCESS) continue;
        if (check_self_mapping(ntsr, idx_map)) {
          ntsr->set_padding();
          memuse = ntsr->size;

          if (memuse >= proc_bytes_available()){
            DPRINTF(1,"Not enough memory to scale tensor on topo %d\n", itopo);
            continue;
          }

          nvirt = ntsr->calc_nvirt();
          ASSERT(nvirt != 0);
          if (btopo == -1){
            btopo = itopo;
            bmemuse = memuse;
          } else if (memuse < bmemuse){
            btopo = itopo;
            bmemuse = memuse;
          }
        }
      }
      if (btopo == -1){
        bmemuse = INT64_MAX;
      }
      btopo = get_best_topo(1, btopo, tsr->wrld->cdt, 0.0, bmemuse);

      if (btopo == -1 || btopo == INT_MAX) {
        if (tsr->wrld->cdt.rank==0)
          printf("ERROR: FAILED TO MAP TENSOR SCALE\n");
        return ERROR;
      }

      ntsr->clear_mapping();
      ntsr->set_padding();
      ntsr->is_cyclic = 1;
      ntsr->topo = tsr->wrld->topovec[btopo];
      ntsr->is_mapped = true;
      ret = map_self_indices(ntsr, idx_map);
      if (ret!=SUCCESS) ABORT;
      ret = ntsr->map_tensor_rem(ntsr->topo->order,
                                 ntsr->topo->dim_comm, 0);
      if (ret!=SUCCESS) ABORT;
      ret = map_self_indices(ntsr, idx_map);
      if (ret!=SUCCESS) ABORT;
      ntsr->set_padding();
      #if DEBUG >=2
      if (tsr->wrld->cdt.rank == 0){
        printf("New mapping for tensor %s\n",ntsr->name);
      }
      ntsr->print_map(stdout);
      #endif
      TAU_FSTART(redistribute_for_scale);
      ntsr->redistribute(*old_dst);
      TAU_FSTOP(redistribute_for_scale);
    }

    blk_sz = ntsr->size;
    calc_dim(ntsr->order, blk_sz, ntsr->pad_edge_len, ntsr->edge_map,
             &vrt_sz, virt_blk_len, blk_len);

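    // strip_diag() checks whether diagonals implied by repeated indices can be
    // stripped from the blocked layout; if so, a strp_scl node becomes the top
    // of the recursive scaling tree and the block lengths/sizes are reduced
    // accordingly.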
    st = strip_diag(ntsr->order, order_tot, idx_map, vrt_sz,
                    ntsr->edge_map, ntsr->topo, ntsr->sr,
                    blk_len, &blk_sz, &str);
    if (st){
      if (tsr->wrld->cdt.rank == 0)
        DPRINTF(2,"Stripping tensor\n");
      strp_scl * sscl = new strp_scl;
      sscl->sr_A = tsr->sr;
      hscl = sscl;
      is_top = 0;
      rec_scl = &sscl->rec_scl;

      sscl->rec_strp = str;
    }

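    // Count the virtual sub-blocks owned locally: every index whose innermost
    // mapping is VIRTUAL_MAP contributes its virtual phase (divided by any
    // stripped dimension) as a factor of nvirt.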
    nvirt = 1;
    for (int i=0; i<order_tot; i++){
      iA = idx_arr[i];
      if (iA != -1){
        map = &ntsr->edge_map[iA];
        while (map->has_child) map = map->child;
        if (map->type == VIRTUAL_MAP){
          virt_dim[i] = map->np;
          if (st) virt_dim[i] = virt_dim[i]/str->strip_dim[iA];
        }
        else virt_dim[i] = 1;
      } else virt_dim[i] = 1;
      nvirt *= virt_dim[i];
    }

    /* Scale over virtual sub-blocks */
    if (nvirt > 1){
      scl_virt * sclv = new scl_virt;
      sclv->sr_A = tsr->sr;
      if (is_top) {
        hscl = sclv;
        is_top = 0;
      } else {
        *rec_scl = sclv;
      }
      rec_scl = &sclv->rec_scl;

      sclv->num_dim = order_tot;
      sclv->virt_dim = virt_dim;
      sclv->order_A = ntsr->order;
      sclv->blk_sz_A = vrt_sz;
      sclv->idx_map_A = idx_map;
      sclv->buffer = NULL;
    } else CTF_int::cdealloc(virt_dim);

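    // The leaf of the call tree is the sequential kernel seq_tsr_scl, which
    // applies alpha (and the custom endomorphism, if is_custom) to one dense
    // block of the mapped tensor.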
    seq_tsr_scl * sclseq = new seq_tsr_scl;
    sclseq->sr_A = tsr->sr;
    if (is_top) {
      hscl = sclseq;
      is_top = 0;
    } else {
      *rec_scl = sclseq;
    }
    sclseq->alpha = alpha;
    hscl->alpha = alpha;
    if (is_custom){
      sclseq->func = func;
      sclseq->is_custom = 1;
    } else {
      sclseq->is_custom = 0;
    }
    sclseq->order = ntsr->order;
    sclseq->idx_map = idx_map;
    sclseq->edge_len = virt_blk_len;
    sclseq->sym = ntsr->sym;

    hscl->A = ntsr->data;

    CTF_int::cdealloc(idx_arr);
    CTF_int::cdealloc(blk_len);

    hscl->run();
    delete hscl;

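    // If the tensor left its home mapping in order to be scaled, redistribute
    // the result back to the home distribution and restore the home buffer;
    // otherwise just release the aliased working copy.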
    #ifdef HOME_CONTRACT
    if (was_home && !ntsr->is_home){
      if (tsr->wrld->cdt.rank == 0)
        DPRINTF(2,"Migrating tensor %s back to home\n", tsr->name);
      if (old_dst != NULL) delete old_dst;
      old_dst = new distribution(ntsr);
/*    save_mapping(ntsr,
                   &old_phase, &old_rank,
                   &old_virt_dim, &old_pe_lda,
                   &old_size,
                   &was_cyclic, &old_padding,
                   &old_edge_len, &ntsr->topo);*/
      tsr->data = ntsr->data;
      tsr->is_home = 0;

      if (tsr->is_sparse){
        tsr->sr->pair_dealloc(ntsr->home_buffer);
        ntsr->home_buffer = NULL;
        CTF_int::alloc_ptr(ntsr->calc_nvirt()*sizeof(int64_t), (void**)&tsr->nnz_blk);
        tsr->set_new_nnz_glb(ntsr->nnz_blk);
      }

      TAU_FSTART(redistribute_for_scale_home);
      tsr->redistribute(*old_dst);
      TAU_FSTOP(redistribute_for_scale_home);
      if (!tsr->is_sparse){
        tsr->sr->copy(tsr->home_buffer, tsr->data, tsr->size);
        tsr->sr->dealloc(tsr->data);
        tsr->data = tsr->home_buffer;
      }
      tsr->is_home = 1;
      tsr->has_home = 1;
      ntsr->is_data_aliased = 1;
      delete ntsr;
    } else if (was_home){
      if (ntsr->data != tsr->data){
        printf("Tensor %s is a copy of %s and did not leave home but buffer is %p was %p\n", ntsr->name, tsr->name, ntsr->data, tsr->data);
        ABORT;
      }
      ntsr->has_home = 0;
      ntsr->is_home = 0;
      ntsr->is_data_aliased = 1;
      delete ntsr;
    }
    #endif

    if (old_dst != NULL) delete old_dst;
    #if DEBUG>=2
    if (tsr->wrld->cdt.rank == 0)
      printf("Done scaling tensor %s.\n", tsr->name);
    #endif

    return SUCCESS;
  }

  void scaling::sp_scl(){
    TAU_FSTART(sp_scl);
    bool has_rep_idx = false;
    for (int i=0; i<A->order; i++){
      for (int j=0; j<i; j++){
        if (idx_map[i] == idx_map[j]) has_rep_idx = true;
      }
    }

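    // Two cases follow: with no repeated indices, every local nonzero is
    // scaled (and passed through the custom function if one is set); with
    // repeated indices, only entries on the corresponding generalized
    // diagonal are modified.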
    PairIterator pi(A->sr, A->data);
    // if applying custom function, apply immediately on reduced form
    if (!has_rep_idx){
      if (is_custom){
#ifdef USE_OMP
        #pragma omp parallel for
#endif
        for (int64_t i=0; i<A->nnz_loc; i++){
          if (alpha != NULL)
            A->sr->mul(pi[i].d(), alpha, pi[i].d());
          func->apply_f(pi[i].d());
        }
      } else {
#ifdef USE_OMP
        #pragma omp parallel for
#endif
        for (int64_t i=0; i<A->nnz_loc; i++){
          A->sr->mul(pi[i].d(), alpha, pi[i].d());
        }
      }
    } else {
      int nrep_idx=0;
      int rep_inds[A->order];
      for (int i=0; i<A->order; i++){
        for (int j=0; j<A->order; j++){
          if (i!=j && idx_map[i] == idx_map[j]){
            rep_inds[nrep_idx] = i;
            nrep_idx++;
            break;
          }
        }
      }

      int64_t ldas[A->order];
      ldas[0] = 1;
      for (int i=1; i<A->order; i++){
        ldas[i] = ldas[i-1]*A->lens[i-1];
      }
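      // ldas[j] is the stride of dimension j in the linearized global key, so
      // (key / ldas[j]) % lens[j] recovers coordinate j. A pair is scaled only
      // if all dimensions sharing an index label carry equal coordinates,
      // i.e. the entry lies on the generalized diagonal.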
#ifdef USE_OMP
      #pragma omp parallel for
#endif
      for (int64_t i=0; i<A->nnz_loc; i++){
        bool pass=true;
        int64_t pkey[A->order];
        for (int j=0; j<nrep_idx; j++){
          pkey[rep_inds[j]] = (pi[i].k()/ldas[rep_inds[j]])%A->lens[rep_inds[j]];
          for (int k=0; k<j; k++){
            if (idx_map[rep_inds[j]] == idx_map[rep_inds[k]] &&
                pkey[rep_inds[j]] != pkey[rep_inds[k]]){
              pass=false;
            }
          }
        }
        if (pass){
          if (alpha != NULL)
            A->sr->mul(pi[i].d(), alpha, pi[i].d());
          if (is_custom)
            func->apply_f(pi[i].d());
        }
      }
    }
    TAU_FSTOP(sp_scl);
  }
}
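
Minimal usage sketch (not part of scaling.cxx): the internal scaling class defined above is constructed on an already-mapped CTF_int::tensor and driven by calling execute(). The helper below is hypothetical; the tensor T and the raw double passed as the char-encoded scalar are assumptions for illustration, and in practice such objects are produced by the typed CTF front end.

// Hypothetical helper: scale all elements of an order-2 tensor by 0.5
// (equivalent to T["ij"] *= 0.5) through the internal interface in this file.
// Assumes T is a valid, mapped CTF_int::tensor whose algebraic structure
// holds doubles; the surrounding tensor setup is not shown.
#include "scaling.h"

void scale_by_half(CTF_int::tensor * T){
  int idx_map[2] = {0, 1};                      // distinct indices: no diagonal
  double half    = 0.5;                         // alpha is passed as raw bytes
  CTF_int::scaling s(T, idx_map, (char const *)&half);
  s.execute();                                  // remaps, strips, and scales blocks
}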