Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays
pad.cxx

#include "pad.h"
#include "../shared/util.h"

namespace CTF_int {
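  /**
   * \brief applies padding to keys: each key is remapped so that the stride of
   *        dimension j becomes (edge_len[j]+padding[j]), with an optional
   *        per-dimension shift by offsets[j] (offsets may be NULL)
   */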
  void pad_key(int              order,
               int64_t          num_pair,
               int const *      edge_len,
               int const *      padding,
               PairIterator     pairs,
               algstrct const * sr,
               int const *      offsets){
    int64_t i, j, lda;
    int64_t knew, k;
    if (offsets == NULL){
      #ifdef USE_OMP
      #pragma omp parallel for private(knew, k, lda, i, j)
      #endif
      for (i=0; i<num_pair; i++){
        k = pairs[i].k();
        lda = 1;
        knew = 0;
        for (j=0; j<order; j++){
          knew += lda*(k%edge_len[j]);
          lda *= (edge_len[j]+padding[j]);
          k = k/edge_len[j];
        }
        pairs[i].write_key(knew);
      }
    } else {
      #ifdef USE_OMP
      #pragma omp parallel for private(knew, k, lda, i, j)
      #endif
      for (i=0; i<num_pair; i++){
        k = pairs[i].k();
        lda = 1;
        knew = 0;
        for (j=0; j<order; j++){
          knew += lda*((k%edge_len[j])+offsets[j]);
          lda *= (edge_len[j]+padding[j]);
          k = k/edge_len[j];
        }
        pairs[i].write_key(knew);
      }
    }

  }

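  /**
   * \brief retrieves the unpadded pairs: keeps only the pairs whose keys lie
   *        inside the unpadded region (beyond any prepadding) and respect the
   *        symmetric ordering of indices, writing them to new_pairsb and
   *        their count to new_num_pair
   */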
  void depad_tsr(int              order,
                 int64_t          num_pair,
                 int const *      edge_len,
                 int const *      sym,
                 int const *      padding,
                 int const *      prepadding,
                 char const *     pairsb,
                 char *           new_pairsb,
                 int64_t *        new_num_pair,
                 algstrct const * sr){
    ConstPairIterator pairs = ConstPairIterator(sr, pairsb);
    PairIterator new_pairs = PairIterator(sr, new_pairsb);
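    // When compiled with USE_OMP, depadding runs in two passes: each thread
    // first counts the pairs it will keep, a prefix sum over those counts
    // gives each thread its write offset, and a second pass copies the
    // surviving pairs; otherwise a single sequential pass below does both.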
#ifdef USE_OMP
    int64_t num_ins;
    int mntd = omp_get_max_threads();
    int64_t * num_ins_t = (int64_t*)CTF_int::alloc(sizeof(int64_t)*mntd);
    int64_t * pre_ins_t = (int64_t*)CTF_int::alloc(sizeof(int64_t)*mntd);

    std::fill(num_ins_t, num_ins_t+mntd, 0);

    int act_ntd;
    TAU_FSTART(depad_tsr_cnt);
    #pragma omp parallel
    {
      int64_t i, j, st, end, tid;
      int64_t k;
      int64_t kparts[order];
      tid = omp_get_thread_num();
      int ntd = omp_get_num_threads();
      #pragma omp master
      {
        act_ntd = omp_get_num_threads();
      }

      st = (num_pair/ntd)*tid;
      if (tid == ntd-1)
        end = num_pair;
      else
        end = (num_pair/ntd)*(tid+1);

      num_ins_t[tid] = 0;
      for (i=st; i<end; i++){
        k = pairs[i].k();
        for (j=0; j<order; j++){
          kparts[j] = k%(edge_len[j]+padding[j]);
          if (kparts[j] >= (int64_t)edge_len[j] ||
              kparts[j] < prepadding[j]) break;
          k = k/(edge_len[j]+padding[j]);
        }
        if (j==order){
          for (j=0; j<order; j++){
            if (sym[j] == SY){
              if (kparts[j+1] < kparts[j])
                break;
            }
            if (sym[j] == AS || sym[j] == SH){
              if (kparts[j+1] <= kparts[j])
                break;
            }
          }
          if (j==order){
            num_ins_t[tid]++;
          }
        }
      }
    }
    TAU_FSTOP(depad_tsr_cnt);

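    // exclusive prefix sum of the per-thread counts: pre_ins_t[t] is the
    // offset in new_pairs at which thread t starts writing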
    pre_ins_t[0] = 0;
    for (int j=1; j<mntd; j++){
      pre_ins_t[j] = num_ins_t[j-1] + pre_ins_t[j-1];
    }

    TAU_FSTART(depad_tsr_move);
    #pragma omp parallel
    {
      int64_t i, j, st, end, tid;
      int64_t k;
      int64_t kparts[order];
      tid = omp_get_thread_num();
      int ntd = omp_get_num_threads();

      #pragma omp master
      {
        assert(act_ntd == ntd);
      }
      st = (num_pair/ntd)*tid;
      if (tid == ntd-1)
        end = num_pair;
      else
        end = (num_pair/ntd)*(tid+1);

      for (i=st; i<end; i++){
        k = pairs[i].k();
        for (j=0; j<order; j++){
          kparts[j] = k%(edge_len[j]+padding[j]);
          if (kparts[j] >= (int64_t)edge_len[j] ||
              kparts[j] < prepadding[j]) break;
          k = k/(edge_len[j]+padding[j]);
        }
        if (j==order){
          for (j=0; j<order; j++){
            if (sym[j] == SY){
              if (kparts[j+1] < kparts[j])
                break;
            }
            if (sym[j] == AS || sym[j] == SH){
              if (kparts[j+1] <= kparts[j])
                break;
            }
          }
          if (j==order){
            new_pairs[pre_ins_t[tid]].write(pairs[i]);
            pre_ins_t[tid]++;
          }
        }
      }
    }
    TAU_FSTOP(depad_tsr_move);
    num_ins = pre_ins_t[act_ntd-1];

    *new_num_pair = num_ins;
    CTF_int::cdealloc(pre_ins_t);
    CTF_int::cdealloc(num_ins_t);
#else
    int64_t i, j, num_ins;
    int64_t * kparts;
    int64_t k;

    CTF_int::alloc_ptr(sizeof(int64_t)*order, (void**)&kparts);

    num_ins = 0;
    for (i=0; i<num_pair; i++){
      k = pairs[i].k();
      for (j=0; j<order; j++){
        kparts[j] = k%(edge_len[j]+padding[j]);
        if (kparts[j] >= (int64_t)edge_len[j] ||
            kparts[j] < prepadding[j]) break;
        k = k/(edge_len[j]+padding[j]);
      }
      if (j==order){
        for (j=0; j<order; j++){
          if (sym[j] == SY){
            if (kparts[j+1] < kparts[j])
              break;
          }
          if (sym[j] == AS || sym[j] == SH){
            if (kparts[j+1] <= kparts[j])
              break;
          }
        }
        if (j==order){
          new_pairs[num_ins].write(pairs[i]);
          num_ins++;
        }
      }
    }
    *new_num_pair = num_ins;
    CTF_int::cdealloc(kparts);

#endif
  }
/*

  void pad_tsr(int              order,
               int64_t          size,
               int const *      edge_len,
               int const *      sym,
               int const *      padding,
               int const *      phys_phase,
               int *            phase_rank,
               int const *      virt_phase,
               char const *     old_data,
               char **          new_pairs,
               int64_t *        new_size,
               algstrct const * sr){
    int i, imax, act_lda;
    int64_t new_el, pad_el;
    int pad_max, virt_lda, outside, offset, edge_lda;
    int * idx;
    CTF_int::alloc_ptr(order*sizeof(int), (void**)&idx);
    char * padded_pairsb;

    pad_el = 0;

    for (;;){
      memset(idx, 0, order*sizeof(int));
      for (;;){
        if (sym[0] != NS)
          pad_max = idx[1]+1;
        else
          pad_max = (edge_len[0]+padding[0])/phys_phase[0];
        pad_el+=pad_max;
        for (act_lda=1; act_lda<order; act_lda++){
          idx[act_lda]++;
          imax = (edge_len[act_lda]+padding[act_lda])/phys_phase[act_lda];
          if (sym[act_lda] != NS)
            imax = idx[act_lda+1]+1;
          if (idx[act_lda] >= imax)
            idx[act_lda] = 0;
          if (idx[act_lda] != 0) break;
        }
        if (act_lda == order) break;

      }
      for (act_lda=0; act_lda<order; act_lda++){
        phase_rank[act_lda]++;
        if (phase_rank[act_lda]%virt_phase[act_lda]==0)
          phase_rank[act_lda] -= virt_phase[act_lda];
        if (phase_rank[act_lda]%virt_phase[act_lda]!=0) break;
      }
      if (act_lda == order) break;
    }
    CTF_int::alloc_ptr(pad_el*(sizeof(int64_t)+sr->el_size), (void**)&padded_pairsb);
    PairIterator padded_pairs = PairIterator(sr, padded_pairsb);
    new_el = 0;
    offset = 0;
    outside = -1;
    virt_lda = 1;
    for (i=0; i<order; i++){
      offset += phase_rank[i]*virt_lda;
      virt_lda*=(edge_len[i]+padding[i]);
    }

    for (;;){
      memset(idx, 0, order*sizeof(int));
      for (;;){
        if (sym[0] != NS){
          if (idx[1] < edge_len[0]/phys_phase[0]) {
            imax = idx[1];
            if (sym[0] != SY && phase_rank[0] < phase_rank[1])
              imax++;
            if (sym[0] == SY && phase_rank[0] <= phase_rank[1])
              imax++;
          } else {
            imax = edge_len[0]/phys_phase[0];
            if (phase_rank[0] < edge_len[0]%phys_phase[0])
              imax++;
          }
          pad_max = idx[1]+1;
        } else {
          imax = edge_len[0]/phys_phase[0];
          if (phase_rank[0] < edge_len[0]%phys_phase[0])
            imax++;
          pad_max = (edge_len[0]+padding[0])/phys_phase[0];
        }
        if (outside == -1){
          for (i=0; i<pad_max-imax; i++){
            padded_pairs[new_el+i].write_key(offset + (imax+i)*phys_phase[0]);
            padded_pairs[new_el+i].write_val(sr->addid());
          }
          new_el+=pad_max-imax;
        } else {
          for (i=0; i<pad_max; i++){
            padded_pairs[new_el+i].write_key(offset + i*phys_phase[0]);
            padded_pairs[new_el+i].write_val(sr->addid());
          }
          new_el += pad_max;
        }

        edge_lda = edge_len[0]+padding[0];
        for (act_lda=1; act_lda<order; act_lda++){
          offset -= idx[act_lda]*edge_lda*phys_phase[act_lda];
          idx[act_lda]++;
          imax = (edge_len[act_lda]+padding[act_lda])/phys_phase[act_lda];
          if (sym[act_lda] != NS && idx[act_lda+1]+1 <= imax){
            imax = idx[act_lda+1]+1;
            // if (phase_rank[act_lda] < phase_rank[sym[act_lda]])
            //   imax++;
          }
          if (idx[act_lda] >= imax)
            idx[act_lda] = 0;
          offset += idx[act_lda]*edge_lda*phys_phase[act_lda];
          if (idx[act_lda] > edge_len[act_lda]/phys_phase[act_lda] ||
              (idx[act_lda] == edge_len[act_lda]/phys_phase[act_lda] &&
               (edge_len[act_lda]%phys_phase[act_lda] <= phase_rank[act_lda]))){
            if (outside < act_lda)
              outside = act_lda;
          } else {
            if (outside == act_lda)
              outside = -1;
          }
          if (sym[act_lda] != NS && idx[act_lda] == idx[act_lda+1]){
            if (sym[act_lda] != SY &&
                phase_rank[act_lda] >= phase_rank[act_lda+1]){
              if (outside < act_lda)
                outside = act_lda;
            }
            if (sym[act_lda] == SY &&
                phase_rank[act_lda] > phase_rank[act_lda+1]){
              if (outside < act_lda)
                outside = act_lda;
            }
          }
          if (idx[act_lda] != 0) break;
          edge_lda*=(edge_len[act_lda]+padding[act_lda]);
        }
        if (act_lda == order) break;

      }
      virt_lda = 1;
      for (act_lda=0; act_lda<order; act_lda++){
        offset -= phase_rank[act_lda]*virt_lda;
        phase_rank[act_lda]++;
        if (phase_rank[act_lda]%virt_phase[act_lda]==0)
          phase_rank[act_lda] -= virt_phase[act_lda];
        offset += phase_rank[act_lda]*virt_lda;
        if (phase_rank[act_lda]%virt_phase[act_lda]!=0) break;
        virt_lda*=(edge_len[act_lda]+padding[act_lda]);
      }
      if (act_lda == order) break;

    }
    CTF_int::cdealloc(idx);
    DEBUG_PRINTF("order = %d new_el=%ld, size = %ld, pad_el = %ld\n", order, new_el, size, pad_el);
    ASSERT(new_el + size == pad_el);
    memcpy(padded_pairs[new_el].ptr, old_data, size*(sizeof(int64_t)+sr->el_size));
    *new_pairs = padded_pairsb;
    *new_size = pad_el;
  }
*/
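  /**
   * \brief sets to zero all values in the padded region of a tensor, i.e.
   *        entries whose global index lies beyond the unpadded edge lengths
   *        or violates the symmetric ordering of indices
   */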
  void zero_padding(int              order,
                    int64_t          size,
                    int              nvirt,
                    int const *      edge_len,
                    int const *      sym,
                    int const *      padding,
                    int const *      phase,
                    int const *      phys_phase,
                    int const *      virt_phase,
                    int const *      cphase_rank,
                    char *           vdata,
                    algstrct const * sr){

    if (order == 0) return;
    #ifdef USE_OMP
    #pragma omp parallel
    #endif
    {
      int i, act_lda, act_max, curr_idx, sym_idx;
      int is_outside;
      int64_t p, buf_offset;
      int * idx, * virt_rank, * phase_rank, * virt_len;
      char * data;

      CTF_int::alloc_ptr(order*sizeof(int), (void**)&idx);
      CTF_int::alloc_ptr(order*sizeof(int), (void**)&virt_rank);
      CTF_int::alloc_ptr(order*sizeof(int), (void**)&phase_rank);
      CTF_int::alloc_ptr(order*sizeof(int), (void**)&virt_len);
      for (int dim=0; dim<order; dim++){
        virt_len[dim] = edge_len[dim]/phase[dim];
      }

      memcpy(phase_rank, cphase_rank, order*sizeof(int));
      memset(virt_rank, 0, sizeof(int)*order);

      int tid, ntd, vst, vend;
      #ifdef USE_OMP
      tid = omp_get_thread_num();
      ntd = omp_get_num_threads();
      #else
      tid = 0;
      ntd = 1;
      #endif

      int * st_idx=NULL, * end_idx;
      int64_t st_index = 0;
      int64_t end_index = size/nvirt;

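      // partition the work over threads: with at least as many virtual blocks
      // as threads, each thread gets whole blocks [vst,vend); otherwise blocks
      // are split and st_index/end_index bound the elements handled in the
      // first and last (partial) blocks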
      if (ntd <= nvirt){
        vst = (nvirt/ntd)*tid;
        vst += MIN(nvirt%ntd,tid);
        vend = vst+(nvirt/ntd);
        if (tid < nvirt % ntd) vend++;
      } else {
        int64_t vrt_sz = size/nvirt;
        int64_t chunk = size/ntd;
        int64_t st_chunk = chunk*tid + MIN(tid,size%ntd);
        int64_t end_chunk = st_chunk+chunk;
        if (tid<(size%ntd))
          end_chunk++;
        vst = st_chunk/vrt_sz;
        vend = end_chunk/vrt_sz;
        if ((end_chunk%vrt_sz) > 0) vend++;

        st_index = st_chunk-vst*vrt_sz;
        end_index = end_chunk-(vend-1)*vrt_sz;

        CTF_int::alloc_ptr(order*sizeof(int), (void**)&st_idx);
        CTF_int::alloc_ptr(order*sizeof(int), (void**)&end_idx);

        int * ssym;
        CTF_int::alloc_ptr(order*sizeof(int), (void**)&ssym);
        for (int dim=0;dim<order;dim++){
          if (sym[dim] != NS) ssym[dim] = SY;
          else ssym[dim] = NS;
        }

        // calculate the index array over all dimensions, to properly load
        // balance with symmetry, then clip the first index to avoid logic
        // inside the inner loop
        calc_idx_arr(order, virt_len, ssym, st_index, st_idx);
        calc_idx_arr(order, virt_len, ssym, end_index, end_idx);

        CTF_int::cdealloc(ssym);

        if (st_idx[0] != 0){
          st_index -= st_idx[0];
          st_idx[0] = 0;
        }
        if (end_idx[0] != 0){
          end_index += virt_len[0]-end_idx[0];
        }
        CTF_int::cdealloc(end_idx);
      }
      ASSERT(tid != ntd-1 || vend == nvirt);
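      // walk this thread's virtual blocks; within each block, traverse rows
      // along dimension 0 and zero the entries that fall in the padded region
      // or break the symmetric ordering of indices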
      for (p=0; p<nvirt; p++){
        if (p>=vst && p<vend){
          int is_sh_pad0 = 0;
          if (((sym[0] == AS || sym[0] == SH) && phase_rank[0] >= phase_rank[1]) ||
              ( sym[0] == SY && phase_rank[0] > phase_rank[1]) ) {
            is_sh_pad0 = 1;
          }
          int pad0 = (padding[0]+phase_rank[0])/phase[0];
          int len0 = virt_len[0]-pad0;
          int plen0 = virt_len[0];
          data = vdata + sr->el_size*p*(size/nvirt);

          if (p==vst && st_index != 0){
            idx[0] = 0;
            memcpy(idx+1,st_idx+1,(order-1)*sizeof(int));
            buf_offset = st_index;
          } else {
            buf_offset = 0;
            memset(idx, 0, order*sizeof(int));
          }

          for (;;){
            is_outside = 0;
            for (i=1; i<order; i++){
              curr_idx = idx[i]*phase[i]+phase_rank[i];
              if (curr_idx >= edge_len[i] - padding[i]){
                is_outside = 1;
                break;
              } else if (i < order-1) {
                sym_idx = idx[i+1]*phase[i+1]+phase_rank[i+1];
                if (((sym[i] == AS || sym[i] == SH) && curr_idx >= sym_idx) ||
                    ( sym[i] == SY && curr_idx > sym_idx) ) {
                  is_outside = 1;
                  break;
                }
              }
            }
            /* for (i=0; i<order; i++){
                 printf("phase_rank[%d] = %d, idx[%d] = %d, ",i,phase_rank[i],i,idx[i]);
               }
               printf("\n");
               printf("data["PRId64"]=%lf is_outside = %d\n", buf_offset+p*(size/nvirt), data[buf_offset], is_outside);*/

            if (sym[0] != NS) plen0 = idx[1]+1;

            if (is_outside){
              // std::fill(data+buf_offset, data+buf_offset+plen0, 0.0);
              //for (int64_t j=buf_offset; j<buf_offset+plen0; j++){
              //}
              sr->set(data+buf_offset*sr->el_size, sr->addid(), plen0);
            } else {
              int s1 = MIN(plen0-is_sh_pad0,len0);
              /* if (sym[0] == SH) s1 = MIN(s1, len0-1);*/
              // std::fill(data+buf_offset+s1, data+buf_offset+plen0, 0.0);
              //for (int64_t j=buf_offset+s1; j<buf_offset+plen0; j++){
              //  data[j] = 0.0;
              //}
              sr->set(data+(buf_offset+s1)*sr->el_size, sr->addid(), plen0-s1);
            }
            buf_offset+=plen0;
            if (p == vend-1 && buf_offset >= end_index) break;
            /* Increment indices and set up offsets */
            for (i=1; i < order; i++){
              idx[i]++;
              act_max = virt_len[i];
              if (sym[i] != NS){
                // sym_idx = idx[i+1]*phase[i+1]+phase_rank[i+1];
                // act_max = MIN(act_max,((sym_idx-phase_rank[i])/phase[i]+1));
                act_max = MIN(act_max,idx[i+1]+1);
              }
              if (idx[i] >= act_max)
                idx[i] = 0;
              ASSERT(edge_len[i]%phase[i] == 0);
              if (idx[i] > 0)
                break;
            }
            if (i >= order) break;
          }
        }
        for (act_lda=0; act_lda < order; act_lda++){
          phase_rank[act_lda] -= virt_rank[act_lda]*phys_phase[act_lda];
          virt_rank[act_lda]++;
          if (virt_rank[act_lda] >= virt_phase[act_lda])
            virt_rank[act_lda] = 0;
          phase_rank[act_lda] += virt_rank[act_lda]*phys_phase[act_lda];
          if (virt_rank[act_lda] > 0)
            break;
        }
      }
      CTF_int::cdealloc(idx);
      CTF_int::cdealloc(virt_rank);
      CTF_int::cdealloc(virt_len);
      CTF_int::cdealloc(phase_rank);
      if (st_idx != NULL) CTF_int::cdealloc(st_idx);
    }
  }

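  /**
   * \brief scales each element by 1/(number of entries equivalent to it after
   *        permutation of the indices for which sym_mask is set)
   */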
  void scal_diag(int              order,
                 int64_t          size,
                 int              nvirt,
                 int const *      edge_len,
                 int const *      sym,
                 int const *      padding,
                 int const *      phase,
                 int const *      phys_phase,
                 int const *      virt_phase,
                 int const *      cphase_rank,
                 char *           vdata,
                 algstrct const * sr,
                 int const *      sym_mask){
    #ifdef USE_OMP
    #pragma omp parallel
    #endif
    {
      int i, act_lda, act_max;
      int perm_factor;
      int64_t p, buf_offset;
      int * idx, * virt_rank, * phase_rank, * virt_len;
      char * data;

      CTF_int::alloc_ptr(order*sizeof(int), (void**)&idx);
      CTF_int::alloc_ptr(order*sizeof(int), (void**)&virt_rank);
      CTF_int::alloc_ptr(order*sizeof(int), (void**)&phase_rank);
      CTF_int::alloc_ptr(order*sizeof(int), (void**)&virt_len);
      for (int dim=0; dim<order; dim++){
        virt_len[dim] = edge_len[dim]/phase[dim];
      }

      memcpy(phase_rank, cphase_rank, order*sizeof(int));
      memset(virt_rank, 0, sizeof(int)*order);

      int tid, ntd, vst, vend;
      #ifdef USE_OMP
      tid = omp_get_thread_num();
      ntd = omp_get_num_threads();
      #else
      tid = 0;
      ntd = 1;
      #endif

      int * st_idx=NULL, * end_idx;
      int64_t st_index = 0;
      int64_t end_index = size/nvirt;

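      // decompose the work over threads by virtual block, using the same
      // scheme as in zero_padding above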
      if (ntd <= nvirt){
        vst = (nvirt/ntd)*tid;
        vst += MIN(nvirt%ntd,tid);
        vend = vst+(nvirt/ntd);
        if (tid < nvirt % ntd) vend++;
      } else {
        int64_t vrt_sz = size/nvirt;
        int64_t chunk = size/ntd;
        int64_t st_chunk = chunk*tid + MIN(tid,size%ntd);
        int64_t end_chunk = st_chunk+chunk;
        if (tid<(size%ntd))
          end_chunk++;
        vst = st_chunk/vrt_sz;
        vend = end_chunk/vrt_sz;
        if ((end_chunk%vrt_sz) > 0) vend++;

        st_index = st_chunk-vst*vrt_sz;
        end_index = end_chunk-(vend-1)*vrt_sz;

        CTF_int::alloc_ptr(order*sizeof(int), (void**)&st_idx);
        CTF_int::alloc_ptr(order*sizeof(int), (void**)&end_idx);

        int * ssym;
        CTF_int::alloc_ptr(order*sizeof(int), (void**)&ssym);
        for (int dim=0;dim<order;dim++){
          if (sym[dim] != NS) ssym[dim] = SY;
          else ssym[dim] = NS;
        }

        // calculate the index array over all dimensions, to properly load
        // balance with symmetry, then clip the first index to avoid logic
        // inside the inner loop
        calc_idx_arr(order, virt_len, ssym, st_index, st_idx);
        calc_idx_arr(order, virt_len, ssym, end_index, end_idx);

        CTF_int::cdealloc(ssym);

        if (st_idx[0] != 0){
          st_index -= st_idx[0];
          st_idx[0] = 0;
        }
        if (end_idx[0] != 0){
          end_index -= end_idx[0];
          end_idx[0] = 0;
        }
        CTF_int::cdealloc(end_idx);
      }
      ASSERT(tid != ntd-1 || vend == nvirt);
      for (p=0; p<nvirt; p++){
        if (st_index == end_index) break;
        if (p>=vst && p<vend){
          /*int is_sh_pad0 = 0;
          if (((sym[0] == AS || sym[0] == SH) && phase_rank[0] >= phase_rank[1]) ||
              ( sym[0] == SY && phase_rank[0] > phase_rank[1]) ) {
            is_sh_pad0 = 1;
          }
          int len0 = virt_len[0]-pad0;
          int pad0 = (padding[0]+phase_rank[0])/phase[0];*/
          int plen0 = virt_len[0];
          data = vdata + sr->el_size*p*(size/nvirt);

          if (p==vst && st_index != 0){
            idx[0] = 0;
            memcpy(idx+1,st_idx+1,(order-1)*sizeof(int));
            buf_offset = st_index;
          } else {
            buf_offset = 0;
            memset(idx, 0, order*sizeof(int));
          }

          for (;;){
            if (sym[0] != NS) plen0 = idx[1]+1;
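            // perm_factor multiplies together, for each sym_mask'ed dimension,
            // one plus the number of later sym_mask'ed dimensions carrying the
            // same global index; entries on such diagonals are scaled by its
            // inverse below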
            perm_factor = 1;
            for (i=1; i<order; i++){
              if (sym_mask[i] == 1){
                int curr_idx_i = idx[i]*phase[i]+phase_rank[i];
                int iperm = 1;
                for (int j=i+1; j<order; j++){
                  if (sym_mask[j] == 1){
                    int curr_idx_j = idx[j]*phase[j]+phase_rank[j];
                    if (curr_idx_i == curr_idx_j) iperm++;
                  }
                }
                perm_factor *= iperm;
              }
            }
            if (sym_mask[0] == 0){
              if (perm_factor != 1){
                char scal_fact[sr->el_size];
                sr->cast_double(1./perm_factor, scal_fact);
                sr->scal(plen0, scal_fact, data+buf_offset*sr->el_size, 1);
              }
            } else {

              if (perm_factor != 1){
                char scal_fact[sr->el_size];
                sr->cast_double(1./perm_factor, scal_fact);
                sr->scal(idx[1]+1, scal_fact, data+buf_offset*sr->el_size, 1);
              }
              int curr_idx_0 = idx[1]*phase[0]+phase_rank[0];
              int iperm = 1;
              for (int j=1; j<order; j++){
                if (sym_mask[j] == 1){
                  int curr_idx_j = idx[j]*phase[j]+phase_rank[j];
                  if (curr_idx_0 == curr_idx_j) iperm++;
                }
              }
              char scal_fact2[sr->el_size];
              sr->cast_double(1./iperm, scal_fact2);
              sr->scal(1, scal_fact2, data+(buf_offset+idx[1])*sr->el_size, 1);
            }
            buf_offset+=plen0;
            if (p == vend-1 && buf_offset >= end_index) break;
            /* Increment indices and set up offsets */
            for (i=1; i < order; i++){
              idx[i]++;
              act_max = virt_len[i];
              if (sym[i] != NS){
                // sym_idx = idx[i+1]*phase[i+1]+phase_rank[i+1];
                // act_max = MIN(act_max,((sym_idx-phase_rank[i])/phase[i]+1));
                act_max = MIN(act_max,idx[i+1]+1);
              }
              if (idx[i] >= act_max)
                idx[i] = 0;
              ASSERT(edge_len[i]%phase[i] == 0);
              if (idx[i] > 0)
                break;
            }
            if (i >= order) break;
          }
        }
        for (act_lda=0; act_lda < order; act_lda++){
          phase_rank[act_lda] -= virt_rank[act_lda]*phys_phase[act_lda];
          virt_rank[act_lda]++;
          if (virt_rank[act_lda] >= virt_phase[act_lda])
            virt_rank[act_lda] = 0;
          phase_rank[act_lda] += virt_rank[act_lda]*phys_phase[act_lda];
          if (virt_rank[act_lda] > 0)
            break;
        }
      }
      CTF_int::cdealloc(idx);
      CTF_int::cdealloc(virt_rank);
      CTF_int::cdealloc(virt_len);
      CTF_int::cdealloc(phase_rank);
      if (st_idx != NULL) CTF_int::cdealloc(st_idx);
    }
  }

}