Cyclops Tensor Framework
parallel arithmetic on multidimensional arrays

nosym_transp.cxx
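// nosym_transp.cxx implements transposition of dense (non-symmetric) tensor
// data: a recursive strided kernel, an optional cache-line-blocked variant
// (CL_BLOCK), an optional HPTT backend, and linear performance models used
// to select among and tune these paths.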

#ifdef USE_HPTT
#include <hptt.h>
#endif

#include "nosym_transp.h"
#include "../shared/util.h"
#include "../tensor/untyped_tensor.h"

namespace CTF_int {

  //static double init_ct_ps[] = {COST_LATENCY, 1.5*COST_MEMBW};
  LinModel<2> long_contig_transp_mdl(long_contig_transp_mdl_init,"long_contig_transp_mdl");
  LinModel<2> shrt_contig_transp_mdl(shrt_contig_transp_mdl_init,"shrt_contig_transp_mdl");
  LinModel<2> non_contig_transp_mdl(non_contig_transp_mdl_init,"non_contig_transp_mdl");


//#define OPT_NOSYM_TR
//#define CL_BLOCK

#ifdef OPT_NOSYM_TR
#ifndef CL_BLOCK

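  // nosym_transpose_tmp: recursive transpose kernel. Each template level
  // loops over one dimension, accumulating source/destination offsets from
  // the lda/new_lda strides; the <0> base case copies one line of
  // edge_len[0] elements with input stride lda[0] and output stride
  // new_lda[0].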
  template <int idim>
  void nosym_transpose_tmp(int const *      edge_len,
                           char const *     data,
                           char *           swap_data,
                           int64_t const *  lda,
                           int64_t const *  new_lda,
                           int64_t          off_old,
                           int64_t          off_new,
                           algstrct const * sr){
    for (int i=0; i<edge_len[idim]; i++){
      nosym_transpose_tmp<idim-1>(edge_len, data, swap_data, lda, new_lda, off_old+i*lda[idim], off_new+i*new_lda[idim], sr);
    }
  }

  template <>
  inline void nosym_transpose_tmp<0>(int const *      edge_len,
                                     char const *     data,
                                     char *           swap_data,
                                     int64_t const *  lda,
                                     int64_t const *  new_lda,
                                     int64_t          off_old,
                                     int64_t          off_new,
                                     algstrct const * sr){
    sr->copy(edge_len[0], data+sr->el_size*off_old, lda[0], swap_data+sr->el_size*off_new, new_lda[0]);
  }

  template
  void nosym_transpose_tmp<8>(int const *      edge_len,
                              char const *     data,
                              char *           swap_data,
                              int64_t const *  lda,
                              int64_t const *  new_lda,
                              int64_t          off_old,
                              int64_t          off_new,
                              algstrct const * sr);


#else
  #define CACHELINE 8
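  // nosym_transpose_inr: cache-blocked inner kernel for fixed-size elements.
  // Transposes CACHELINE x CACHELINE tiles between the dimension with old
  // stride 1 (edge_len[0]) and the dimension idim_new_lda1 that has new
  // stride 1; dir selects forward (old->new) or backward (new->old) copies.
  // The else branch handles a partial tile at the edge of idim_new_lda1,
  // and each branch finishes with the edge_len[0] % CACHELINE remainder.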
  template <typename dtyp, bool dir>
  void nosym_transpose_inr(int const *             edge_len,
                           dtyp const * __restrict data,
                           dtyp * __restrict       swap_data,
                           int                     idim_new_lda1,
                           int64_t const *         lda,
                           int64_t const *         new_lda,
                           int64_t                 off_old,
                           int64_t                 off_new,
                           int                     i_new_lda1){
    //int new_lda1_n = MIN(edge_len[idim_new_lda1]-i_new_lda1,CACHELINE);
    if (edge_len[idim_new_lda1] - i_new_lda1 >= CACHELINE){
      #pragma ivdep
      for (int i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
        #pragma unroll
        #pragma vector always nontemporal
        for (int j=0; j<CACHELINE; j++){
          #pragma unroll
          #pragma vector always nontemporal
          for (int k=0; k<CACHELINE; k++){
            if (dir){
              swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
            } else {
              swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
            }
          }
        }
      }
      {
        int m=edge_len[0]%CACHELINE;
        int i=edge_len[0]-m;
        #pragma unroll
        #pragma vector always nontemporal
        for (int j=0; j<CACHELINE; j++){
          #pragma unroll
          #pragma vector always nontemporal
          for (int k=0; k<m; k++){
            if (dir){
              swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
            } else {
              swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
            }
          }
        }
      }
    } else {
      int n = edge_len[idim_new_lda1] - i_new_lda1;
      #pragma ivdep
      for (int i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
        #pragma unroll
        #pragma vector always nontemporal
        for (int j=0; j<n; j++){
          #pragma unroll
          #pragma vector always nontemporal
          for (int k=0; k<CACHELINE; k++){
            if (dir){
              swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
            } else {
              swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
            }
          }
        }
      }
      {
        int m=edge_len[0]%CACHELINE;
        int i=edge_len[0]-m;
        #pragma unroll
        #pragma vector always nontemporal
        for (int j=0; j<n; j++){
          #pragma unroll
          #pragma vector always nontemporal
          for (int k=0; k<m; k++){
            if (dir){
              swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
            } else {
              swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
            }
          }
        }
      }
    }
  }

  template
  void nosym_transpose_inr<double,true>
                          (int const *               edge_len,
                           double const * __restrict data,
                           double * __restrict       swap_data,
                           int                       idim_new_lda1,
                           int64_t const *           lda,
                           int64_t const *           new_lda,
                           int64_t                   off_old,
                           int64_t                   off_new,
                           int                       i_new_lda1);

  template
  void nosym_transpose_inr<double,false>
                          (int const *               edge_len,
                           double const * __restrict data,
                           double * __restrict       swap_data,
                           int                       idim_new_lda1,
                           int64_t const *           lda,
                           int64_t const *           new_lda,
                           int64_t                   off_old,
                           int64_t                   off_new,
                           int                       i_new_lda1);

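  // nosym_transpose_opt: recursive driver for the blocked kernel. Along
  // idim_new_lda1 it advances in CACHELINE steps (tiling that dimension);
  // along every other dimension it advances one index at a time, as in
  // nosym_transpose_tmp. The <0> base case dispatches to memcpy when both
  // innermost strides are 1, to the typed double kernel when el_size == 8,
  // and otherwise to a buffered element-wise tile copy via sr->copy.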
  template <int idim>
  void nosym_transpose_opt(int const *      edge_len,
                           char const *     data,
                           char *           swap_data,
                           int              dir,
                           int              idim_new_lda1,
                           int64_t const *  lda,
                           int64_t const *  new_lda,
                           int64_t          off_old,
                           int64_t          off_new,
                           int              i_new_lda1,
                           algstrct const * sr){
    if (idim == idim_new_lda1){
      for (int i=0; i<edge_len[idim]; i+=CACHELINE){
        nosym_transpose_opt<idim-1>(edge_len, data, swap_data, dir, idim_new_lda1, lda, new_lda, off_old+i*lda[idim], off_new+i, i, sr);
      }
    } else {
      for (int i=0; i<edge_len[idim]; i++){
        nosym_transpose_opt<idim-1>(edge_len, data, swap_data, dir, idim_new_lda1, lda, new_lda, off_old+i*lda[idim], off_new+i*new_lda[idim], i_new_lda1, sr);
      }
    }
  }

  template <>
  inline void nosym_transpose_opt<0>(int const *      edge_len,
                                     char const *     data,
                                     char *           swap_data,
                                     int              dir,
                                     int              idim_new_lda1,
                                     int64_t const *  lda,
                                     int64_t const *  new_lda,
                                     int64_t          off_old,
                                     int64_t          off_new,
                                     int              i_new_lda1,
                                     algstrct const * sr){

    ASSERT(lda[0] == 1);
    if (idim_new_lda1 == 0){
      if (dir)
        memcpy(swap_data+sr->el_size*off_new, data+sr->el_size*off_old, sr->el_size*edge_len[0]);
      else
        memcpy(swap_data+sr->el_size*off_old, data+sr->el_size*off_new, sr->el_size*edge_len[0]);
      return;
    }

    if (sr->el_size == 8){
      if (dir){
        return nosym_transpose_inr<double,true>
                 (edge_len,
                  (double const*)data,
                  (double*)swap_data,
                  idim_new_lda1,
                  lda,
                  new_lda,
                  off_old,
                  off_new,
                  i_new_lda1);
      } else {
        return nosym_transpose_inr<double,false>
                 (edge_len,
                  (double const*)data,
                  (double*)swap_data,
                  idim_new_lda1,
                  lda,
                  new_lda,
                  off_old,
                  off_new,
                  i_new_lda1);
      }
    }
    {
      //FIXME: prealloc?
      char buf[sr->el_size*CACHELINE*CACHELINE];

      int new_lda1_n = MIN(edge_len[idim_new_lda1]-i_new_lda1,CACHELINE);
      if (dir) {
        for (int i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
          for (int j=0; j<new_lda1_n; j++){
            //printf("reading CLINE data from %ld stride 1 to %d stride %d\n",off_old+j*lda[idim_new_lda1]+i, j, new_lda1_n);
            sr->copy(CACHELINE, data+sr->el_size*(off_old+j*lda[idim_new_lda1]+i), 1, buf+sr->el_size*j, new_lda1_n);
          }
          for (int j=0; j<CACHELINE; j++){
            //printf("writing %d data from %ld stride 1 to %ld stride 1\n",new_lda1_n,(int64_t)j*new_lda1_n,off_new+(i+j)*new_lda[0]);
            sr->copy(new_lda1_n, buf+sr->el_size*j*new_lda1_n, 1, swap_data+sr->el_size*(off_new+(i+j)*new_lda[0]), 1);
          }
        }
        int lda1_n = edge_len[0]%CACHELINE;
        for (int j=0; j<new_lda1_n; j++){
          sr->copy(lda1_n, data+sr->el_size*(off_old+j*lda[idim_new_lda1]+edge_len[0]-lda1_n), 1, buf+sr->el_size*j, new_lda1_n);
        }
        for (int j=0; j<lda1_n; j++){
          sr->copy(new_lda1_n, buf+sr->el_size*j*new_lda1_n, 1, swap_data+sr->el_size*(off_new+(edge_len[0]-lda1_n+j)*new_lda[0]), 1);
        }
      } else {
        for (int i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
          for (int j=0; j<CACHELINE; j++){
            //printf("reading %d data[%d] to buf[%d]\n",new_lda1_n,off_new+(i+j)*new_lda[0], j);
            sr->copy(new_lda1_n, data+sr->el_size*(off_new+(i+j)*new_lda[0]), 1, buf+sr->el_size*j, CACHELINE);
          }
          for (int j=0; j<new_lda1_n; j++){
            //printf("writing %d buf[%d] to swap[%d]\n",CACHELINE,j*CACHELINE,off_old+j*lda[idim_new_lda1]+i);
            sr->copy(CACHELINE, buf+sr->el_size*j*CACHELINE, 1, swap_data+sr->el_size*(off_old+j*lda[idim_new_lda1]+i), 1);
          }
        }
        int lda1_n = edge_len[0]%CACHELINE;
        for (int j=0; j<lda1_n; j++){
          sr->copy(new_lda1_n, data+sr->el_size*(off_new+(edge_len[0]-lda1_n+j)*new_lda[0]), 1, buf+sr->el_size*j, lda1_n);
        }
        for (int j=0; j<new_lda1_n; j++){
          sr->copy(lda1_n, buf+sr->el_size*j*lda1_n, 1, swap_data+sr->el_size*(off_old+j*lda[idim_new_lda1]+edge_len[0]-lda1_n), 1);
        }

      }
    }
  }


  template
  void nosym_transpose_opt<8>(int const *      edge_len,
                              char const *     data,
                              char *           swap_data,
                              int              dir,
                              int              idim_new_lda1,
                              int64_t const *  lda,
                              int64_t const *  new_lda,
                              int64_t          off_old,
                              int64_t          off_new,
                              int              i_new_lda1,
                              algstrct const * sr);

#endif
#endif

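  // hptt_is_applicable: HPTT is used only when the permutation is nontrivial
  // and the element size matches one of the types HPTT supports here
  // (float, double, or double complex); without USE_HPTT it always says no.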
  bool hptt_is_applicable(int order, int const * new_order, int elementSize) {
#ifdef USE_HPTT
    bool is_diff = false;
    for (int i=0; i<order; i++){
      if (new_order[i] != i) is_diff = true;
    }

    return is_diff && (elementSize == sizeof(float) || elementSize == sizeof(double) || elementSize == sizeof(hptt::DoubleComplex));
#else
    return false;
#endif
  }

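  // nosym_transpose_hptt: transpose one dense block with HPTT. For dir == 0
  // (undoing a transpose) the permutation is inverted and the edge lengths
  // are permuted accordingly before the HPTT plan is built.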
  void nosym_transpose_hptt(int              order,
                            int const *      st_new_order,
                            int const *      st_edge_len,
                            int              dir,
                            char const *     st_buffer,
                            char *           new_buffer,
                            algstrct const * sr){
#ifdef USE_HPTT
    int new_order[order];
    int edge_len[order];
    if (dir){
      memcpy(new_order, st_new_order, order*sizeof(int));
      memcpy(edge_len, st_edge_len, order*sizeof(int));
    } else {
      for (int i=0; i<order; i++){
        new_order[st_new_order[i]] = i;
        edge_len[i] = st_edge_len[st_new_order[i]];
      }
    }
#ifdef USE_OMP
    int numThreads = MIN(24,omp_get_max_threads());
#else
    int numThreads = 1;
#endif

    // prepare perm and size for HPTT
    int perm[order];
    for (int i=0; i<order; ++i){
      perm[i] = new_order[i];
    }
    int size[order];
    int64_t total = 1;
    for (int i=0; i<order; ++i){
      size[i] = edge_len[i];
      total *= edge_len[i];
    }

    const int elementSize = sr->el_size;
    if (elementSize == sizeof(float)){
      auto plan = hptt::create_plan( perm, order,
                                     1.0, ((float*)st_buffer), size, NULL,
                                     0.0, ((float*)new_buffer), NULL,
                                     hptt::ESTIMATE, numThreads );
      if (nullptr != plan){
        plan->execute();
      } else
        fprintf(stderr, "ERROR in HPTT: plan == NULL\n");
    } else if (elementSize == sizeof(double)){
      auto plan = hptt::create_plan( perm, order,
                                     1.0, ((double*)st_buffer), size, NULL,
                                     0.0, ((double*)new_buffer), NULL,
                                     hptt::ESTIMATE, numThreads );
      if (nullptr != plan){
        plan->execute();
      } else
        fprintf(stderr, "ERROR in HPTT: plan == NULL\n");
    } else if (elementSize == sizeof(hptt::DoubleComplex)) {
      auto plan = hptt::create_plan( perm, order,
                                     hptt::DoubleComplex(1.0), ((hptt::DoubleComplex*)st_buffer), size, NULL,
                                     hptt::DoubleComplex(0.0), ((hptt::DoubleComplex*)new_buffer), NULL,
                                     hptt::ESTIMATE, numThreads );
      if (nullptr != plan){
        plan->execute();
      } else
        fprintf(stderr, "ERROR in HPTT: plan == NULL\n");
    } else {
      ABORT; //transpose_ref( size, perm, order, ((double*)st_buffer), 1.0, ((double*)new_buffer), 0.0);
    }
#endif
  }

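  // nosym_transpose(tensor*): transposes the local data of a (folded) tensor
  // in place, block by block over its nvirt_A virtual blocks. It uses HPTT
  // when applicable, otherwise the generic kernel below; the home-buffer
  // bookkeeping lets a tensor transpose out of and back into its home buffer
  // without an extra allocation.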
  void nosym_transpose(tensor *    A,
                       int         all_fdim_A,
                       int const * all_flen_A,
                       int const * new_order,
                       int         dir){

    bool is_diff = false;
    for (int i=0; i<all_fdim_A; i++){
      if (new_order[i] != i) is_diff = true;
    }
    if (!is_diff) return;

    TAU_FSTART(nosym_transpose);
    if (all_fdim_A == 0){
      TAU_FSTOP(nosym_transpose);
      return;
    }

    bool use_hptt = false;
#ifdef USE_HPTT
    use_hptt = hptt_is_applicable(all_fdim_A, new_order, A->sr->el_size);
#endif
    int nvirt_A = A->calc_nvirt();


#ifdef TUNE
    double st_time = MPI_Wtime();
    {
      // Check if we need to execute this function for the sake of training
      int64_t contig0 = 1;
      for (int i=0; i<all_fdim_A; i++){
        if (new_order[i] == i) contig0 *= all_flen_A[i];
        else break;
      }

      int64_t tot_sz = 1;
      for (int i=0; i<all_fdim_A; i++){
        tot_sz *= all_flen_A[i];
      }
      tot_sz *= nvirt_A;

      double tps[] = {0.0, 1.0, (double)tot_sz};
      bool should_run = true;
      if (contig0 < 4){
        should_run = non_contig_transp_mdl.should_observe(tps);
      } else if (contig0 <= 64){
        should_run = shrt_contig_transp_mdl.should_observe(tps);
      } else {
        should_run = long_contig_transp_mdl.should_observe(tps);
      }
      if (!should_run) return;
    }
#endif

    if (use_hptt){
      char * new_buffer;
      if (A->left_home_transp){
        assert(dir == 0);
        new_buffer = A->home_buffer;
      } else {
        new_buffer = A->sr->alloc(A->size);
      }
      for (int i=0; i<nvirt_A; i++){
        nosym_transpose_hptt(all_fdim_A, new_order, all_flen_A, dir,
                             A->data + A->sr->el_size*i*(A->size/nvirt_A),
                             new_buffer + A->sr->el_size*i*(A->size/nvirt_A), A->sr);
      }
      if (!A->has_home || !A->is_home){
        if (A->left_home_transp){
          A->is_home = true;
          A->left_home_transp = false;
          A->sr->dealloc(A->data);
          A->data = A->home_buffer;
        } else {
          A->sr->dealloc(A->data);
          A->data = new_buffer;
        }
      } else {
        A->is_home = false;
        A->left_home_transp = true;
        A->data = new_buffer;
      }
    } else {
      for (int i=0; i<nvirt_A; i++){
        nosym_transpose(all_fdim_A, new_order, all_flen_A,
                        A->data + A->sr->el_size*i*(A->size/nvirt_A), dir, A->sr);
      }
    }

#ifdef TUNE
    int64_t contig0 = 1;
    for (int i=0; i<all_fdim_A; i++){
      if (new_order[i] == i) contig0 *= all_flen_A[i];
      else break;
    }

    int64_t tot_sz = 1;
    for (int i=0; i<all_fdim_A; i++){
      tot_sz *= all_flen_A[i];
    }
    tot_sz *= nvirt_A;

    double exe_time = MPI_Wtime() - st_time;
    double tps[] = {exe_time, 1.0, (double)tot_sz};
    if (contig0 < 4){
      non_contig_transp_mdl.observe(tps);
    } else if (contig0 <= 64){
      shrt_contig_transp_mdl.observe(tps);
    } else {
      long_contig_transp_mdl.observe(tps);
    }
#endif
    TAU_FSTOP(nosym_transpose);
  }

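  // nosym_transpose(order,...): single-buffer entry point for the generic
  // kernel. It delegates to the threaded overload below, which leaves each
  // thread's transposed chunk in tswap_data[tid], then copies the chunks
  // back into data in thread order and frees the temporaries.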
  void nosym_transpose(int              order,
                       int const *      new_order,
                       int const *      edge_len,
                       char *           data,
                       int              dir,
                       algstrct const * sr){
    int64_t * chunk_size;
    char ** tswap_data;

  #ifdef USE_OMP
    int max_ntd = MIN(16,omp_get_max_threads());
    CTF_int::alloc_ptr(max_ntd*sizeof(char*), (void**)&tswap_data);
    CTF_int::alloc_ptr(max_ntd*sizeof(int64_t), (void**)&chunk_size);
    std::fill(chunk_size, chunk_size+max_ntd, 0);
  #else
    int max_ntd=1;
    CTF_int::alloc_ptr(sizeof(char*), (void**)&tswap_data);
    CTF_int::alloc_ptr(sizeof(int64_t), (void**)&chunk_size);
    /*printf("transposing");
      for (int id=0; id<order; id++){
        printf(" new_order[%d]=%d",id,new_order[id]);
      }
      printf("\n");*/
  #endif
    nosym_transpose(order, new_order, edge_len, data, dir, max_ntd, tswap_data, chunk_size, sr);
  #ifdef USE_OMP
    #pragma omp parallel num_threads(max_ntd)
  #endif
    {
      int tid;
  #ifdef USE_OMP
      tid = omp_get_thread_num();
  #else
      tid = 0;
  #endif
      int64_t thread_chunk_size = chunk_size[tid];
      int i;
      char * swap_data = tswap_data[tid];
      int64_t toff = 0;
      for (i=0; i<tid; i++) toff += chunk_size[i];
      if (thread_chunk_size > 0){
        memcpy(data+sr->el_size*(toff),swap_data,sr->el_size*thread_chunk_size);
      }
    }
    for (int i=0; i<max_ntd; i++) {
      int64_t thread_chunk_size = chunk_size[i];
      if (thread_chunk_size > 0)
        CTF_int::cdealloc(tswap_data[i]);
    }

    CTF_int::cdealloc(tswap_data);
    CTF_int::cdealloc(chunk_size);

    int64_t contig0 = 1;
    for (int i=0; i<order; i++){
      if (new_order[i] == i) contig0 *= edge_len[i];
      else break;
    }

    int64_t tot_sz = 1;
    for (int i=0; i<order; i++){
      tot_sz *= edge_len[i];
    }

  }

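  // Threaded transpose kernel: computes old (lda) and new (new_lda) strides,
  // takes the optimized recursive path for order <= 8 when OPT_NOSYM_TR is
  // enabled, and otherwise partitions the dimension with the largest stride
  // in the destination layout across threads, each thread filling its own
  // buffer tswap_data[tid] of chunk_size[tid] elements via an odometer-style
  // index loop.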
  void nosym_transpose(int              order,
                       int const *      new_order,
                       int const *      edge_len,
                       char const *     data,
                       int              dir,
                       int              max_ntd,
                       char **          tswap_data,
                       int64_t *        chunk_size,
                       algstrct const * sr){
    int64_t local_size;
    int64_t j, last_dim;
    int64_t * lda, * new_lda;

    TAU_FSTART(nosym_transpose_thr);
    CTF_int::alloc_ptr(order*sizeof(int64_t), (void**)&lda);
    CTF_int::alloc_ptr(order*sizeof(int64_t), (void**)&new_lda);
    if (dir){
      last_dim = new_order[order-1];
    } else {
      last_dim = order - 1;
    }
    // last_dim = order-1;

    lda[0] = 1;
    for (j=1; j<order; j++){
      lda[j] = lda[j-1]*edge_len[j-1];
    }

    local_size = lda[order-1]*edge_len[order-1];
    new_lda[new_order[0]] = 1;
    for (j=1; j<order; j++){
      new_lda[new_order[j]] = new_lda[new_order[j-1]]*edge_len[new_order[j-1]];
    }
    ASSERT(local_size == new_lda[new_order[order-1]]*edge_len[new_order[order-1]]);
#ifdef OPT_NOSYM_TR
    if (order <= 8){
      int idim_new_lda1 = new_order[0];
      CTF_int::alloc_ptr(local_size*sr->el_size, (void**)&tswap_data[0]);
      chunk_size[0] = local_size;
#ifdef CL_BLOCK
#define CASE_NTO(i)                                                                                   \
      case i:                                                                                         \
        nosym_transpose_opt<i-1>(edge_len,data,tswap_data[0],dir,idim_new_lda1,lda,new_lda,0,0,0,sr); \
        break;
#else
#define CASE_NTO(i)                                                                        \
      case i:                                                                              \
        if (dir) nosym_transpose_tmp<i-1>(edge_len,data,tswap_data[0],lda,new_lda,0,0,sr); \
        else     nosym_transpose_tmp<i-1>(edge_len,data,tswap_data[0],new_lda,lda,0,0,sr); \
        break;
#endif
      switch (order){
        CASE_NTO(1);
        CASE_NTO(2);
        CASE_NTO(3);
        CASE_NTO(4);
        CASE_NTO(5);
        CASE_NTO(6);
        CASE_NTO(7);
        CASE_NTO(8);
        default:
          ASSERT(0);
          break;
      }
      /*for (int64_t ii=0; ii<local_size; ii++){
        printf(" [%ld] ",ii);
        sr->print(tswap_data[0]+ii*sr->el_size);
      }
      printf("\n");*/
      CTF_int::cdealloc(lda);
      CTF_int::cdealloc(new_lda);
      TAU_FSTOP(nosym_transpose_thr);
      return;
    }
#endif

  #ifdef USE_OMP
    #pragma omp parallel num_threads(max_ntd)
  #endif
    {
      int64_t i, off_old, off_new, tid, ntd, last_max, toff_new, toff_old;
      int64_t tidx_off;
      int64_t thread_chunk_size;
      int64_t * idx;
      char * swap_data;
      CTF_int::alloc_ptr(order*sizeof(int64_t), (void**)&idx);
      memset(idx, 0, order*sizeof(int64_t));

  #ifdef USE_OMP
      tid = omp_get_thread_num();
      ntd = omp_get_num_threads();
  #else
      tid = 0;
      ntd = 1;
      thread_chunk_size = local_size;
  #endif
      last_max = 1;
      tidx_off = 0;
      off_old = 0;
      off_new = 0;
      toff_old = 0;
      toff_new = 0;
      if (order != 1){
        tidx_off = (edge_len[last_dim]/ntd)*tid;
        idx[last_dim] = tidx_off;
        last_max = (edge_len[last_dim]/ntd)*(tid+1);
        if (tid == ntd-1) last_max = edge_len[last_dim];
        off_old = idx[last_dim]*lda[last_dim];
        off_new = idx[last_dim]*new_lda[last_dim];
        toff_old = off_old;
        toff_new = off_new;
        // printf("%ld %ld %ld %ld %d\n", tid, ntd, idx[last_dim], last_max, edge_len[last_dim]);
        thread_chunk_size = (local_size*(last_max-tidx_off))/edge_len[last_dim];
      } else {
        thread_chunk_size = local_size;
        last_dim = 1;
      }
      chunk_size[tid] = 0;
      if (last_max != 0 && tidx_off != last_max && (order != 1 || tid == 0)){
        chunk_size[tid] = thread_chunk_size;
        if (thread_chunk_size <= 0)
          printf("ERROR thread_chunk_size = %ld, tid = %ld, local_size = %ld\n", thread_chunk_size, tid, local_size);
        CTF_int::alloc_ptr(thread_chunk_size*sr->el_size, (void**)&tswap_data[tid]);
        swap_data = tswap_data[tid];
        for (;;){
          if (last_dim != 0){
            if (dir) {
              sr->copy(edge_len[0], data+sr->el_size*(off_old), lda[0], swap_data+sr->el_size*(off_new-toff_new), new_lda[0]);
            } else
              sr->copy(edge_len[0], data+sr->el_size*(off_new), new_lda[0], swap_data+sr->el_size*(off_old-toff_old), lda[0]);

            idx[0] = 0;
          } else {
            if (dir)
              sr->copy(last_max-tidx_off, data+sr->el_size*(off_old), lda[0], swap_data+sr->el_size*(off_new-toff_new), new_lda[0]);
            else
              sr->copy(last_max-tidx_off, data+sr->el_size*(off_new), new_lda[0], swap_data+sr->el_size*(off_old-toff_old), lda[0]);
            /* printf("Wrote following values from");
               for (int asi=0; asi<lda[0]; asi++){
                 printf("\n %ld to %ld\n",(off_new)+asi,(off_old-toff_old)+asi*lda[0]);
                 sr->print(data+sr->el_size*(off_new+asi*lda[0]));
               }
               printf("\n");*/
            idx[0] = tidx_off;
          }

          // odometer increment over the remaining dimensions
          for (i=1; i<order; i++){
            off_old -= idx[i]*lda[i];
            off_new -= idx[i]*new_lda[i];
            if (i == last_dim){
              idx[i] = (idx[i] == last_max-1 ? tidx_off : idx[i]+1);
              off_old += idx[i]*lda[i];
              off_new += idx[i]*new_lda[i];
              if (idx[i] != tidx_off) break;
            } else {
              idx[i] = (idx[i] == edge_len[i]-1 ? 0 : idx[i]+1);
              off_old += idx[i]*lda[i];
              off_new += idx[i]*new_lda[i];
              if (idx[i] != 0) break;
            }
          }
          if (i==order) break;
        }
      }
      CTF_int::cdealloc(idx);
    }
    CTF_int::cdealloc(lda);
    CTF_int::cdealloc(new_lda);
    TAU_FSTOP(nosym_transpose_thr);
  }

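  // est_time_transp: estimates transpose time from one of three LinModel<2>
  // models (a fixed-cost term plus a per-element term over the feature
  // vector {1.0, tot_sz}), selected by contig0, the run of leading
  // dimensions that the permutation leaves in place.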
  double est_time_transp(int              order,
                         int const *      new_order,
                         int const *      edge_len,
                         int              dir,
                         algstrct const * sr){
    if (order == 0) return 0.0;
    int64_t contig0 = 1;
    for (int i=0; i<order; i++){
      if (new_order[i] == i) contig0 *= edge_len[i];
      else break;
    }

    int64_t tot_sz = 1;
    for (int i=0; i<order; i++){
      tot_sz *= edge_len[i];
    }

    // if nothing is permuted, the transpose is free
    if (contig0==tot_sz) return 0.0;

    // if the number of elements copied in the innermost loop is small, the
    // overhead term dominates the bandwidth cost of the transpose; otherwise
    // the cost is close to linear in the data size (this model ignores
    // cache-line size)
    double ps[] = {1.0, (double)tot_sz};
    if (contig0 < 4){
      return non_contig_transp_mdl.est_time(ps);
    } else if (contig0 <= 64){
      return shrt_contig_transp_mdl.est_time(ps);
    } else {
      return long_contig_transp_mdl.est_time(ps);
    }
  }

}