#include "../shared/util.h"
#include "../tensor/untyped_tensor.h"

// Performance models for long-, short-, and non-contiguous transposes (used below).
LinModel<2> long_contig_transp_mdl(long_contig_transp_mdl_init,"long_contig_transp_mdl");
LinModel<2> shrt_contig_transp_mdl(shrt_contig_transp_mdl_init,"shrt_contig_transp_mdl");
LinModel<2> non_contig_transp_mdl(non_contig_transp_mdl_init,"non_contig_transp_mdl");

// Recursively peel one dimension per template level, accumulating old/new offsets.
template <int idim>
void nosym_transpose_tmp(int const * edge_len, char const * data, char * swap_data,
                         int64_t const * lda, int64_t const * new_lda,
                         int64_t off_old, int64_t off_new, algstrct const * sr){
  for (int i=0; i<edge_len[idim]; i++){
    nosym_transpose_tmp<idim-1>(edge_len, data, swap_data, lda, new_lda, off_old+i*lda[idim], off_new+i*new_lda[idim], sr);
  }
}
// Base case: copy one strided pencil of edge_len[0] elements.
template <>
inline void nosym_transpose_tmp<0>(int const * edge_len, char const * data, char * swap_data,
                                   int64_t const * lda, int64_t const * new_lda,
                                   int64_t off_old, int64_t off_new, algstrct const * sr){
  sr->copy(edge_len[0], data+sr->el_size*off_old, lda[0], swap_data+sr->el_size*off_new, new_lda[0]);
}
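// Illustrative example (not part of the original source): for a plain matrix
// transpose, order = 2, edge_len = {n0, n1}, new_order = {1, 0}, the strides
// are lda = {1, n0} and new_lda = {n1, 1}. The <1> level loops i over n1 with
// off_old = i*n0 and off_new = i, and the base case above then copies the n0
// contiguous elements at old offsets i*n0 + k (stride lda[0] = 1) into new
// offsets i + k*n1 (stride new_lda[0] = n1), i.e. rows and columns are exchanged.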
// Explicit instantiation for up to 8 dimensions.
template
void nosym_transpose_tmp<8>(int const * edge_len, char const * data, char * swap_data,
                            int64_t const * lda, int64_t const * new_lda,
                            int64_t off_old, int64_t off_new, algstrct const * sr);
// Cache-blocked transpose kernel over the two fastest-varying dimensions;
// dir selects forward (old->new) or backward (new->old) copies.
template <typename dtyp, bool dir>
void nosym_transpose_inr(int const *             edge_len,
                         dtyp const * __restrict data,
                         dtyp * __restrict       swap_data,
                         int                     idim_new_lda1,
                         int64_t const *         lda,
                         int64_t const *         new_lda,
                         int64_t                 off_old,
                         int64_t                 off_new,
                         int                     i_new_lda1){
  if (edge_len[idim_new_lda1] - i_new_lda1 >= CACHELINE){
    // Full CACHELINE x CACHELINE tiles along the fastest old/new dimensions.
    int i;
    for (i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
      #pragma vector always nontemporal
      for (int j=0; j<CACHELINE; j++){
        #pragma vector always nontemporal
        for (int k=0; k<CACHELINE; k++){
          if (dir) swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
          else     swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
        }
      }
    }
    // Remainder when edge_len[0] is not a multiple of CACHELINE.
    int m = edge_len[0]%CACHELINE;
    if (m > 0){
      #pragma vector always nontemporal
      for (int j=0; j<CACHELINE; j++){
        #pragma vector always nontemporal
        for (int k=0; k<m; k++){
          if (dir) swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
          else     swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
        }
      }
    }
  } else {
    // Partial tile: fewer than CACHELINE rows remain along idim_new_lda1.
    int n = edge_len[idim_new_lda1] - i_new_lda1;
    int i;
    for (i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
      #pragma vector always nontemporal
      for (int j=0; j<n; j++){
        #pragma vector always nontemporal
        for (int k=0; k<CACHELINE; k++){
          if (dir) swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
          else     swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
        }
      }
    }
    int m = edge_len[0]%CACHELINE;
    if (m > 0){
      #pragma vector always nontemporal
      for (int j=0; j<n; j++){
        #pragma vector always nontemporal
        for (int k=0; k<m; k++){
          if (dir) swap_data[off_new+(i+k)*new_lda[0]+j] = data[off_old+j*lda[idim_new_lda1]+i+k];
          else     swap_data[off_old+j*lda[idim_new_lda1]+i+k] = data[off_new+(i+k)*new_lda[0]+j];
        }
      }
    }
  }
}
// Explicit instantiations for 8-byte (double) elements, forward and backward.
template
void nosym_transpose_inr<double,true>(int const * edge_len, double const * __restrict data,
                                      double * __restrict swap_data, int idim_new_lda1,
                                      int64_t const * lda, int64_t const * new_lda,
                                      int64_t off_old, int64_t off_new, int i_new_lda1);
template
void nosym_transpose_inr<double,false>(int const * edge_len, double const * __restrict data,
                                       double * __restrict swap_data, int idim_new_lda1,
                                       int64_t const * lda, int64_t const * new_lda,
                                       int64_t off_old, int64_t off_new, int i_new_lda1);
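// Minimal illustrative sketch (not part of the original source, and the names
// blocked_transpose and BLK are made up for this example) of the cache-blocking
// idea used by nosym_transpose_inr above: copying BLK x BLK tiles keeps both
// sides of each assignment within a few cache lines, instead of striding
// through memory on one side for every element. A is a row-major n x m matrix.
//
//   void blocked_transpose(double const * A, double * B, int n, int m){
//     const int BLK = 16;                       // plays the role of CACHELINE
//     for (int ib=0; ib<n; ib+=BLK){
//       for (int jb=0; jb<m; jb+=BLK){
//         for (int i=ib; i<ib+BLK && i<n; i++){
//           for (int j=jb; j<jb+BLK && j<m; j++){
//             B[j*n + i] = A[i*m + j];          // B = A^T
//           }
//         }
//       }
//     }
//   }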
// Blocked recursive transpose: recurse over dimension idim, stepping by
// CACHELINE along the dimension that becomes the new fastest index.
template <int idim>
void nosym_transpose_opt(int const * edge_len, char const * data, char * swap_data,
                         int dir, int idim_new_lda1,
                         int64_t const * lda, int64_t const * new_lda,
                         int64_t off_old, int64_t off_new,
                         int i_new_lda1, algstrct const * sr){
  if (idim == idim_new_lda1){
    for (int i=0; i<edge_len[idim]; i+=CACHELINE){
      nosym_transpose_opt<idim-1>(edge_len, data, swap_data, dir, idim_new_lda1, lda, new_lda, off_old+i*lda[idim], off_new+i, i, sr);
    }
  } else {
    for (int i=0; i<edge_len[idim]; i++){
      nosym_transpose_opt<idim-1>(edge_len, data, swap_data, dir, idim_new_lda1, lda, new_lda, off_old+i*lda[idim], off_new+i*new_lda[idim], i_new_lda1, sr);
    }
  }
}
// Base case: handle the two fastest dimensions with contiguous or blocked copies.
template <>
inline void nosym_transpose_opt<0>(int const * edge_len, char const * data, char * swap_data,
                                   int dir, int idim_new_lda1,
                                   int64_t const * lda, int64_t const * new_lda,
                                   int64_t off_old, int64_t off_new,
                                   int i_new_lda1, algstrct const * sr){
  if (idim_new_lda1 == 0){
    // Fastest dimension is unchanged: a plain contiguous copy suffices.
    if (dir) memcpy(swap_data+sr->el_size*off_new, data+sr->el_size*off_old, sr->el_size*edge_len[0]);
    else     memcpy(swap_data+sr->el_size*off_old, data+sr->el_size*off_new, sr->el_size*edge_len[0]);
    return;
  }
  if (sr->el_size == 8){
    // 8-byte elements: dispatch to the typed cache-blocked kernel.
    if (dir)
      return nosym_transpose_inr<double,true>(edge_len, (double const*)data, (double*)swap_data, idim_new_lda1, lda, new_lda, off_old, off_new, i_new_lda1);
    else
      return nosym_transpose_inr<double,false>(edge_len, (double const*)data, (double*)swap_data, idim_new_lda1, lda, new_lda, off_old, off_new, i_new_lda1);
  }
  // Generic fallback: stage CACHELINE x CACHELINE tiles through a local buffer.
  char buf[sr->el_size*CACHELINE*CACHELINE];
  int new_lda1_n = MIN(edge_len[idim_new_lda1]-i_new_lda1, CACHELINE);
  if (dir){
    // Forward: gather strided columns into buf, then write rows contiguously.
    for (int i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
      for (int j=0; j<new_lda1_n; j++){
        sr->copy(CACHELINE, data+sr->el_size*(off_old+j*lda[idim_new_lda1]+i), 1, buf+sr->el_size*j, new_lda1_n);
      }
      for (int j=0; j<CACHELINE; j++){
        sr->copy(new_lda1_n, buf+sr->el_size*j*new_lda1_n, 1, swap_data+sr->el_size*(off_new+(i+j)*new_lda[0]), 1);
      }
    }
    // Remainder columns when edge_len[0] is not a multiple of CACHELINE.
    int lda1_n = edge_len[0]%CACHELINE;
    for (int j=0; j<new_lda1_n; j++){
      sr->copy(lda1_n, data+sr->el_size*(off_old+j*lda[idim_new_lda1]+edge_len[0]-lda1_n), 1, buf+sr->el_size*j, new_lda1_n);
    }
    for (int j=0; j<lda1_n; j++){
      sr->copy(new_lda1_n, buf+sr->el_size*j*new_lda1_n, 1, swap_data+sr->el_size*(off_new+(edge_len[0]-lda1_n+j)*new_lda[0]), 1);
    }
  } else {
    // Backward: gather contiguous rows into buf, then write strided columns.
    for (int i=0; i<edge_len[0]-CACHELINE+1; i+=CACHELINE){
      for (int j=0; j<CACHELINE; j++){
        sr->copy(new_lda1_n, data+sr->el_size*(off_new+(i+j)*new_lda[0]), 1, buf+sr->el_size*j, CACHELINE);
      }
      for (int j=0; j<new_lda1_n; j++){
        sr->copy(CACHELINE, buf+sr->el_size*j*CACHELINE, 1, swap_data+sr->el_size*(off_old+j*lda[idim_new_lda1]+i), 1);
      }
    }
    // Remainder rows when edge_len[0] is not a multiple of CACHELINE.
    int lda1_n = edge_len[0]%CACHELINE;
    for (int j=0; j<lda1_n; j++){
      sr->copy(new_lda1_n, data+sr->el_size*(off_new+(edge_len[0]-lda1_n+j)*new_lda[0]), 1, buf+sr->el_size*j, lda1_n);
    }
    for (int j=0; j<new_lda1_n; j++){
      sr->copy(lda1_n, buf+sr->el_size*j*lda1_n, 1, swap_data+sr->el_size*(off_old+j*lda[idim_new_lda1]+edge_len[0]-lda1_n), 1);
    }
  }
}
// Explicit instantiation for up to 8 dimensions.
template
void nosym_transpose_opt<8>(int const * edge_len, char const * data, char * swap_data,
                            int dir, int idim_new_lda1,
                            int64_t const * lda, int64_t const * new_lda,
                            int64_t off_old, int64_t off_new,
                            int i_new_lda1, algstrct const * sr);
// Checks whether the HPTT library is applicable to this permutation and element size.
bool hptt_is_applicable(int order, int const * new_order, int elementSize){
  bool is_diff = false;
  for (int i=0; i<order; i++){
    if (new_order[i] != i) is_diff = true;
  }
  return is_diff && (elementSize == sizeof(float)  ||
                     elementSize == sizeof(double) ||
                     elementSize == sizeof(hptt::DoubleComplex));
}
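// Usage note (illustrative, not part of the original source): HPTT is only
// worth dispatching to when the permutation actually moves data and the
// element type is one HPTT supports, e.g.
//   int new_order[3] = {1, 2, 0};
//   hptt_is_applicable(3, new_order, sizeof(double));   // true
//   int id_order[3]  = {0, 1, 2};
//   hptt_is_applicable(3, id_order, sizeof(double));    // false (identity permutation)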
// Dispatch the transpose to HPTT (High-Performance Tensor Transposition).
void nosym_transpose_hptt(int              order,
                          int const *      st_new_order,
                          int const *      st_edge_len,
                          int              dir,
                          char const *     st_buffer,
                          char *           new_buffer,
                          algstrct const * sr){
  int new_order[order];
  int edge_len[order];
  if (dir){
    memcpy(new_order, st_new_order, order*sizeof(int));
    memcpy(edge_len, st_edge_len, order*sizeof(int));
  } else {
    // Backward direction: invert the permutation and permute the edge lengths.
    for (int i=0; i<order; i++){
      new_order[st_new_order[i]] = i;
      edge_len[i] = st_edge_len[st_new_order[i]];
    }
  }
  int numThreads = MIN(24, omp_get_max_threads());
  int perm[order];
  int size[order];
  int64_t total = 1;
  for (int i=0; i<order; ++i){
    perm[i] = new_order[i];
  }
  for (int i=0; i<order; ++i){
    size[i] = edge_len[i];
    total  *= edge_len[i];
  }
  const int elementSize = sr->el_size;
  if (elementSize == sizeof(float)){
    auto plan = hptt::create_plan(perm, order,
                                  1.0, ((float*)st_buffer), size, NULL,
                                  0.0, ((float*)new_buffer), NULL,
                                  hptt::ESTIMATE, numThreads);
    if (nullptr != plan){
      plan->execute();
    } else {
      fprintf(stderr, "ERROR in HPTT: plan == NULL\n");
    }
  } else if (elementSize == sizeof(double)){
    auto plan = hptt::create_plan(perm, order,
                                  1.0, ((double*)st_buffer), size, NULL,
                                  0.0, ((double*)new_buffer), NULL,
                                  hptt::ESTIMATE, numThreads);
    if (nullptr != plan){
      plan->execute();
    } else {
      fprintf(stderr, "ERROR in HPTT: plan == NULL\n");
    }
  } else if (elementSize == sizeof(hptt::DoubleComplex)){
    auto plan = hptt::create_plan(perm, order,
                                  hptt::DoubleComplex(1.0), ((hptt::DoubleComplex*)st_buffer), size, NULL,
                                  hptt::DoubleComplex(0.0), ((hptt::DoubleComplex*)new_buffer), NULL,
                                  hptt::ESTIMATE, numThreads);
    if (nullptr != plan){
      plan->execute();
    } else {
      fprintf(stderr, "ERROR in HPTT: plan == NULL\n");
    }
  }
}
// Transpose the folded (non-symmetric) tensor A in place according to new_order.
void nosym_transpose(tensor *    A,
                     int         all_fdim_A,
                     int const * all_flen_A,
                     int const * new_order,
                     int         dir){
  bool is_diff = false;
  for (int i=0; i<all_fdim_A; i++){
    if (new_order[i] != i) is_diff = true;
  }
  // Identity permutation: nothing to do.
  if (!is_diff) return;
  if (all_fdim_A == 0){
    /* ... 0-dimensional case (body elided in this listing) ... */
  }
  bool use_hptt = false;
  /* ... when built with HPTT support, use_hptt is set via hptt_is_applicable
         (elided in this listing) ... */
  double st_time = MPI_Wtime();
  int64_t contig0 = 1;
  for (int i=0; i<all_fdim_A; i++){
    if (new_order[i] == i) contig0 *= all_flen_A[i];
  }
  int64_t tot_sz = 1;
  for (int i=0; i<all_fdim_A; i++){
    tot_sz *= all_flen_A[i];
  }
  // Ask the matching performance model whether this run should be timed.
  bool should_run = true;
  {
    double tps[] = {0.0, 1.0, (double)tot_sz};
    if (contig0 < 4){
      should_run = non_contig_transp_mdl.should_observe(tps);
    } else if (contig0 <= 64){
      should_run = shrt_contig_transp_mdl.should_observe(tps);
    } else {
      should_run = long_contig_transp_mdl.should_observe(tps);
    }
  }
  if (!should_run) return;
  // Transpose proceeds one virtual block at a time, either through HPTT or
  // through the internal kernels; allocation and home-buffer bookkeeping are
  // largely elided in this listing.
  int64_t nvirt_A = A->calc_nvirt();
  char * new_buffer = A->sr->alloc(A->size);
  for (int i=0; i<nvirt_A; i++){
    /* ... HPTT transpose of virtual block i of A->data into new_buffer ... */
  }
  // In either the home-buffer or the non-home branch the transposed buffer
  // then becomes the tensor's data:
  A->data = new_buffer;
  for (int i=0; i<nvirt_A; i++){
    /* ... internal blocked-kernel transpose of virtual block i ... */
  }
  // Re-derive the permutation characteristics and record the observed time
  // in the matching performance model.
  contig0 = 1;
  for (int i=0; i<all_fdim_A; i++){
    if (new_order[i] == i) contig0 *= all_flen_A[i];
  }
  tot_sz = 1;
  for (int i=0; i<all_fdim_A; i++){
    tot_sz *= all_flen_A[i];
  }
  double exe_time = MPI_Wtime() - st_time;
  double tps[] = {exe_time, 1.0, (double)tot_sz};
  if (contig0 < 4){
    non_contig_transp_mdl.observe(tps);
  } else if (contig0 <= 64){
    shrt_contig_transp_mdl.observe(tps);
  } else {
    long_contig_transp_mdl.observe(tps);
  }
}
// Threaded transpose of a dense (folded) array: each thread produces its own
// contiguous chunk, which is then copied back into data.
void nosym_transpose(int              order,
                     int const *      new_order,
                     int const *      edge_len,
                     char *           data,
                     int              dir,
                     algstrct const * sr){
  char **   tswap_data;
  int64_t * chunk_size;
  int max_ntd = MIN(16, omp_get_max_threads());
  /* ... allocate tswap_data and chunk_size for max_ntd threads (elided) ... */
  std::fill(chunk_size, chunk_size+max_ntd, 0);
  nosym_transpose(order, new_order, edge_len, data, dir, max_ntd, tswap_data, chunk_size, sr);
  #pragma omp parallel num_threads(max_ntd)
  {
    int tid = omp_get_thread_num();
    int64_t thread_chunk_size = chunk_size[tid];
    int64_t toff = 0;
    char * swap_data = tswap_data[tid];
    for (int i=0; i<tid; i++) toff += chunk_size[i];
    if (thread_chunk_size > 0){
      memcpy(data+sr->el_size*(toff), swap_data, sr->el_size*thread_chunk_size);
    }
  }
  // Free each thread's swap buffer.
  for (int i=0; i<max_ntd; i++){
    int64_t thread_chunk_size = chunk_size[i];
    if (thread_chunk_size > 0)
      cdealloc(tswap_data[i]);
  }
  /* ... free tswap_data and chunk_size (elided) ... */
  // Characterize the permutation for the performance models (observe elided).
  int64_t contig0 = 1;
  for (int i=0; i<order; i++){
    if (new_order[i] == i) contig0 *= edge_len[i];
  }
  int64_t tot_sz = 1;
  for (int i=0; i<order; i++){
    tot_sz *= edge_len[i];
  }
}
// Worker overload: computes old/new strides, then transposes either with the
// blocked kernels on thread 0 or with the threaded strided-copy loop below.
void nosym_transpose(int order, int const * new_order, int const * edge_len,
                     char const * data, int dir, int max_ntd,
                     char ** tswap_data, int64_t * chunk_size, algstrct const * sr){
  int64_t j, local_size, last_dim;
  int64_t * lda, * new_lda;
  // The dimension whose range is split across threads.
  if (dir) last_dim = new_order[order-1];
  else     last_dim = order - 1;
  /* ... allocate lda and new_lda of length order (elided) ... */
  lda[0] = 1;
  for (j=1; j<order; j++){
    lda[j] = lda[j-1]*edge_len[j-1];
  }
  local_size = lda[order-1]*edge_len[order-1];
  new_lda[new_order[0]] = 1;
  for (j=1; j<order; j++){
    new_lda[new_order[j]] = new_lda[new_order[j-1]]*edge_len[new_order[j-1]];
  }
  ASSERT(local_size == new_lda[new_order[order-1]]*edge_len[new_order[order-1]]);
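  // Illustrative example (not part of the original source): with order = 3,
  // edge_len = {n0, n1, n2} and new_order = {2, 0, 1} (old dimension 2 becomes
  // the fastest index of the output), the loops above give
  //   lda     = {1, n0, n0*n1}       (input strides)
  //   new_lda = {n2, n2*n0, 1}       (output strides, indexed by old dimension)
  // so the element with old multi-index (i0,i1,i2) moves from offset
  // i0 + i1*n0 + i2*n0*n1 to offset i2 + i0*n2 + i1*n2*n0.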
  // One path does the whole transpose on thread 0 with the blocked kernel;
  // CASE_NTO(i) is expanded once per supported tensor order i.
  int idim_new_lda1 = new_order[0];
  /* ... allocate tswap_data[0] of local_size elements (elided) ... */
  chunk_size[0] = local_size;
  #define CASE_NTO(i) \
    nosym_transpose_opt<i-1>(edge_len,data,tswap_data[0],dir,idim_new_lda1,lda,new_lda,0,0,0,sr);
  /* ... dispatch on order via CASE_NTO (elided) ... */
  #undef CASE_NTO
  // Alternative path: the simple recursive kernel, reading old->new when dir
  // is set and new->old otherwise.
  #define CASE_NTO(i) \
    if (dir) nosym_transpose_tmp<i-1>(edge_len,data,tswap_data[0],lda,new_lda,0,0,sr); \
    else     nosym_transpose_tmp<i-1>(edge_len,data,tswap_data[0],new_lda,lda,0,0,sr);
  /* ... dispatch on order via CASE_NTO (elided) ... */
  // General threaded path: split the work across threads along last_dim.
  #pragma omp parallel num_threads(max_ntd)
  {
    int64_t i, off_old, off_new, tid, ntd, last_max, toff_new, toff_old;
    int64_t thread_chunk_size;
    int64_t tidx_off;
    int64_t * idx;
    char * swap_data;
    /* ... allocate idx of length order (elided) ... */
    memset(idx, 0, order*sizeof(int64_t));
    tid = omp_get_thread_num();
    ntd = omp_get_num_threads();
    thread_chunk_size = local_size;
    // Split the slowest dimension last_dim evenly across the ntd threads
    // (branch structure partly elided in this listing).
    tidx_off = (edge_len[last_dim]/ntd)*tid;
    idx[last_dim] = tidx_off;
    last_max = (edge_len[last_dim]/ntd)*(tid+1);
    if (tid == ntd-1) last_max = edge_len[last_dim];
    off_old = idx[last_dim]*lda[last_dim];
    off_new = idx[last_dim]*new_lda[last_dim];
    /* ... toff_old/toff_new bookkeeping (elided) ... */
    thread_chunk_size = (local_size*(last_max-tidx_off))/edge_len[last_dim];
    /* ... otherwise a single slice keeps the full volume: ... */ thread_chunk_size = local_size;
    // Threads whose slice is empty (or beyond thread 0 when order == 1) do no work.
    if (last_max != 0 && tidx_off != last_max && (order != 1 || tid == 0)){
      chunk_size[tid] = thread_chunk_size;
      if (thread_chunk_size <= 0)
        printf("ERRORR thread_chunk_size = %ld, tid = %ld, local_size = %ld\n",
               thread_chunk_size, tid, local_size);
      /* ... allocate tswap_data[tid] of thread_chunk_size elements (elided) ... */
      swap_data = tswap_data[tid];
      // Odometer loop over all indices above dimension 0: each iteration copies
      // one strided pencil of the fastest dimension.
      for (;;){
        if (order != 1){
          if (dir)
            sr->copy(edge_len[0], data+sr->el_size*(off_old), lda[0], swap_data+sr->el_size*(off_new-toff_new), new_lda[0]);
          else
            sr->copy(edge_len[0], data+sr->el_size*(off_new), new_lda[0], swap_data+sr->el_size*(off_old-toff_old), lda[0]);
        } else {
          // order == 1: only this thread's slice of the single dimension is copied.
          if (dir)
            sr->copy(last_max-tidx_off, data+sr->el_size*(off_old), lda[0], swap_data+sr->el_size*(off_new-toff_new), new_lda[0]);
          else
            sr->copy(last_max-tidx_off, data+sr->el_size*(off_new), new_lda[0], swap_data+sr->el_size*(off_old-toff_old), lda[0]);
        }
        // Advance the multi-index odometer over dimensions 1..order-1,
        // updating both offsets incrementally.
        for (i=1; i<order; i++){
          off_old -= idx[i]*lda[i];
          off_new -= idx[i]*new_lda[i];
          if (i == last_dim){
            // The split dimension wraps within this thread's [tidx_off, last_max) slice.
            idx[i] = (idx[i] == last_max-1 ? tidx_off : idx[i]+1);
            off_old += idx[i]*lda[i];
            off_new += idx[i]*new_lda[i];
            if (idx[i] != tidx_off) break;
          } else {
            idx[i] = (idx[i] == edge_len[i]-1 ? 0 : idx[i]+1);
            off_old += idx[i]*lda[i];
            off_new += idx[i]*new_lda[i];
            if (idx[i] != 0) break;
          }
        }
        // All indices wrapped: this thread's slice is complete.
        if (i == order) break;
      }
    }
  }
  /* ... free idx, lda, new_lda (elided) ... */
}
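// Illustrative sketch (not part of the original source; odometer_next is a
// made-up name) of the odometer-style index update used in the loop above:
// each dimension's index advances in turn and carries into the next when it
// wraps, so the offsets can be updated incrementally rather than recomputed
// from scratch for every pencil.
//
//   // returns false once every index has wrapped (i.e. iteration is done)
//   bool odometer_next(int64_t * idx, int const * len, int order){
//     for (int i=1; i<order; i++){
//       idx[i] = (idx[i] == len[i]-1 ? 0 : idx[i]+1);
//       if (idx[i] != 0) return true;   // no carry: stop here
//     }
//     return false;
//   }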
// Estimate the time to transpose a non-symmetric (folded) tensor from the
// fitted performance models.
double est_time_transp(int              order,
                       int const *      new_order,
                       int const *      edge_len,
                       int              dir,
                       algstrct const * sr){
  if (order == 0) return 0.0;
  int64_t contig0 = 1;
  for (int i=0; i<order; i++){
    if (new_order[i] == i) contig0 *= edge_len[i];
  }
  int64_t tot_sz = 1;
  for (int i=0; i<order; i++){
    tot_sz *= edge_len[i];
  }
  // A permutation that leaves everything in place costs nothing.
  if (contig0 == tot_sz) return 0.0;
  double ps[] = {1.0, (double)tot_sz};
  if (contig0 < 4){
    return non_contig_transp_mdl.est_time(ps);
  } else if (contig0 <= 64){
    return shrt_contig_transp_mdl.est_time(ps);
  } else {
    return long_contig_transp_mdl.est_time(ps);
  }
}
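// Illustrative reference implementation (not part of the original source;
// reference_transpose is a made-up name), useful for checking the mapping the
// kernels above implement: with old strides lda and transposed strides new_lda
// built exactly as in the worker overload, every element moves from
// sum_i idx[i]*lda[i] to sum_i idx[i]*new_lda[i]. Assumes order >= 1.
//
//   #include <cstdint>
//   #include <vector>
//
//   void reference_transpose(double const * in, double * out,
//                            int const * edge_len, int const * new_order, int order){
//     std::vector<int64_t> lda(order), new_lda(order), idx(order, 0);
//     lda[0] = 1;
//     for (int j=1; j<order; j++) lda[j] = lda[j-1]*edge_len[j-1];
//     new_lda[new_order[0]] = 1;
//     for (int j=1; j<order; j++)
//       new_lda[new_order[j]] = new_lda[new_order[j-1]]*edge_len[new_order[j-1]];
//     int64_t tot = lda[order-1]*edge_len[order-1];
//     for (int64_t e=0; e<tot; e++){
//       int64_t off_old = 0, off_new = 0;
//       for (int j=0; j<order; j++){ off_old += idx[j]*lda[j]; off_new += idx[j]*new_lda[j]; }
//       out[off_new] = in[off_old];
//       for (int j=0; j<order; j++){              // odometer increment
//         idx[j] = (idx[j]+1) % edge_len[j];
//         if (idx[j] != 0) break;
//       }
//     }
//   }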