4 #include "../shared/util.h" 5 #include "../mapping/mapping.h" 38 for (
int i=0; i<
order; i++){
55 memcpy(
lens, lens_, order_*
sizeof(
int));
61 int stride = 1, cut = 0;
63 for (
int i=0; i<
order; i++){
66 (((rank/(stride*
lens[i]))*stride)+cut),
70 cut = (rank - (rank/stride)*stride);
78 for (
int i=0; i<
order; i++){
87 for (
int i=0; i<
order; i++){
103 topo =
new topology(1, dl, glb_comm, 1);
110 topo =
new topology(order, dim_len, glb_comm, 1);
123 topo_dims[0] = hw.Size[0];
124 topo_dims[1] = hw.Size[1];
125 topo_dims[2] = hw.Size[2];
126 topo_dims[3] = hw.Size[3];
127 topo_dims[4] = hw.Size[4];
128 topo_dims[5] =
MIN(4, np/(topo_dims[0]*topo_dims[1]*
129 topo_dims[2]*topo_dims[3]*
131 topo_dims[6] = (np/ (topo_dims[0]*topo_dims[1]*
132 topo_dims[2]*topo_dims[3]*
136 if (topo_dims[i] > 1){
137 dl[
dim] = topo_dims[i];
141 topo =
new topology(dim, topo_dims, glb_comm, 1);
149 topo =
new topology(order, dim_len, glb_comm, 1);
155 if (1<<(
int)log2(np) != np){
157 topo =
new topology(order, dim_len, glb_comm, 1);
161 if ((
int)log2(np) == 0) order = 0;
162 else if ((
int)log2(np) <= 2) order = 1;
163 else if ((
int)log2(np) <= 4) order = 2;
166 switch ((
int)log2(np)){
242 topo =
new topology(order, dim_len, glb_comm, 1);
248 if (1<<(
int)log2(np) != np){
250 topo =
new topology(order, dim_len, glb_comm, 1);
254 order =
MIN((
int)log2(np),8);
258 switch ((
int)log2(np)){
388 topo =
new topology(order, dim_len, glb_comm, 1);
395 topo =
new topology(order, dim_len, glb_comm, 1);
410 std::vector< topology* >
get_all_topos(
CommData cdt,
int n_uf,
int const * uniq_fact,
int const * mults,
int n_prepend,
int const * prelens){
411 std::vector<topology*> topos;
413 int num_divisors = 1;
414 for (
int i=0; i<n_uf; i++){
415 num_divisors *= (1+mults[i]);
416 ASSERT(num_divisors < 1E6);
419 if (num_divisors == 1){
420 topos.push_back(
new topology(n_prepend, prelens, cdt));
424 int new_prelens[n_prepend+1];
425 memcpy(new_prelens, prelens, n_prepend*
sizeof(
int));
428 for (
int div=1; div<num_divisors; div++){
433 for (
int i=0; i<n_uf; i++){
434 dmults[i] = idiv%(1+mults[i]);
435 sub_mults[i] = mults[i]-dmults[i];
436 idiv = idiv/(1+mults[i]);
437 len0 *= std::pow(uniq_fact[i], dmults[i]);
439 new_prelens[n_prepend] = len0;
440 std::vector< topology* > new_topos =
get_all_topos(cdt, n_uf, uniq_fact, sub_mults, n_prepend+1, new_prelens);
442 for (
unsigned i=0; i<new_topos.size(); i++){
443 topos.push_back(new_topos[i]);
450 std::vector<topology*> topovec;
452 int nfact, * factors;
455 topovec.push_back(
new topology(nfact, factors, cdt));
456 if (cdt.
np >= 7 && cdt.
rank == 0)
457 DPRINTF(1,
"CTF WARNING: using a world with a prime number of processors may lead to very bad performance\n");
461 std::sort(factors,factors+nfact);
463 assert(factors[0] != 1);
464 for (
int i=1; i<nfact; i++){
465 if (factors[i] != factors[i-1]) n_uf++;
469 DPRINTF(1,
"CTF WARNING: using a world with a number of processors that contains 3 or more unique prime factors may lead to suboptimal performance, when possible use p=2^k3^l processors for some k,l\n");
474 uniq_fact[0] = factors[0];
476 for (
int i=1; i<nfact; i++){
477 if (factors[i] != factors[i-1]){
479 uniq_fact[i_uf] = factors[i];
481 }
else mults[i_uf]++;
490 std::vector<topology*> topovec;
491 std::vector<topology*> perm_vec;
492 perm_vec.push_back(phys_topology);
501 for (
int i=0; i<(int)perm_vec.size(); i++){
502 for (
int j=0; j<perm_vec[i]->order; j++){
503 if (perm_vec[i]->
lens[j] != 2){
504 for (
int k=0; k<perm_vec[i]->order; k++){
505 if (j!=k && perm_vec[i]->
lens[j] != perm_vec[i]->
lens[k]){
506 int new_lens[perm_vec[i]->order];
507 memcpy(new_lens,perm_vec[i]->
lens,perm_vec[i]->
order*
sizeof(
int));
508 new_lens[j] = perm_vec[i]->lens[k];
509 new_lens[k] = perm_vec[i]->lens[j];
515 perm_vec.push_back(new_topo);
517 }
else delete new_topo;
525 for (
int i=1; i<(int)perm_vec.size(); i++){
526 std::vector<topology*> temp_vec =
peel_torus(perm_vec[i], cdt);
527 for (
int j=0; j<(int)temp_vec.size(); j++){
529 topovec.push_back(temp_vec[j]);
530 }
else delete temp_vec[j];
539 std::vector< topology* > topos;
540 topos.push_back(
new topology(*topo));
542 if (topo->
order <= 1)
return topos;
544 int * new_lens = (
int*)
alloc(
sizeof(
int)*topo->
order-1);
546 for (
int i=0; i<topo->
order-1; i++){
547 for (
int j=0; j<i; j++){
548 new_lens[j] = topo->
lens[j];
550 new_lens[i] = topo->
lens[i]*topo->
lens[i+1];
551 for (
int j=i+2; j<topo->
order; j++){
552 new_lens[j-1] = topo->
lens[j];
555 topos.push_back(new_topo);
558 for (
int i=1; i<(int)topos.size(); i++){
559 std::vector< topology* > more_topos =
peel_torus(topos[i], glb_comm);
560 for (
int j=0; j<(int)more_topos.size(); j++){
562 topos.push_back(more_topos[j]);
564 delete more_topos[j];
572 std::vector< topology* > & topovec){
574 std::vector< topology* >::iterator iter;
577 for (j=0, iter=topovec.begin(); iter!=topovec.end(); iter++, j++){
578 if ((*iter)->order == topo->
order){
580 for (i=0; i<(*iter)->order; i++) {
581 if ((*iter)->lens[i] != topo->
lens[i]){
586 if (found != -1)
return found;
596 int64_t gnvirt, nv, gcomm_vol, gmemuse, bv;
599 MPI_Allreduce(&nv, &gnvirt, 1, MPI_INT64_T, MPI_MIN, global_comm.
cm);
604 if (nvirt == gnvirt){
611 MPI_Allreduce(&nv, &gcomm_vol, 1, MPI_INT64_T, MPI_MIN, global_comm.
cm);
612 if (bcomm_vol != gcomm_vol){
616 MPI_Allreduce(&bv, &gmemuse, 1, MPI_INT64_T, MPI_MIN, global_comm.
cm);
617 if (bmemuse != gmemuse){
620 MPI_Allreduce(&btopo, &gtopo, 1, MPI_INT, MPI_MIN, global_comm.
cm);
633 int & num_sub_phys_dims,
637 int phys_mapped[topo->
order];
641 memset(phys_mapped, 0, topo->
order*
sizeof(
int));
643 num_sub_phys_dims = 0;
645 for (i=0; i<order_A; i++){
646 map = &edge_map_A[i];
648 phys_mapped[map->
cdt] = 1;
653 for (i=0; i<order_B; i++){
654 map = &edge_map_B[i];
656 phys_mapped[map->
cdt] = 1;
662 num_sub_phys_dims = 0;
663 for (i=0; i<topo->
order; i++){
664 if (phys_mapped[i] == 0){
670 num_sub_phys_dims = 0;
671 for (i=0; i<topo->
order; i++){
672 if (phys_mapped[i] == 0){
673 sub_phys_comm[num_sub_phys_dims] = topo->
dim_comm[i];
674 comm_idx[num_sub_phys_dims] = i;
678 *pcomm_idx = comm_idx;
679 *psub_phys_comm = sub_phys_comm;
688 for (i=0; i<topo_keep->
order; i++){
693 }
else if (lda > topo_change->
dim_comm[j].
np){
704 int i,j,old_lda,new_np;
705 mapping * old_map, * new_map, * new_rec_map;
707 for (i=0; i<
order; i++){
709 old_map = &edge_map[i];
711 new_rec_map = new_map;
713 old_lda = old_topo->
lda[old_map->
cdt];
716 for (j=0; j<new_topo->
order; j++){
717 if (new_topo->
lda[j] == old_lda)
break;
721 new_rec_map->
cdt = j;
723 new_np *= new_rec_map->
np;
724 if (new_np<old_map->
np) {
725 old_lda = old_lda * new_rec_map->
np;
728 new_rec_map = new_rec_map->
child;
730 }
while (new_np<old_map->
np);
743 new_rec_map = new_rec_map->
child;
744 old_map = old_map->
child;
753 edge_map[i] = *new_map;
std::vector< topology * > peel_torus(topology const *topo, CommData glb_comm)
folds specified topology into all configurations of lesser dimensionality
void * alloc(int64_t len)
alloc abstraction
int find_topology(topology const *topo, std::vector< topology * > &topovec)
searches for an equivalent topology in a vector of topologies
int get_best_topo(int64_t nvirt, int topo, CommData global_comm, int64_t bcomm_vol, int64_t bmemuse)
get the best topologies (least nvirt) over all procs
std::vector< topology * > peel_perm_torus(topology *phys_topology, CommData cdt)
folds specified topology and all of its permutations into all configurations of lesser dimensionality...
topology * get_phys_topo(CommData glb_comm, TOPOLOGY mach)
get dimension and torus lengths of specified topology
int alloc_ptr(int64_t len, void **const ptr)
alloc abstraction
std::vector< topology * > get_all_topos(CommData cdt, int n_uf, int const *uniq_fact, int const *mults, int n_prepend, int const *prelens)
computes all unique factorizations into non-primes each yielding a topology, prepending additional fa...
int can_morph(topology const *topo_keep, topology const *topo_change)
determines if two topologies are compatible with each other
int cdealloc(void *ptr)
free abstraction
topology(topology const &other)
copy constructor
void activate(MPI_Comm parent)
activate this subcommunicator by splitting parent_comm
void factorize(int n, int *nfactor, int **factor)
computes the prime factors of n (number of factors returned in nfactor, factor values in factor)
void extract_free_comms(topology const *topo, int order_A, mapping const *edge_map_A, int order_B, mapping const *edge_map_B, int &num_sub_phys_dims, CommData **psub_phys_comm, int **pcomm_idx)
extracts the set of physical dimensions still available for mapping
std::vector< topology * > get_generic_topovec(CommData cdt)
computes all topology configurations given underlying physical topology information ...
void clear()
resets mapping to NOT_MAPPED
void morph_topo(topology const *new_topo, topology const *old_topo, int order, mapping *edge_map)
morphs a tensor topology into another