Skip to content

Commit

Permalink
move optimise rings ranks
Browse files Browse the repository at this point in the history
  • Loading branch information
Jackmin801 committed Nov 9, 2024
1 parent 83424ac commit 0555031
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions src/zeroband/comms.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,14 +266,18 @@ def _check_heartbeats(self) -> List[str]:
return dead_nodes

def _optimize_ring_ranks(self):
    """Measure connectivity, then solve a TSP over the pings on the leader.

    Two phases, each timed with ``time.perf_counter`` and reported at debug
    level:

    1. Every caller runs ``_measure_connectivity()`` and then waits on
       ``global_pg.barrier()``.
    2. Only when ``_global_leader`` is truthy, the pings from
       ``get_pings()`` are handed to ``toposolve.TSPSolver().solve_tsp``
       and the resulting minimum distance and path are printed.

    NOTE(review): the solved path is only printed here; nothing in this
    method applies it to the ranks — confirm whether that is intended.
    """
    # Phase 1: connectivity measurement, including the barrier wait.
    connectivity_began = time.perf_counter()
    self._measure_connectivity()
    self.global_pg.barrier()
    connectivity_elapsed = time.perf_counter() - connectivity_began
    self._logger.debug(f"Time taken to measure connectivity: {connectivity_elapsed}")

    # Phase 2: the timer is restarted so the TSP figure excludes phase 1.
    tsp_began = time.perf_counter()
    self._logger.debug("Calculating TSP")
    if self._global_leader:
        ping_data = self.get_pings()
        solver = toposolve.TSPSolver()
        shortest, route = solver.solve_tsp(ping_data)
        print(f"Min distance: {shortest}")
        print(f"Path: {route}")
    # Logged by every caller, leader or not.
    tsp_elapsed = time.perf_counter() - tsp_began
    self._logger.debug(f"Time taken to calculate TSP: {tsp_elapsed}")

def _resolve_world(self, admit_joiners: bool = False) -> bool:
"""Set the new world size and ranks for all nodes if there are joiners or dead nodes. Else, do nothing.
Expand Down Expand Up @@ -334,6 +338,8 @@ def maybe_reinit_global_pg(self, admit_joiners: bool = False) -> bool:
bool: True if the global_pg was reinitialized, False otherwise.
"""

self._optimize_ring_ranks()

if not self.enable:
# no op if disabled
return
Expand Down Expand Up @@ -389,8 +395,6 @@ def maybe_reinit_global_pg(self, admit_joiners: bool = False) -> bool:
self._logger.error(f"Error recreating process group: {e}. Retrying...")
return self.maybe_reinit_global_pg(admit_joiners=admit_joiners)

self._optimize_ring_ranks()

if self._global_leader:
self._clear_joiners()
self.global_store.set("status", "running")
Expand Down

0 comments on commit 0555031

Please sign in to comment.