-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
a97d564
commit f3c770c
Showing
9 changed files
with
202 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Launch src/zeroband/train.py under torchrun with two local processes,
# restricted to GPUs 2 and 3, using global unique id 2.

# Weights & Biases and global rendezvous settings.
export WANDB_MODE="disabled"
export GLOBAL_ADDR="localhost"
export GLOBAL_PORT="1234"
#export GLOBAL_WORLD_SIZE=2

# Identity of this launcher.
export CUDA_VISIBLE_DEVICES="2,3"
export GLOBAL_UNIQUE_ID="2"
export GLOBAL_RANK="100"

# Networking and logging.
#export GLOO_SOCKET_IFNAME=tailscale0
export ZERO_BAND_LOG_LEVEL="DEBUG"
export ZERO_BAND_LOG_ALL_RANK="true"

# The rendezvous port is "1000" followed by the unique id
# (presumably so that launchers on the same host do not share a port).
uv run torchrun --nproc_per_node=2 \
    --rdzv-endpoint "localhost:1000${GLOBAL_UNIQUE_ID}" \
    src/zeroband/train.py \
    @configs/150M/3090.toml \
    --no-wandb-resume
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
a,b | ||
0,0 | ||
2,1 | ||
1,2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import os | ||
import torch.distributed as dist | ||
import torch | ||
import time | ||
|
||
# Rank of this process, read once at import time from the RANK environment variable.
rank = int(os.environ["RANK"])


def rprint(*args):
    """Print the arguments, space-separated, on one line prefixed with ``[Rank N]``.

    Each argument is converted with ``str``. The trailing newline is part of
    the printed string and ``end`` is empty (presumably so the whole line goes
    out in a single write and output from different ranks interleaves less).
    """
    message = " ".join(str(arg) for arg in args)
    print(f"[Rank {rank}] {message}\n", end="")
|
||
class EDM:
    """Create a Gloo process group on its own TCP store and time a send to every peer.

    The constructor reads MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE from
    the environment, connects to the store, builds the process group and then
    immediately runs :meth:`measure_connectivity`.

    Raises:
        KeyError: if one of the required environment variables is missing.
        ValueError: if MASTER_PORT, RANK or WORLD_SIZE is not an integer.
    """

    # Number of float32 elements in every ping payload and receive buffer.
    PING_NUMEL = 1_000_000

    def __init__(self):
        master_addr = os.environ['MASTER_ADDR']
        master_port = int(os.environ['MASTER_PORT'])
        rank = int(os.environ['RANK'])
        world_size = int(os.environ['WORLD_SIZE'])

        self.rank = rank
        self.world_size = world_size

        rprint("Creating Store")
        # The store listens one port above MASTER_PORT (presumably to stay
        # clear of the launcher's own store on MASTER_PORT). Rank 0 hosts it.
        self.global_store = dist.TCPStore(
            host_name=master_addr,
            port=master_port + 1,
            is_master=(rank == 0),
            # Use the real number of participants rather than a fixed 2, so
            # the store's participant count matches the launch size.
            world_size=world_size,
        )

        rprint("Store created. Creating ProcessGroupGloo")
        self.global_pg = dist.distributed_c10d.ProcessGroupGloo(
            self.global_store, rank, world_size
        )
        rprint("ProcessGroupGloo created")

        self.measure_connectivity()

    def _tag(self, src: int, dst: int) -> int:
        """Return the message tag used for a transfer from rank ``src`` to rank ``dst``."""
        return src * self.world_size + dst

    def measure_connectivity(self):
        """Post a receive from every peer, then send to each peer and log the elapsed time."""
        peers = [peer for peer in range(self.world_size) if peer != self.rank]

        # Post every receive first so that each peer's send has a matching recv.
        pending_recvs = []
        for peer in peers:
            buffer = torch.ones(self.PING_NUMEL, dtype=torch.float32)
            tag = self._tag(peer, self.rank)
            rprint(f"Recv from peer {peer} with tag {tag}")
            pending_recvs.append(self.global_pg.recv([buffer], peer, tag))

        # Wait on a barrier before any rank starts sending.
        self.global_pg.barrier().wait()
        for peer in peers:
            rprint(f"Pinging peer {peer}")
            time_taken = self.ping_peer(peer)
            rprint(f"Ping to peer {peer} took {time_taken} seconds")

        for work in pending_recvs:
            work.wait()

    def ping_peer(self, peer_rank: int) -> float:
        """Send one payload to ``peer_rank`` and return the seconds until the send completes.

        The timed interval covers the log line, the send call and waiting on
        the returned work handle.
        """
        payload = torch.ones(self.PING_NUMEL, dtype=torch.float32)
        tag = self._tag(self.rank, peer_rank)
        start_time = time.perf_counter()
        rprint(f"Send from peer {self.rank} to {peer_rank} with tag {tag}")
        self.global_pg.send([payload], peer_rank, tag).wait()
        return time.perf_counter() - start_time
|
||
def main():
    """Script entry point: construct an EDM, whose constructor runs the connectivity check."""
    # The instance is not used afterwards, so it is not bound to a name.
    EDM()


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Launch src/zeroband/train.py under torchrun with two local processes,
# restricted to GPUs 0 and 1, using global unique id 0.

# Weights & Biases and global rendezvous settings.
export WANDB_MODE="disabled"
export GLOBAL_ADDR="localhost"
export GLOBAL_PORT="1234"
#export GLOBAL_WORLD_SIZE=2

# Identity of this launcher. GLOBAL_RANK is left unset here.
export CUDA_VISIBLE_DEVICES="0,1"
export GLOBAL_UNIQUE_ID="0"
#export GLOBAL_RANK=0

# Networking and logging.
#export GLOO_SOCKET_IFNAME=tailscale0
export ZERO_BAND_LOG_LEVEL="DEBUG"
export ZERO_BAND_LOG_ALL_RANK="true"

# The rendezvous port is "1000" followed by the unique id
# (presumably so that launchers on the same host do not share a port).
uv run torchrun --nproc_per_node=2 \
    --rdzv-endpoint "localhost:1000${GLOBAL_UNIQUE_ID}" \
    src/zeroband/train.py \
    @configs/150M/3090.toml \
    --no-wandb-resume
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Launch src/zeroband/train.py under torchrun with two local processes,
# restricted to GPUs 4 and 5, using global unique id 1.

# Weights & Biases and global rendezvous settings.
export WANDB_MODE="disabled"
export GLOBAL_ADDR="localhost"
export GLOBAL_PORT="1234"
#export GLOBAL_WORLD_SIZE=2

# Identity of this launcher.
export CUDA_VISIBLE_DEVICES="4,5"
export GLOBAL_UNIQUE_ID="1"
export GLOBAL_RANK="100"

# Networking and logging.
#export GLOO_SOCKET_IFNAME=tailscale0
export ZERO_BAND_LOG_LEVEL="DEBUG"
export ZERO_BAND_LOG_ALL_RANK="true"

# The rendezvous port is "1000" followed by the unique id
# (presumably so that launchers on the same host do not share a port).
uv run torchrun --nproc_per_node=2 \
    --rdzv-endpoint "localhost:1000${GLOBAL_UNIQUE_ID}" \
    src/zeroband/train.py \
    @configs/150M/3090.toml \
    --no-wandb-resume
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Launch src/zeroband/train.py under torchrun with two local processes,
# restricted to GPUs 2 and 3, using global unique id 2.

# Weights & Biases and global rendezvous settings.
export WANDB_MODE="disabled"
export GLOBAL_ADDR="localhost"
export GLOBAL_PORT="1234"
#export GLOBAL_WORLD_SIZE=2

# Identity of this launcher.
export CUDA_VISIBLE_DEVICES="2,3"
export GLOBAL_UNIQUE_ID="2"
export GLOBAL_RANK="100"

# Networking and logging.
#export GLOO_SOCKET_IFNAME=tailscale0
export ZERO_BAND_LOG_LEVEL="DEBUG"
export ZERO_BAND_LOG_ALL_RANK="true"

# The rendezvous port is "1000" followed by the unique id
# (presumably so that launchers on the same host do not share a port).
uv run torchrun --nproc_per_node=2 \
    --rdzv-endpoint "localhost:1000${GLOBAL_UNIQUE_ID}" \
    src/zeroband/train.py \
    @configs/150M/3090.toml \
    --no-wandb-resume
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
gid,grank | ||
0,0 | ||
2,1 | ||
1,1 |