Skip to content

Commit

Permalink
fix config
Browse files Browse the repository at this point in the history
  • Loading branch information
samsja committed Dec 5, 2024
1 parent 02e1caa commit 8a23a11
Show file tree
Hide file tree
Showing 18 changed files with 54 additions and 17 deletions.
10 changes: 6 additions & 4 deletions configs/10B/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,16 @@ sched_type = "wsd-sqrt"
batch_size = 128 # ~1M tokens per batch (128 sequences x 8192 seq_length)
warmup_steps = 1000
total_steps = 1_000_000_000_000
lr = 7.5e-5

adam_betas1 = 0.9
adam_betas2 = 0.95
weight_decay = 0.1

z_loss = true

[optim.optim]
lr = 7.5e-5
betas1 = 0.9
betas2 = 0.95
weight_decay = 0.1

[data]
seq_length = 8192
dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data/datasets/StackV1-popular,/data/datasets/dclm-baseline-1.0-parquet,/data/datasets/open-web-math"
Expand Down
11 changes: 6 additions & 5 deletions configs/10B/H100_cooldown.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@ batch_size = 128 #1M tokens bs
warmup_steps = 1000
stable_steps = 74700
total_steps = 90400
lr = 7.5e-5

adam_betas1 = 0.9
adam_betas2 = 0.95
weight_decay = 0.1

z_loss = true

[optim.optim]
lr = 7.5e-5
betas1 = 0.9
betas2 = 0.95
weight_decay = 0.1

[data]
seq_length = 8192
dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data/datasets/StackV1-popular"
Expand Down
2 changes: 2 additions & 0 deletions configs/13B/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ ac_ckpt = true
batch_size = 1024 #2M tokens bs
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 3e-4

[data]
Expand Down
2 changes: 2 additions & 0 deletions configs/150M/3090.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ reshard_after_forward = true
batch_size = 512
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 4e-4


Expand Down
5 changes: 4 additions & 1 deletion configs/150M/A40.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,7 @@ reshard_after_forward = true
batch_size = 512
warmup_steps = 1000
total_steps = 88_000
lr = 4e-4

[optim.optim]
lr = 4e-4

5 changes: 4 additions & 1 deletion configs/150M/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@ micro_bs = 64 # change this based on the GPU
batch_size = 512
warmup_steps = 1000
total_steps = 88_000
lr = 4e-4

[optim.optim]
lr = 4e-4

4 changes: 3 additions & 1 deletion configs/150M_short/3090.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ reshard_after_forward = true
batch_size = 512
warmup_steps = 1000
total_steps = 15_000
lr = 4e-4

[optim.optim]
lr = 4e-4
5 changes: 4 additions & 1 deletion configs/150M_short/A40.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ type_model = "llama2"
[train]
micro_bs = 32 # change this based on the GPU
reshard_after_forward = true

[optim]
batch_size = 512
warmup_steps = 1000
total_steps = 15_000
lr = 4e-4

[optim.optim]
lr = 4e-4
2 changes: 2 additions & 0 deletions configs/150M_short/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ micro_bs = 64 # change this based on the GPU
batch_size = 512
warmup_steps = 1000
total_steps = 15_000

[optim.optim]
lr = 4e-4
2 changes: 2 additions & 0 deletions configs/1B/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ micro_bs = 16
batch_size = 2048
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 4e-4
2 changes: 2 additions & 0 deletions configs/1B/H100_c4.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ micro_bs = 16
batch_size = 128
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 3e-4

[data]
Expand Down
4 changes: 3 additions & 1 deletion configs/1B/H100_llama2_edu.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,7 @@ batch_size = 256
warmup_steps = 1000
total_steps = 1_000_000_000_000
sched_type = "wsd-sqrt"
lr = 4e-4
z_loss = true

[optim.optim]
lr = 4e-4
4 changes: 3 additions & 1 deletion configs/1B/H100_llama2_edu_no_feat.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,7 @@ batch_size = 256
warmup_steps = 1000
total_steps = 1_000_000_000_000
sched_type = "wsd-sqrt"
lr = 2e-4
z_loss = false

[optim.optim]
lr = 2e-4
3 changes: 2 additions & 1 deletion configs/1B/H100_llama3.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ batch_size = 256
warmup_steps = 1000
total_steps = 1_000_000_000_000
sched_type = "wsd-sqrt"
lr = 4e-4
z_loss = true

[optim.optim]
lr = 4e-4
4 changes: 3 additions & 1 deletion configs/1B_diloco/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@ micro_bs = 16
batch_size = 2048
warmup_steps = 1000
total_steps = 88_000
lr = 4e-4

z_loss = true

[optim.optim]
lr = 4e-4


[diloco]
inner_steps = 50
Expand Down
2 changes: 2 additions & 0 deletions configs/7B/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ micro_bs = 1
batch_size = 1024 #2M tokens bs
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 3e-4

[data]
Expand Down
2 changes: 2 additions & 0 deletions configs/7B_diloco/H100.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ micro_bs = 1
batch_size = 1024 #2M tokens bs
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 3e-4

[data]
Expand Down
2 changes: 2 additions & 0 deletions configs/test.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,6 @@ num_workers = 1
batch_size = 128
warmup_steps = 1000
total_steps = 88_000

[optim.optim]
lr = 4e-4

0 comments on commit 8a23a11

Please sign in to comment.