Fix config: move optimizer hyperparameters into the [optim.optim] table

PrimeIntellect-ai · Dec 5, 2024 · 8a23a11
1 parent 02e1caa
commit 8a23a11
Show file tree

Hide file tree

Showing 18 changed files with 54 additions and 17 deletions.
diff --git a/configs/10B/H100.toml b/configs/10B/H100.toml
@@ -11,14 +11,16 @@ sched_type = "wsd-sqrt"
 batch_size = 128 #1M tokens bs
 warmup_steps = 1000
 total_steps = 1_000_000_000_000
-lr = 7.5e-5
 
-adam_betas1 = 0.9
-adam_betas2 = 0.95
-weight_decay = 0.1
 
 z_loss = true
 
+[optim.optim]
+lr = 7.5e-5
+betas1 = 0.9
+betas2 = 0.95
+weight_decay = 0.1
+
 [data]
 seq_length = 8192
 dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data/datasets/StackV1-popular,/data/datasets/dclm-baseline-1.0-parquet,/data/datasets/open-web-math"

diff --git a/configs/10B/H100_cooldown.toml b/configs/10B/H100_cooldown.toml
@@ -12,14 +12,15 @@ batch_size = 128 #1M tokens bs
 warmup_steps = 1000
 stable_steps = 74700
 total_steps = 90400
-lr = 7.5e-5
-
-adam_betas1 = 0.9
-adam_betas2 = 0.95
-weight_decay = 0.1
 
 z_loss = true
 
+[optim.optim]
+lr = 7.5e-5
+betas1 = 0.9
+betas2 = 0.95
+weight_decay = 0.1
+
 [data]
 seq_length = 8192
 dataset_name_or_paths = "/data/datasets/fineweb-edu,/data/datasets/fineweb,/data/datasets/StackV1-popular"

diff --git a/configs/13B/H100.toml b/configs/13B/H100.toml
@@ -9,6 +9,8 @@ ac_ckpt = true
 batch_size = 1024 #2M tokens bs
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 3e-4
 
 [data]

diff --git a/configs/150M/3090.toml b/configs/150M/3090.toml
@@ -10,6 +10,8 @@ reshard_after_forward = true
 batch_size = 512
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 4e-4
 
 

diff --git a/configs/150M/A40.toml b/configs/150M/A40.toml
@@ -10,4 +10,7 @@ reshard_after_forward = true
 batch_size = 512
 warmup_steps = 1000
 total_steps = 88_000
-lr = 4e-4
+
+[optim.optim]
+lr = 4e-4
+
diff --git a/configs/150M/H100.toml b/configs/150M/H100.toml
@@ -9,4 +9,7 @@ micro_bs = 64 # change this base on the gpu
 batch_size = 512
 warmup_steps = 1000
 total_steps = 88_000
-lr = 4e-4
+
+[optim.optim]
+lr = 4e-4
+
diff --git a/configs/150M_short/3090.toml b/configs/150M_short/3090.toml
@@ -10,4 +10,6 @@ reshard_after_forward = true
 batch_size = 512
 warmup_steps = 1000
 total_steps = 15_000
-lr = 4e-4
+
+[optim.optim]
+lr = 4e-4
diff --git a/configs/150M_short/A40.toml b/configs/150M_short/A40.toml
@@ -5,8 +5,11 @@ type_model = "llama2"
 [train]
 micro_bs = 32 # change this base on the gpu
 reshard_after_forward = true
+
 [optim]
 batch_size = 512
 warmup_steps = 1000
 total_steps = 15_000
-lr = 4e-4
+
+[optim.optim]
+lr = 4e-4
diff --git a/configs/150M_short/H100.toml b/configs/150M_short/H100.toml
@@ -9,4 +9,6 @@ micro_bs = 64 # change this base on the gpu
 batch_size = 512
 warmup_steps = 1000
 total_steps = 15_000
+
+[optim.optim]
 lr = 4e-4
diff --git a/configs/1B/H100.toml b/configs/1B/H100.toml
@@ -9,4 +9,6 @@ micro_bs = 16
 batch_size = 2048
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 4e-4
diff --git a/configs/1B/H100_c4.toml b/configs/1B/H100_c4.toml
@@ -9,6 +9,8 @@ micro_bs = 16
 batch_size = 128
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 3e-4
 
 [data]

diff --git a/configs/1B/H100_llama2_edu.toml b/configs/1B/H100_llama2_edu.toml
@@ -17,5 +17,7 @@ batch_size = 256
 warmup_steps = 1000
 total_steps = 1_000_000_000_000
 sched_type = "wsd-sqrt"
-lr = 4e-4
 z_loss = true
+
+[optim.optim]
+lr = 4e-4
diff --git a/configs/1B/H100_llama2_edu_no_feat.toml b/configs/1B/H100_llama2_edu_no_feat.toml
@@ -19,5 +19,7 @@ batch_size = 256
 warmup_steps = 1000
 total_steps = 1_000_000_000_000
 sched_type = "wsd-sqrt"
-lr = 2e-4
 z_loss = false
+
+[optim.optim]
+lr = 2e-4
diff --git a/configs/1B/H100_llama3.toml b/configs/1B/H100_llama3.toml
@@ -17,6 +17,7 @@ batch_size = 256
 warmup_steps = 1000
 total_steps = 1_000_000_000_000
 sched_type = "wsd-sqrt"
-lr = 4e-4
 z_loss = true
 
+[optim.optim]
+lr = 4e-4
diff --git a/configs/1B_diloco/H100.toml b/configs/1B_diloco/H100.toml
@@ -9,10 +9,12 @@ micro_bs = 16
 batch_size = 2048
 warmup_steps = 1000
 total_steps = 88_000
-lr = 4e-4
 
 z_loss = true
 
+[optim.optim]
+lr = 4e-4
+
 
 [diloco]
 inner_steps = 50

diff --git a/configs/7B/H100.toml b/configs/7B/H100.toml
@@ -9,6 +9,8 @@ micro_bs = 1
 batch_size = 1024 #2M tokens bs
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 3e-4
 
 [data]

diff --git a/configs/7B_diloco/H100.toml b/configs/7B_diloco/H100.toml
@@ -9,6 +9,8 @@ micro_bs = 1
 batch_size = 1024 #2M tokens bs
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 3e-4
 
 [data]

diff --git a/configs/test.toml b/configs/test.toml
@@ -15,4 +15,6 @@ num_workers = 1
 batch_size = 128
 warmup_steps = 1000
 total_steps = 88_000
+
+[optim.optim]
 lr = 4e-4