diff --git a/configs/150M/H100-fast.toml b/configs/150M/H100-fast.toml new file mode 100644 index 00000000..464a46d8 --- /dev/null +++ b/configs/150M/H100-fast.toml @@ -0,0 +1,22 @@ +name_model = "150M" +project = "debug_150m_zero_band" +type_model = "llama2" + +[train] +micro_bs = 64 # change this base on the gpu +reshard_after_forward = true + +[optim] +batch_size = 512 +warmup_steps = 278 +total_steps = 8192 + +[optim.optim] +lr = 0.003551730141097694 +betas1 = 0.9454835470717078 +betas2 = 0.9190488086654895 +weight_decay = 0.24530252977858977 + +[data] +seq_length = 1024 +dataset_name_or_paths = "datasets/fineweb-edu"