Llama #23

Merged · 3 commits, merged on Jun 4, 2024
3 changes: 3 additions & 0 deletions conf/v0.6/general/test/gpt.toml
@@ -0,0 +1,3 @@
name = "gpt"
description = "gpt"
test_template_name = "NeMoLauncher"
13 changes: 13 additions & 0 deletions conf/v0.6/general/test/llama.toml
@@ -0,0 +1,13 @@
name = "llama"
description = "Llama2 70b"
test_template_name = "NeMoLauncher"
# FIXME: ~training.model.position_embedding_type was added to extra_cmd_args to work around a bug in the NeMo repository (https://github.com/NVIDIA/NeMo).
# The commit that should fix this issue in NeMo is 5b296e8af832c67d361fdfb80a165db3affaf76a.
# Once a NeMoLauncher release includes this commit (verify by downloading the corresponding container and looking for the commit under /opt), ~training.model.position_embedding_type should be removed from the extra args.
extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"

[cmd_args]
"training" = "llama/llama2_70b"
"training.trainer.max_steps" = "120"
"training.model.global_batch_size" = "256"
"training.model.pipeline_model_parallel_size" = "1"
3 changes: 0 additions & 3 deletions conf/v0.6/general/test/nemo_launcher.toml

This file was deleted.

10 changes: 4 additions & 6 deletions conf/v0.6/general/test_template/nemo_launcher.toml
@@ -33,7 +33,7 @@ name = "NeMoLauncher"
 default = "8"
 
 [cmd_args.training]
-values = ["gpt3/40b_improved"]
+values = ["gpt3/40b_improved", "llama/llama2_70b"]
 default = "gpt3/40b_improved"
 [cmd_args.training.exp_manager]
 [cmd_args.training.exp_manager.create_checkpoint_callback]
@@ -42,9 +42,8 @@ name = "NeMoLauncher"
 
 [cmd_args.training.trainer]
 [cmd_args.training.trainer.max_steps]
-type = "preset"
-values = ["100", "500", "1000", "2000"]
-default = "100"
+type = "int"
+default = "400"
 
 [cmd_args.training.trainer.val_check_interval]
 type = "preset"
@@ -62,8 +61,7 @@ name = "NeMoLauncher"
 
 [cmd_args.training.model]
 [cmd_args.training.model.global_batch_size]
-type = "preset"
-values = ["8", "16", "32", "128"]
+type = "int"
 default = "128"
 
 [cmd_args.training.model.micro_batch_size]
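With max_steps and global_batch_size now typed as "int" instead of "preset", test configs are no longer limited to the former preset lists (100/500/1000/2000 for max_steps, 8/16/32/128 for global_batch_size). A minimal sketch of a hypothetical test that relies on this; the name and values below are illustrative only and do not appear in this PR:

# Hypothetical test config (illustration only, not part of this PR)
name = "llama-long"
description = "Llama2 70b, longer run"
test_template_name = "NeMoLauncher"

[cmd_args]
"training" = "llama/llama2_70b"
"training.trainer.max_steps" = "2500"        # any integer is accepted now that the template type is "int"
"training.model.global_batch_size" = "512"   # likewise no longer restricted to the old presets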