Commit 5d7761a

support Llama testing
jeffnvidia committed Jun 3, 2024
1 parent ee3c928 commit 5d7761a
Showing 4 changed files with 21 additions and 9 deletions.
5 changes: 5 additions & 0 deletions conf/v0.6/general/test/gpt.toml
@@ -0,0 +1,5 @@
name = "gpt"
description = "gpt"
test_template_name = "NeMoLauncher"

[cmd_args]
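
The empty [cmd_args] table means this test inherits every default from the NeMoLauncher test template (conf/v0.6/general/test_template/nemo_launcher.toml, shown further down). As a minimal, hypothetical sketch, an override would use the same quoted dotted-key style that llama.toml uses below:

[cmd_args]
"training.trainer.max_steps" = "500"  # would override the template default of "400" introduced later in this commit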
12 changes: 12 additions & 0 deletions conf/v0.6/general/test/llama.toml
@@ -0,0 +1,12 @@
name = "llama"
description = "Llama2 70b"
test_template_name = "NeMoLauncher"
# FIXME: ~training.model.position_embedding_type is a quick fix that should be removed once a newer container version is available
# the commit that should fix this issue is 5b296e8af832c67d361fdfb80a165db3affaf76a
extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"

[cmd_args]
"training" = "llama/llama2_70b"
"training.trainer.max_steps" = "120"
"training.model.global_batch_size" = "256"
"training.model.pipeline_model_parallel_size" = "1"
3 changes: 0 additions & 3 deletions conf/v0.6/general/test/nemo_launcher.toml

This file was deleted.
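Its role is presumably taken over by the model-specific gpt.toml and llama.toml added above.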

10 changes: 4 additions & 6 deletions conf/v0.6/general/test_template/nemo_launcher.toml
@@ -33,7 +33,7 @@ name = "NeMoLauncher"
default = "8"

[cmd_args.training]
values = ["gpt3/40b_improved"]
values = ["gpt3/40b_improved", "llama/llama2_70b"]
default = "gpt3/40b_improved"
[cmd_args.training.exp_manager]
[cmd_args.training.exp_manager.create_checkpoint_callback]
@@ -42,9 +42,8 @@ name = "NeMoLauncher"

[cmd_args.training.trainer]
[cmd_args.training.trainer.max_steps]
type = "preset"
values = ["100", "500", "1000", "2000"]
default = "100"
type = "int"
default = "400"

[cmd_args.training.trainer.val_check_interval]
type = "preset"
@@ -62,8 +61,7 @@ name = "NeMoLauncher"

[cmd_args.training.model]
[cmd_args.training.model.global_batch_size]
type = "preset"
values = ["8", "16", "32", "128"]
type = "int"
default = "128"

[cmd_args.training.model.micro_batch_size]
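Switching max_steps and global_batch_size from type "preset" to type "int" drops the fixed values lists, so individual tests can pass arbitrary integers. The new llama.toml relies on exactly this:

[cmd_args]
"training.trainer.max_steps" = "120"        # 120 was not in the old preset list
"training.model.global_batch_size" = "256"  # 256 was not in the old preset list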
