From 2b028cc4c21a409558bcee05130a28a64a48d2b4 Mon Sep 17 00:00:00 2001
From: jeffnvidia
Date: Sun, 19 May 2024 17:27:34 +0300
Subject: [PATCH 1/3] support Llama testing

---
 conf/v0.6/general/test/gpt.toml                    |  5 +++++
 conf/v0.6/general/test/llama.toml                  | 12 ++++++++++++
 conf/v0.6/general/test/nemo_launcher.toml          |  3 ---
 conf/v0.6/general/test_template/nemo_launcher.toml | 10 ++++------
 4 files changed, 21 insertions(+), 9 deletions(-)
 create mode 100644 conf/v0.6/general/test/gpt.toml
 create mode 100644 conf/v0.6/general/test/llama.toml
 delete mode 100644 conf/v0.6/general/test/nemo_launcher.toml

diff --git a/conf/v0.6/general/test/gpt.toml b/conf/v0.6/general/test/gpt.toml
new file mode 100644
index 000000000..b14d40467
--- /dev/null
+++ b/conf/v0.6/general/test/gpt.toml
@@ -0,0 +1,5 @@
+name = "gpt"
+description = "gpt"
+test_template_name = "NeMoLauncher"
+
+[cmd_args]
\ No newline at end of file
diff --git a/conf/v0.6/general/test/llama.toml b/conf/v0.6/general/test/llama.toml
new file mode 100644
index 000000000..12ecaed79
--- /dev/null
+++ b/conf/v0.6/general/test/llama.toml
@@ -0,0 +1,12 @@
+name = "llama"
+description = "Llama2 70b"
+test_template_name = "NeMoLauncher"
+# FIX ME : ~training.model.position_embedding_type is a quick fix that should be changed with newer version of container
+# the commit that should fix this issue is : 5b296e8af832c67d361fdfb80a165db3affaf76a
+extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"
+
+[cmd_args]
+"training" = "llama/llama2_70b"
+"training.trainer.max_steps" = "120"
+"training.model.global_batch_size" = "256"
+"training.model.pipeline_model_parallel_size" = "1"
\ No newline at end of file
diff --git a/conf/v0.6/general/test/nemo_launcher.toml b/conf/v0.6/general/test/nemo_launcher.toml
deleted file mode 100644
index 89840f2d4..000000000
--- a/conf/v0.6/general/test/nemo_launcher.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-name = "nemo_launcher"
-description = "NeMo-Launcher"
-test_template_name = "NeMoLauncher"
diff --git a/conf/v0.6/general/test_template/nemo_launcher.toml b/conf/v0.6/general/test_template/nemo_launcher.toml
index 59cee1752..17738d3aa 100644
--- a/conf/v0.6/general/test_template/nemo_launcher.toml
+++ b/conf/v0.6/general/test_template/nemo_launcher.toml
@@ -33,7 +33,7 @@ name = "NeMoLauncher"
     default = "8"
 
   [cmd_args.training]
-    values = ["gpt3/40b_improved"]
+    values = ["gpt3/40b_improved", "llama/llama2_70b"]
     default = "gpt3/40b_improved"
     [cmd_args.training.exp_manager]
       [cmd_args.training.exp_manager.create_checkpoint_callback]
@@ -42,9 +42,8 @@ name = "NeMoLauncher"
 
   [cmd_args.training.trainer]
     [cmd_args.training.trainer.max_steps]
-      type = "preset"
-      values = ["100", "500", "1000", "2000"]
-      default = "100"
+      type = "int"
+      default = "400"
 
     [cmd_args.training.trainer.val_check_interval]
       type = "preset"
@@ -62,8 +61,7 @@ name = "NeMoLauncher"
 
   [cmd_args.training.model]
     [cmd_args.training.model.global_batch_size]
-      type = "preset"
-      values = ["8", "16", "32", "128"]
+      type = "int"
       default = "128"
 
     [cmd_args.training.model.micro_batch_size]

From 7044db7e2efd17d807425e69fff2e504d33d4a17 Mon Sep 17 00:00:00 2001
From: jeffnvidia
Date: Mon, 3 Jun 2024 18:22:48 +0300
Subject: [PATCH 2/3] change the FIX ME comment

---
 conf/v0.6/general/test/llama.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/conf/v0.6/general/test/llama.toml b/conf/v0.6/general/test/llama.toml
index 12ecaed79..bc0e486ec 100644
--- a/conf/v0.6/general/test/llama.toml
+++ b/conf/v0.6/general/test/llama.toml
@@ -1,8 +1,9 @@
 name = "llama"
 description = "Llama2 70b"
 test_template_name = "NeMoLauncher"
-# FIX ME : ~training.model.position_embedding_type is a quick fix that should be changed with newer version of container
-# the commit that should fix this issue is : 5b296e8af832c67d361fdfb80a165db3affaf76a
+# FIX ME : ~training.model.position_embedding_type was added in the extra_cmd_args in order to fix a bug from NeMo repository (https://github.com/NVIDIA/NeMo).
+# the commit that should fix this issue in NeMo is : 5b296e8af832c67d361fdfb80a165db3affaf76a.
+# Once the new release of NeMoLauncher includes this commit (check by downloading the corresponding container and look inside /opt for this commit), ~training.model.position_embedding_type should be removed from the extra args
 extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"
 
 [cmd_args]

From e8fd40a642c3fec3ade8cda0888bd2502d592992 Mon Sep 17 00:00:00 2001
From: jeffnvidia
Date: Tue, 4 Jun 2024 13:26:48 +0300
Subject: [PATCH 3/3] remove cmd_args from gpt.toml and adjust FIXME in Llama.toml

---
 conf/v0.6/general/test/gpt.toml   | 4 +---
 conf/v0.6/general/test/llama.toml | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/conf/v0.6/general/test/gpt.toml b/conf/v0.6/general/test/gpt.toml
index b14d40467..5c7400b61 100644
--- a/conf/v0.6/general/test/gpt.toml
+++ b/conf/v0.6/general/test/gpt.toml
@@ -1,5 +1,3 @@
 name = "gpt"
 description = "gpt"
-test_template_name = "NeMoLauncher"
-
-[cmd_args]
\ No newline at end of file
+test_template_name = "NeMoLauncher"
\ No newline at end of file
diff --git a/conf/v0.6/general/test/llama.toml b/conf/v0.6/general/test/llama.toml
index bc0e486ec..44c13bbc3 100644
--- a/conf/v0.6/general/test/llama.toml
+++ b/conf/v0.6/general/test/llama.toml
@@ -1,7 +1,7 @@
 name = "llama"
 description = "Llama2 70b"
 test_template_name = "NeMoLauncher"
-# FIX ME : ~training.model.position_embedding_type was added in the extra_cmd_args in order to fix a bug from NeMo repository (https://github.com/NVIDIA/NeMo).
+# FIXME : ~training.model.position_embedding_type was added in the extra_cmd_args in order to fix a bug from NeMo repository (https://github.com/NVIDIA/NeMo).
 # the commit that should fix this issue in NeMo is : 5b296e8af832c67d361fdfb80a165db3affaf76a.
 # Once the new release of NeMoLauncher includes this commit (check by downloading the corresponding container and look inside /opt for this commit), ~training.model.position_embedding_type should be removed from the extra args
 extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"
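
Note (not part of the patches above): a minimal Python sketch of how the llama.toml settings could be expanded into Hydra-style overrides for NeMo-Launcher, assuming the test harness simply emits each [cmd_args] key as key=value and appends extra_cmd_args verbatim. The file path and the printed entry point are illustrative assumptions, not the actual CloudAI or NeMo-Launcher invocation.

# Rough sketch only (not the CloudAI implementation): expand the llama.toml
# [cmd_args] table and extra_cmd_args string into Hydra-style overrides.
import tomllib  # Python 3.11+; older interpreters can use the third-party "tomli" package

with open("conf/v0.6/general/test/llama.toml", "rb") as f:
    cfg = tomllib.load(f)

overrides = [f"{key}={value}" for key, value in cfg.get("cmd_args", {}).items()]
overrides += cfg.get("extra_cmd_args", "").split()

# Hypothetical entry point; the real command line is assembled by the
# NeMoLauncher test template, not by this sketch.
print("python launcher_scripts/main.py " + " ".join(overrides))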