From 2b028cc4c21a409558bcee05130a28a64a48d2b4 Mon Sep 17 00:00:00 2001
From: jeffnvidia
Date: Sun, 19 May 2024 17:27:34 +0300
Subject: [PATCH 1/3] support Llama testing

---
 conf/v0.6/general/test/gpt.toml                    |  5 +++++
 conf/v0.6/general/test/llama.toml                  | 12 ++++++++++++
 conf/v0.6/general/test/nemo_launcher.toml          |  3 ---
 conf/v0.6/general/test_template/nemo_launcher.toml | 10 ++++------
 4 files changed, 21 insertions(+), 9 deletions(-)
 create mode 100644 conf/v0.6/general/test/gpt.toml
 create mode 100644 conf/v0.6/general/test/llama.toml
 delete mode 100644 conf/v0.6/general/test/nemo_launcher.toml

diff --git a/conf/v0.6/general/test/gpt.toml b/conf/v0.6/general/test/gpt.toml
new file mode 100644
index 000000000..b14d40467
--- /dev/null
+++ b/conf/v0.6/general/test/gpt.toml
@@ -0,0 +1,5 @@
+name = "gpt"
+description = "gpt"
+test_template_name = "NeMoLauncher"
+
+[cmd_args]
\ No newline at end of file
diff --git a/conf/v0.6/general/test/llama.toml b/conf/v0.6/general/test/llama.toml
new file mode 100644
index 000000000..12ecaed79
--- /dev/null
+++ b/conf/v0.6/general/test/llama.toml
@@ -0,0 +1,12 @@
+name = "llama"
+description = "Llama2 70b"
+test_template_name = "NeMoLauncher"
+# FIX ME : ~training.model.position_embedding_type is a quick fix that should be changed with newer version of container
+# the commit that should fix this issue is : 5b296e8af832c67d361fdfb80a165db3affaf76a
+extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"
+
+[cmd_args]
+"training" = "llama/llama2_70b"
+"training.trainer.max_steps" = "120"
+"training.model.global_batch_size" = "256"
+"training.model.pipeline_model_parallel_size" = "1"
\ No newline at end of file
diff --git a/conf/v0.6/general/test/nemo_launcher.toml b/conf/v0.6/general/test/nemo_launcher.toml
deleted file mode 100644
index 89840f2d4..000000000
--- a/conf/v0.6/general/test/nemo_launcher.toml
+++ /dev/null
@@ -1,3 +0,0 @@
-name = "nemo_launcher"
-description = "NeMo-Launcher"
-test_template_name = "NeMoLauncher"
diff --git a/conf/v0.6/general/test_template/nemo_launcher.toml b/conf/v0.6/general/test_template/nemo_launcher.toml
index 59cee1752..17738d3aa 100644
--- a/conf/v0.6/general/test_template/nemo_launcher.toml
+++ b/conf/v0.6/general/test_template/nemo_launcher.toml
@@ -33,7 +33,7 @@ name = "NeMoLauncher"
     default = "8"
 
   [cmd_args.training]
-    values = ["gpt3/40b_improved"]
+    values = ["gpt3/40b_improved", "llama/llama2_70b"]
     default = "gpt3/40b_improved"
     [cmd_args.training.exp_manager]
       [cmd_args.training.exp_manager.create_checkpoint_callback]
@@ -42,9 +42,8 @@ name = "NeMoLauncher"
 
   [cmd_args.training.trainer]
     [cmd_args.training.trainer.max_steps]
-      type = "preset"
-      values = ["100", "500", "1000", "2000"]
-      default = "100"
+      type = "int"
+      default = "400"
 
     [cmd_args.training.trainer.val_check_interval]
       type = "preset"
@@ -62,8 +61,7 @@ name = "NeMoLauncher"
 
   [cmd_args.training.model]
     [cmd_args.training.model.global_batch_size]
-      type = "preset"
-      values = ["8", "16", "32", "128"]
+      type = "int"
       default = "128"
 
     [cmd_args.training.model.micro_batch_size]

From 7044db7e2efd17d807425e69fff2e504d33d4a17 Mon Sep 17 00:00:00 2001
From: jeffnvidia
Date: Mon, 3 Jun 2024 18:22:48 +0300
Subject: [PATCH 2/3] change the FIX ME comment

---
 conf/v0.6/general/test/llama.toml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/conf/v0.6/general/test/llama.toml b/conf/v0.6/general/test/llama.toml
index 12ecaed79..bc0e486ec 100644
--- a/conf/v0.6/general/test/llama.toml
+++ b/conf/v0.6/general/test/llama.toml
@@ -1,8 +1,9 @@
 name = "llama"
 description = "Llama2 70b"
 test_template_name = "NeMoLauncher"
-# FIX ME : ~training.model.position_embedding_type is a quick fix that should be changed with newer version of container
-# the commit that should fix this issue is : 5b296e8af832c67d361fdfb80a165db3affaf76a
+# FIX ME : ~training.model.position_embedding_type was added in the extra_cmd_args in order to fix a bug from NeMo repository (https://github.com/NVIDIA/NeMo).
+# the commit that should fix this issue in NeMo is : 5b296e8af832c67d361fdfb80a165db3affaf76a.
+# Once the new release of NeMoLauncher includes this commit (check by downloading the corresponding container and look inside /opt for this commit), ~training.model.position_embedding_type should be removed from the extra args
 extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"
 
 [cmd_args]

From e8fd40a642c3fec3ade8cda0888bd2502d592992 Mon Sep 17 00:00:00 2001
From: jeffnvidia
Date: Tue, 4 Jun 2024 13:26:48 +0300
Subject: [PATCH 3/3] remove cmd_args from gpt.toml and adjust FIXME in Llama.toml

---
 conf/v0.6/general/test/gpt.toml   | 4 +---
 conf/v0.6/general/test/llama.toml | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/conf/v0.6/general/test/gpt.toml b/conf/v0.6/general/test/gpt.toml
index b14d40467..5c7400b61 100644
--- a/conf/v0.6/general/test/gpt.toml
+++ b/conf/v0.6/general/test/gpt.toml
@@ -1,5 +1,3 @@
 name = "gpt"
 description = "gpt"
-test_template_name = "NeMoLauncher"
-
-[cmd_args]
\ No newline at end of file
+test_template_name = "NeMoLauncher"
\ No newline at end of file
diff --git a/conf/v0.6/general/test/llama.toml b/conf/v0.6/general/test/llama.toml
index bc0e486ec..44c13bbc3 100644
--- a/conf/v0.6/general/test/llama.toml
+++ b/conf/v0.6/general/test/llama.toml
@@ -1,7 +1,7 @@
 name = "llama"
 description = "Llama2 70b"
 test_template_name = "NeMoLauncher"
-# FIX ME : ~training.model.position_embedding_type was added in the extra_cmd_args in order to fix a bug from NeMo repository (https://github.com/NVIDIA/NeMo).
+# FIXME : ~training.model.position_embedding_type was added in the extra_cmd_args in order to fix a bug from NeMo repository (https://github.com/NVIDIA/NeMo).
 # the commit that should fix this issue in NeMo is : 5b296e8af832c67d361fdfb80a165db3affaf76a.
 # Once the new release of NeMoLauncher includes this commit (check by downloading the corresponding container and look inside /opt for this commit), ~training.model.position_embedding_type should be removed from the extra args
 extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"
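
Note (not part of the patches above): a minimal Python sketch of how the llama.toml settings could be expanded into Hydra-style overrides for NeMo-Launcher, assuming the test harness simply emits each [cmd_args] key as key=value and appends extra_cmd_args verbatim. The file path and the printed entry point are illustrative assumptions, not the actual CloudAI or NeMo-Launcher invocation.

# Rough sketch only (not the CloudAI implementation): expand the llama.toml
# [cmd_args] table and extra_cmd_args string into Hydra-style overrides.
import tomllib  # Python 3.11+; older interpreters can use the third-party "tomli" package

with open("conf/v0.6/general/test/llama.toml", "rb") as f:
    cfg = tomllib.load(f)

overrides = [f"{key}={value}" for key, value in cfg.get("cmd_args", {}).items()]
overrides += cfg.get("extra_cmd_args", "").split()

# Hypothetical entry point; the real command line is assembled by the
# NeMoLauncher test template, not by this sketch.
print("python launcher_scripts/main.py " + " ".join(overrides))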