Commit 5d7761a

support Llama testing
jeffnvidia committed Jun 3, 2024
1 parent ee3c928 commit 5d7761a
Showing 4 changed files with 21 additions and 9 deletions.
5 changes: 5 additions & 0 deletions conf/v0.6/general/test/gpt.toml
@@ -0,0 +1,5 @@
name = "gpt"
description = "gpt"
test_template_name = "NeMoLauncher"

[cmd_args]
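
The empty [cmd_args] table means this test inherits every default from the NeMoLauncher test template (conf/v0.6/general/test_template/nemo_launcher.toml, shown further down). As a minimal, hypothetical sketch, an override would use the same quoted dotted-key style that llama.toml uses below:

[cmd_args]
"training.trainer.max_steps" = "500"  # would override the template default of "400" introduced later in this commit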
12 changes: 12 additions & 0 deletions conf/v0.6/general/test/llama.toml
@@ -0,0 +1,12 @@
name = "llama"
description = "Llama2 70b"
test_template_name = "NeMoLauncher"
# FIXME: ~training.model.position_embedding_type is a quick fix that should be removed once a newer container version is available
# the commit that should fix this issue is 5b296e8af832c67d361fdfb80a165db3affaf76a
extra_cmd_args = "~training.model.position_embedding_type +training.model.fsdp=True ~training.model.optim.bucket_cap_mb ~training.model.optim.overlap_grad_sync ~training.model.optim.overlap_param_sync ~training.model.optim.contiguous_grad_buffer training.model.virtual_pipeline_model_parallel_size=null training.model.megatron_amp_O2=False training.model.activations_checkpoint_num_layers=null training.model.gradient_accumulation_fusion=False training.model.use_cpu_initialization=True training.model.optim.name=fused_adam training.model.tokenizer.model=TOKENIZER_MODEL training.exp_manager.create_wandb_logger=False"

[cmd_args]
"training" = "llama/llama2_70b"
"training.trainer.max_steps" = "120"
"training.model.global_batch_size" = "256"
"training.model.pipeline_model_parallel_size" = "1"
3 changes: 0 additions & 3 deletions conf/v0.6/general/test/nemo_launcher.toml

This file was deleted.
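Its role is presumably taken over by the model-specific gpt.toml and llama.toml added above.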

10 changes: 4 additions & 6 deletions conf/v0.6/general/test_template/nemo_launcher.toml
@@ -33,7 +33,7 @@ name = "NeMoLauncher"
default = "8"

[cmd_args.training]
values = ["gpt3/40b_improved"]
values = ["gpt3/40b_improved", "llama/llama2_70b"]
default = "gpt3/40b_improved"
[cmd_args.training.exp_manager]
[cmd_args.training.exp_manager.create_checkpoint_callback]
@@ -42,9 +42,8 @@ name = "NeMoLauncher"

[cmd_args.training.trainer]
[cmd_args.training.trainer.max_steps]
type = "preset"
values = ["100", "500", "1000", "2000"]
default = "100"
type = "int"
default = "400"

[cmd_args.training.trainer.val_check_interval]
type = "preset"
@@ -62,8 +61,7 @@ name = "NeMoLauncher"

[cmd_args.training.model]
[cmd_args.training.model.global_batch_size]
type = "preset"
values = ["8", "16", "32", "128"]
type = "int"
default = "128"

[cmd_args.training.model.micro_batch_size]
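Switching max_steps and global_batch_size from type "preset" to type "int" drops the fixed values lists, so individual tests can pass arbitrary integers. The new llama.toml relies on exactly this:

[cmd_args]
"training.trainer.max_steps" = "120"        # 120 was not in the old preset list
"training.model.global_batch_size" = "256"  # 256 was not in the old preset list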
