From b7a28e48029233cd05ef812bbb413e1d3e544a38 Mon Sep 17 00:00:00 2001 From: tigranfah Date: Wed, 11 Dec 2024 19:56:10 +0400 Subject: [PATCH 1/3] prepare for 8 gpu training --- submitit_train.py | 4 ++-- train_configs/llama3.2_1b.toml | 8 ++++---- train_configs/llama3.2_1b_conversion.toml | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/submitit_train.py b/submitit_train.py index 4e1a5000..322fc976 100644 --- a/submitit_train.py +++ b/submitit_train.py @@ -9,7 +9,7 @@ if __name__ == "__main__": executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j") - n_gpus = 6 + n_gpus = 8 node = "h100" executor.update_parameters( name="titan", @@ -17,7 +17,7 @@ gpus_per_node=n_gpus, nodes=1, mem_gb=80, - cpus_per_task=n_gpus * 6, + cpus_per_task=n_gpus * 12, slurm_additional_parameters={"partition": node}, ) diff --git a/train_configs/llama3.2_1b.toml b/train_configs/llama3.2_1b.toml index b6c0cd8b..37d77246 100644 --- a/train_configs/llama3.2_1b.toml +++ b/train_configs/llama3.2_1b.toml @@ -50,7 +50,7 @@ enable_valid = true dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini [dataloader] -num_workers = 2 +num_workers = 4 [experimental] pipeline_parallel_degree = 1 @@ -59,9 +59,9 @@ enable_async_tensor_parallel = false [checkpoint] enable_checkpoint = true save_folder = "yerevann/Llama-3.2-1B" -# load_folder = "meta-llama/Llama-3.2-1B" -load_folder = "yerevann/Llama-3.2-1B/7b98d06b463e45ea8db87d05" -load_at_step = 22000 +load_folder = "meta-llama/Llama-3.2-1B" +# load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847" +load_at_step = 40000 interval_type = "steps" interval = 2000 model_weights_only = false diff --git a/train_configs/llama3.2_1b_conversion.toml b/train_configs/llama3.2_1b_conversion.toml index c4128526..3c21ad11 100644 --- a/train_configs/llama3.2_1b_conversion.toml +++ b/train_configs/llama3.2_1b_conversion.toml @@ -52,9 +52,9 @@ enable_async_tensor_parallel = false enable_checkpoint = true # 
load_folder = "meta-llama/Llama-3.2-1B" # save_folder = "meta-llama/Llama-3.2-1B" -load_folder = "yerevann/Llama-3.2-1B/04711d5d4fad44df8b81bd20" -load_at_step = 40000 -save_folder = "hf/yerevann/Llama-3.2-1B/04711d5d4fad44df8b81bd20" +load_folder = "yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6" +load_at_step = 20000 +save_folder = "hf/yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6" interval_type = "steps" interval = 1000 model_weights_only = false From 5df7a5d1c06a5e04f820b0809101d491ded7e8d9 Mon Sep 17 00:00:00 2001 From: tigranfah Date: Mon, 16 Dec 2024 17:58:38 +0400 Subject: [PATCH 2/3] update hparam tuning script --- submitit_train.py | 2 +- submitit_train_hparam_tuning.py | 87 +++++++++++++++-------- train_configs/llama3.2_1b.toml | 8 +-- train_configs/llama3.2_1b_conversion.toml | 6 +- 4 files changed, 65 insertions(+), 38 deletions(-) diff --git a/submitit_train.py b/submitit_train.py index 322fc976..0de8e1e5 100644 --- a/submitit_train.py +++ b/submitit_train.py @@ -9,7 +9,7 @@ if __name__ == "__main__": executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j") - n_gpus = 8 + n_gpus = 6 node = "h100" executor.update_parameters( name="titan", diff --git a/submitit_train_hparam_tuning.py b/submitit_train_hparam_tuning.py index b0a0540d..fcae3293 100644 --- a/submitit_train_hparam_tuning.py +++ b/submitit_train_hparam_tuning.py @@ -1,19 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ import submitit -import datetime -import yaml -import os if __name__ == "__main__": executor = submitit.AutoExecutor(folder="~/slurm_jobs/titan/job_%j") - n_gpus = 8 + n_gpus = 6 + node = "h100" executor.update_parameters( - name="titan", timeout_min=3 * 60, + name="titan", + timeout_min=6 * 60, gpus_per_node=n_gpus, - nodes=1, mem_gb=80, cpus_per_task=n_gpus * 4, - slurm_additional_parameters={ - "partition": "h100" - } + nodes=1, + mem_gb=80, + cpus_per_task=n_gpus * 12, + slurm_additional_parameters={"partition": node}, ) hparams = { @@ -21,29 +26,51 @@ # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"], # "optimizer.lr": ["2.5e-4"], # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"], + "training.gradient_accumulation_steps": ["21", "25", "29", "33"], + "training.steps": ["31000", "26000", "22.500", "20000"], } jobs = [] with executor.batch(): for _ in range(1): - for hparam_name, value in hparams.items(): - for v in value: - # train_config = './train_configs/chemlactica_125m.toml' - # train_config = './train_configs/chemlactica_1.3b.toml' - train_config = './train_configs/llama3.2_1b.toml' - # train_config = './train_configs/debug_model.toml' - function = submitit.helpers.CommandFunction([ - 'python3', '-m', 'torch.distributed.run', - '--nproc_per_node', f'{n_gpus}', - '--rdzv_backend', 'c10d', - '--rdzv_endpoint', 'localhost:0', - '--local-ranks-filter', '0', - '--role', 'rank', '--tee', '3', - 'train.py', - '--job.config_file', train_config, - f'--{hparam_name}', v - ]) - print(' '.join(function.command)) - # subprocess.run(function.command) - job = executor.submit(function) - jobs.append(job) + length = len(list(hparams.values())[0]) + for i in range(length): + hparam_dict = {} + for key, values in hparams.items(): + hparam_dict[key] = values[i] + + # train_config = './train_configs/chemlactica_125m.toml' + # train_config = './train_configs/chemlactica_1.3b.toml' + train_config = "./train_configs/llama3.2_1b.toml" + # train_config = 
'./train_configs/debug_model.toml' + command_lst = [ + "python3", + "-m", + "torch.distributed.run", + "--nproc_per_node", + f"{n_gpus}", + "--rdzv_backend", + "c10d", + "--rdzv_endpoint", + "localhost:0", + "--local-ranks-filter", + "0", + "--role", + "rank", + "--tee", + "3", + "train.py", + "--job.config_file", + train_config, + ] + + # add the hparam + for key, value in hparam_dict.items(): + command_lst.append(f"--{key}") + command_lst.append(value) + + function = submitit.helpers.CommandFunction(command_lst) + print(" ".join(function.command)) + # subprocess.run(function.command) + job = executor.submit(function) + jobs.append(job) diff --git a/train_configs/llama3.2_1b.toml b/train_configs/llama3.2_1b.toml index 37d77246..d30c1a9b 100644 --- a/train_configs/llama3.2_1b.toml +++ b/train_configs/llama3.2_1b.toml @@ -30,11 +30,11 @@ lr = 6e-4 [training] batch_size = 10 -gradient_accumulation_steps = 16 +gradient_accumulation_steps = 21 seq_len = 2048 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps max_norm = 1.0 # grad norm clipping -steps = 40000 +steps = 31000 data_parallel_degree = -1 tensor_parallel_degree = 1 compile = true @@ -50,7 +50,7 @@ enable_valid = true dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini [dataloader] -num_workers = 4 +num_workers = 2 [experimental] pipeline_parallel_degree = 1 @@ -61,7 +61,7 @@ enable_checkpoint = true save_folder = "yerevann/Llama-3.2-1B" load_folder = "meta-llama/Llama-3.2-1B" # load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847" -load_at_step = 40000 +# load_at_step = 40000 interval_type = "steps" interval = 2000 model_weights_only = false diff --git a/train_configs/llama3.2_1b_conversion.toml b/train_configs/llama3.2_1b_conversion.toml index 3c21ad11..e4494694 100644 --- a/train_configs/llama3.2_1b_conversion.toml +++ b/train_configs/llama3.2_1b_conversion.toml @@ -52,9 +52,9 @@ enable_async_tensor_parallel = false enable_checkpoint = true # 
load_folder = "meta-llama/Llama-3.2-1B" # save_folder = "meta-llama/Llama-3.2-1B" -load_folder = "yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6" -load_at_step = 20000 -save_folder = "hf/yerevann/Llama-3.2-1B/faf448be3acd495db1f270f6" +load_folder = "yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8" +load_at_step = 40000 +save_folder = "hf/yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8" interval_type = "steps" interval = 1000 model_weights_only = false From a3a42fb9192a4d5f2f80629476f71c5ae94ee38c Mon Sep 17 00:00:00 2001 From: tigranfah Date: Wed, 18 Dec 2024 11:06:26 +0400 Subject: [PATCH 3/3] add llama 3.2 3b configs and files --- submitit_train.py | 2 +- submitit_train_hparam_tuning.py | 4 +- torchtitan/models/llama/__init__.py | 13 +++- train_configs/llama3.2_1b.toml | 4 +- train_configs/llama3.2_3b.toml | 76 +++++++++++++++++++++++ train_configs/llama3.2_3b_conversion.toml | 74 ++++++++++++++++++++++ 6 files changed, 166 insertions(+), 7 deletions(-) create mode 100644 train_configs/llama3.2_3b.toml create mode 100644 train_configs/llama3.2_3b_conversion.toml diff --git a/submitit_train.py b/submitit_train.py index 0de8e1e5..8bb54bfe 100644 --- a/submitit_train.py +++ b/submitit_train.py @@ -26,7 +26,7 @@ for _ in range(1): # train_config = './train_configs/chemlactica_125m.toml' # train_config = './train_configs/chemlactica_1.3b.toml' - train_config = "./train_configs/llama3.2_1b.toml" + train_config = "./train_configs/llama3.2_3b.toml" # train_config = './train_configs/debug_model.toml' function = submitit.helpers.CommandFunction( [ diff --git a/submitit_train_hparam_tuning.py b/submitit_train_hparam_tuning.py index fcae3293..610bc3fc 100644 --- a/submitit_train_hparam_tuning.py +++ b/submitit_train_hparam_tuning.py @@ -26,8 +26,8 @@ # "optimizer.lr": ["8e-4", "6e-4", "4e-4", "2e-4"], # "optimizer.lr": ["2.5e-4"], # "optimizer.lr": ["1e-4", "8e-5", "6e-5", "4e-5", "2e-5"], - "training.gradient_accumulation_steps": ["21", "25", "29", "33"], - 
"training.steps": ["31000", "26000", "22.500", "20000"], + # "training.gradient_accumulation_steps": ["21", "25", "29", "33"], + # "training.steps": ["31000", "26000", "22500", "20000"], } jobs = [] diff --git a/torchtitan/models/llama/__init__.py b/torchtitan/models/llama/__init__.py index 15b6b597..cff36e7c 100644 --- a/torchtitan/models/llama/__init__.py +++ b/torchtitan/models/llama/__init__.py @@ -37,7 +37,16 @@ n_heads=32, n_kv_heads=8, rope_theta=500000, - share_embeddings=True + share_embeddings=True, + ), + "3B": ModelArgs( + dim=3072, + n_layers=28, + n_heads=24, + n_kv_heads=8, + rope_theta=500000, + ffn_dim_multiplier=2 / 3, # in Llama3.2-3B dim is 3072, but ffn dim is 8192 + share_embeddings=True, ), "8B": ModelArgs( dim=4096, @@ -66,4 +75,4 @@ multiple_of=4096, rope_theta=500000, ), -} \ No newline at end of file +} diff --git a/train_configs/llama3.2_1b.toml b/train_configs/llama3.2_1b.toml index d30c1a9b..b9691e19 100644 --- a/train_configs/llama3.2_1b.toml +++ b/train_configs/llama3.2_1b.toml @@ -30,11 +30,11 @@ lr = 6e-4 [training] batch_size = 10 -gradient_accumulation_steps = 21 +gradient_accumulation_steps = 16 seq_len = 2048 warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps max_norm = 1.0 # grad norm clipping -steps = 31000 +steps = 40000 data_parallel_degree = -1 tensor_parallel_degree = 1 compile = true diff --git a/train_configs/llama3.2_3b.toml b/train_configs/llama3.2_3b.toml new file mode 100644 index 00000000..eb394458 --- /dev/null +++ b/train_configs/llama3.2_3b.toml @@ -0,0 +1,76 @@ +# torchtitan Config.toml + +[job] +dump_folder = "/nfs/h100/raid/chem/checkpoints" +description = "Llama 3.2 training" +use_for_integration_test = false + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +enable_color_printing = true +enable_aim = true +save_aim_folder = "aim" 
+ +[model] +name = "llama3" +flavor = "3B" +norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm +tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B-v1/" + +[optimizer] +name = "AdamW" +lr = 6e-4 + +[training] +batch_size = 6 +gradient_accumulation_steps = 28 +seq_len = 2048 +warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps +max_norm = 1.0 # grad norm clipping +steps = 40000 +data_parallel_degree = -1 +tensor_parallel_degree = 1 +compile = true +# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M) +# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K) +dataset = "chemlactica_train" +data_processing_style="chemlactica_style" +representation_type = "SMILES" + +[validation] +valid_freq = 2000 +enable_valid = true +dataset = "chemlactica_valid" # supported datasets: chemlactica_valid_mini + +[dataloader] +num_workers = 2 + +[experimental] +pipeline_parallel_degree = 1 +enable_async_tensor_parallel = false + +[checkpoint] +enable_checkpoint = true +save_folder = "yerevann/Llama-3.2-3B" +load_folder = "meta-llama/Llama-3.2-3B" +# load_folder = "yerevann/Llama-3.2-1B/ec943c9e63db4cf7b4a8b847" +# load_at_step = 40000 +interval_type = "steps" +interval = 2000 +model_weights_only = false +export_dtype = "float32" +async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"] + +[activation_checkpoint] +mode = 'none' # ['none', 'selective', 'full'] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[float8] +enable_float8_linear = false diff --git a/train_configs/llama3.2_3b_conversion.toml b/train_configs/llama3.2_3b_conversion.toml new file mode 100644 index 00000000..699c1e11 --- /dev/null +++ b/train_configs/llama3.2_3b_conversion.toml @@ -0,0 +1,74 @@ +# torchtitan Config.toml + +[job] +dump_folder = "/nfs/h100/raid/chem/checkpoints" +description = "Llama 3.2 
training" +use_for_integration_test = false + +[profiling] +enable_profiling = false +save_traces_folder = "profile_trace" +profile_freq = 10 +enable_memory_snapshot = false +save_memory_snapshot_folder = "memory_snapshot" + +[metrics] +log_freq = 1 +enable_color_printing = true +enable_aim = false +save_aim_folder = "aim" + +[model] +name = "llama3" +flavor = "3B" +norm_type = "rmsnorm" # layernorm / np_layernorm / rmsnorm / compiled_rmsnorm / fused_rmsnorm +tokenizer_path = "torchtitan/tokenizers/Llama-3.2-chem-1B-v1" +# tokenizer_path = "meta-llama/Llama-3.2-1B" + +[optimizer] +name = "AdamW" +lr = 1.0e-4 + +[training] +batch_size = 1 +gradient_accumulation_steps = 3 +seq_len = 2048 +warmup_steps = 500 # lr scheduler warm up, normally 20% of the train steps +max_norm = 1.0 # grad norm clipping +steps = 10 +data_parallel_degree = -1 +tensor_parallel_degree = 1 +compile = false +# dataset = "c4" # supported datasets: c4_test (2K), c4 (177M) +# dataset = "chemlactica_train_mini" # supported datasets: c4_test (2K), c4 (177M), chemlactica_train_mini (4K) +dataset = "chemlactica_train" +data_processing_style="chemlactica_style" + +[experimental] +pipeline_parallel_degree = 1 +enable_async_tensor_parallel = false + +[checkpoint] +enable_checkpoint = true +load_folder = "meta-llama/Llama-3.2-3B" +save_folder = "meta-llama/Llama-3.2-3B" +# load_folder = "yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8" +load_at_step = 0 +# save_folder = "hf/yerevann/Llama-3.2-1B/e625b9a4b9784da4a63fa1a8" +interval_type = "steps" +interval = 1000 +model_weights_only = false +export_dtype = "float32" +async_mode = "async_with_pinned_mem" # ["disabled", "async", "async_with_pinned_mem"] + +[model_download_export] +to_titan = true +weights_source = "huggingface" +# to_hf = true + +[activation_checkpoint] +mode = 'none' # ['none', 'selective', 'full'] +selective_ac_option = '2' # 'int' = ac every positive int layer or 'op', ac based on ops policy + +[float8] +enable_float8_linear = false