forked from huggingface/picotron
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcreate_config.py
136 lines (117 loc) · 5.76 KB
/
create_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
"""
python create_config.py --out_dir tmp --exp_name test_2_node --tp 2 --cp 2 --pp 2 --dp 2 --model_name HuggingFaceTB/SmolLM-360M-Instruct --num_attention_heads 16 --num_key_value_heads 4 --grad_acc_steps 1 --mbs 32 --seq_len 4096 --use_wandb
"""
import os
from copy import deepcopy
from transformers import AutoConfig
import shutil
import argparse
import json
from typing import Optional
from picotron.utils import download_model
def create_single_config(
    out_dir: str,
    tp: int,
    cp: int,
    dp: int,
    pp: int,
    pp_engine: str,
    model_name: str,
    num_hidden_layers: Optional[int],
    num_attention_heads: Optional[int],
    num_key_value_heads: Optional[int],
    grad_acc_steps: int,
    mbs: int,
    seq_len: int,
    subset_name: Optional[str],
    exp_name: str,
    use_wandb: bool = False,
    use_cpu: bool = False,
    use_fused_adam: bool = False,
    hf_token: Optional[str] = None,
) -> None:
    """Write ``<out_dir>/<exp_name>/config.json`` from the base template.

    The template is read from ``template/base_config.json`` (resolved against
    the current working directory), selected fields are overwritten with the
    arguments below, and the result is dumped as indented JSON. An existing
    ``<out_dir>/<exp_name>`` directory is deleted and recreated first.

    Args:
        out_dir: Parent directory for experiment folders; created if missing.
        tp: Stored as ``distributed.tp_size``.
        cp: Stored as ``distributed.cp_size``.
        dp: Stored as ``distributed.dp_size``; also used for the printed
            global batch size.
        pp: Stored as ``distributed.pp_size``.
        pp_engine: Stored as ``distributed.pp_engine``.
        model_name: Stored as ``model.name`` and used to look up the model
            config when any of the three overrides below is ``None``.
        num_hidden_layers: Override for ``model.num_hidden_layers``; ``None``
            takes the value from the looked-up model config.
        num_attention_heads: Override for ``model.num_attention_heads``;
            ``None`` takes the value from the looked-up model config.
        num_key_value_heads: Override for ``model.num_key_value_heads``;
            ``None`` takes the value from the looked-up model config.
        grad_acc_steps: Stored as ``training.gradient_accumulation_steps``.
        mbs: Stored as ``training.micro_batch_size``.
        seq_len: Stored as ``training.seq_length``.
        subset_name: Stored as ``dataset.subset_name``.
        exp_name: Name of the run folder; also stored as ``logging.run_name``.
        use_wandb: Stored as ``logging.use_wandb``.
        use_cpu: Stored as ``distributed.use_cpu``. When true, also sets
            ``environment.FLASH_ATTEN`` to ``"0"`` and ``distributed.backend``
            to ``"gloo"``.
        use_fused_adam: Stored as ``model.use_fused_adam``.
        hf_token: Stored as ``environment.HF_TOKEN`` and forwarded to the
            model-config lookup.

    Raises:
        OSError: If the template cannot be read or the run directory cannot
            be removed, created, or written.
        KeyError: If the template lacks one of the expected top-level
            sections.
    """
    run_path = os.path.join(out_dir, exp_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)

    # json.load returns a fresh object on every call, so no defensive copy of
    # the template is needed before mutating it.
    with open("template/base_config.json", "r", encoding="utf-8") as f:
        config_content = json.load(f)

    # NOTE(review): the token is persisted in plain text inside config.json.
    config_content["environment"]["HF_TOKEN"] = hf_token
    config_content["training"]["seq_length"] = seq_len
    config_content["checkpoint"]["save_dir"] = run_path
    config_content["dataset"]["subset_name"] = subset_name

    config_content["model"]["name"] = model_name
    model_overrides = {
        "num_hidden_layers": num_hidden_layers,
        "num_attention_heads": num_attention_heads,
        "num_key_value_heads": num_key_value_heads,
    }
    # Only look the model config up when at least one value has to come from
    # it; with all three overrides supplied no lookup is required at all.
    if any(value is None for value in model_overrides.values()):
        # Forward the token so the lookup uses the same credentials the
        # caller supplied for this model.
        hub_config = AutoConfig.from_pretrained(model_name, token=hf_token)
        model_overrides = {
            key: getattr(hub_config, key) if value is None else value
            for key, value in model_overrides.items()
        }
    config_content["model"].update(model_overrides)
    config_content["model"]["use_fused_adam"] = use_fused_adam

    distributed = config_content["distributed"]
    distributed["tp_size"] = tp
    distributed["cp_size"] = cp
    distributed["dp_size"] = dp
    distributed["pp_size"] = pp
    distributed["pp_engine"] = pp_engine
    distributed["use_cpu"] = use_cpu

    if use_cpu:
        config_content["environment"]["FLASH_ATTEN"] = "0"
        distributed["backend"] = "gloo"

    config_content["logging"]["use_wandb"] = use_wandb
    config_content["logging"]["run_name"] = exp_name

    # Informational summary only; these totals are not written to the config.
    gbs = dp * mbs * grad_acc_steps
    gbs_token = gbs * seq_len
    print(f"Gbs_token: {gbs_token:,}, Gbs: {gbs}, dp: {dp}, seq_len: {seq_len}, grad_acc_steps: {grad_acc_steps}, mbs: {mbs}")

    config_content["training"]["gradient_accumulation_steps"] = grad_acc_steps
    config_content["training"]["micro_batch_size"] = mbs

    # Start from an empty run directory so files from a previous run with the
    # same experiment name do not linger next to the new config.
    if os.path.exists(run_path):
        shutil.rmtree(run_path)
    os.makedirs(run_path)

    with open(os.path.join(run_path, "config.json"), "w", encoding="utf-8") as new_config:
        json.dump(config_content, new_config, indent=4)
if __name__ == "__main__":
    # Command-line entry point: parse the options, write the experiment
    # config, then call download_model for the selected model.
    cli = argparse.ArgumentParser()

    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help listing keeps its layout.
    option_specs = (
        ("--out_dir", dict(type=str, help="Output directory to store the configs", default="tmp")),
        ("--tp", dict(type=int, help="number of tensor parallelism", default=1)),
        ("--cp", dict(type=int, help="number of context parallelism", default=1)),
        ("--dp", dict(type=int, help="number of data parallelism", default=1)),
        ("--pp", dict(type=int, help="number of pipeline parallelism", default=1)),
        ("--pp_engine", dict(type=str, help="pipeline parallel engine", default="1f1b")),
        ("--model_name", dict(type=str, help="Model name to create configs for", default="HuggingFaceTB/SmolLM-360M-Instruct")),
        ("--num_hidden_layers", dict(type=int, help="Number of hidden layers", default=None)),
        ("--num_attention_heads", dict(type=int, help="Number of attention heads", default=None)),
        ("--num_key_value_heads", dict(type=int, help="Number of key value heads", default=None)),
        ("--grad_acc_steps", dict(type=int, help="grad accumulation", default=1)),
        ("--mbs", dict(type=int, help="micro batch size", default=1)),
        ("--seq_len", dict(type=int, help="Sequence length", default=1024)),
        ("--subset_name", dict(type=str, help="Subset name", default=None)),
        ("--exp_name", dict(type=str, help="Experiment name", default="dummy_exp")),
        ("--use_wandb", dict(action="store_true", help="Use wandb for logging")),
        ("--use_cpu", dict(action="store_true", help="Use CPU for training")),
        ("--use_fused_adam", dict(action="store_true", help="Use fused adam")),
        ("--hf_token", dict(type=str, help="HF token")),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    cli_args = cli.parse_args()

    # Every option's destination name matches a create_single_config
    # parameter, so the parsed namespace can be forwarded as keyword
    # arguments unchanged.
    create_single_config(**vars(cli_args))
    print("Configs created successfully! ✅")

    # download_model comes from picotron.utils; going by the message printed
    # below it fetches the model's SafeTensors files — verify against that
    # module.
    download_model(cli_args.model_name, cli_args.hf_token)
    print("SafeTensors files downloaded successfully! ✅")