diff --git a/.gitignore b/.gitignore index ab28bbee3..574f83391 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,6 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ +output +exp_data* \ No newline at end of file diff --git a/configs/dtr_gpt2_pretrain.py b/configs/dtr_gpt2_pretrain.py new file mode 100644 index 000000000..22b4f98cf --- /dev/null +++ b/configs/dtr_gpt2_pretrain.py @@ -0,0 +1,49 @@ +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from .common.models.gpt import pretrain_model as model +from .common.train import train +from .common.optim import optim +from .common.data.gpt_dataset import dataloader, tokenization + +from .common.models.graph import graph + +__dataset_root = "/share_nfs/sd_dataset/lph/datasets/libai_dataset" +vocab_file = f"{__dataset_root}/gpt2-vocab.json" +merge_files = f"{__dataset_root}/gpt2-merges.txt" +data_prefix = ( + f"{__dataset_root}/loss_compara_content_sentence" +) + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix +dataloader.test[0].dataset.data_prefix = data_prefix +dataloader.test[0].dataset.indexed_dataset.data_prefix = data_prefix + +# GPT-2 model config +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.num_attention_heads = 16 +model.cfg.hidden_size = 384 +model.cfg.ffn_hidden_size = 1536 +model.cfg.hidden_layers = 6 +model.cfg.max_seq_length = 1024 + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 1.5e-4 + +train.train_micro_batch_size = 4 +# train.train_micro_batch_size = 42 for a800 +train.amp.enabled = False + +train.evaluation.evaluator = LazyCall(PPLEvaluator)() + +train.output_dir = "./output/gpt2_output" +graph.enabled = False diff --git a/configs/dtr_gpt3_pretrain.py b/configs/dtr_gpt3_pretrain.py new file mode 100644 index 000000000..35cb5c2c5 --- /dev/null +++ b/configs/dtr_gpt3_pretrain.py @@ -0,0 +1,48 @@ +from libai.config import LazyCall +from libai.evaluation import PPLEvaluator +from .common.models.gpt import pretrain_model as model +from .common.train import train +from .common.optim import optim +from .common.data.gpt_dataset import dataloader, tokenization + +from .common.models.graph import graph + +__dataset_root = "/share_nfs/sd_dataset/lph/datasets/libai_dataset" +vocab_file = f"{__dataset_root}/gpt2-vocab.json" +merge_files = f"{__dataset_root}/gpt2-merges.txt" +data_prefix = ( + f"{__dataset_root}/loss_compara_content_sentence" +) + +tokenization.tokenizer.vocab_file = vocab_file +tokenization.tokenizer.merges_file = merge_files +dataloader.train.dataset[0].data_prefix = data_prefix +dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix +dataloader.test[0].dataset.data_prefix = data_prefix +dataloader.test[0].dataset.indexed_dataset.data_prefix = data_prefix + +# GPT-2 model config +model.cfg.embedding_dropout_prob = 0.1 +model.cfg.attention_dropout_prob = 0.1 +model.cfg.num_attention_heads = 32 +model.cfg.hidden_size = 4096 +model.cfg.ffn_hidden_size = 4096*4 +model.cfg.hidden_layers = 32 +model.cfg.max_seq_length = 1024 + +train.input_placement_device = "cpu" + +train.dist.pipeline_num_layers = model.cfg.hidden_layers + +for ds in dataloader.train.dataset: + ds.max_seq_length = model.cfg.max_seq_length + +optim.lr = 1.5e-4 + +train.train_micro_batch_size = 2 +train.amp.enabled = False + +train.evaluation.evaluator = LazyCall(PPLEvaluator)() + +train.output_dir = "./output/gpt3_output" +graph.enabled = False diff --git a/libai/engine/default.py b/libai/engine/default.py index 14a107167..ef8ba0bda 100644 --- a/libai/engine/default.py +++ b/libai/engine/default.py @@ -44,6 +44,51 @@ # https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py # -------------------------------------------------------- +def count_all_parameters(model, verbose=False): + """ + Count total, trainable, and non-trainable parameters in a PyTorch model. + Args: + model (nn.Module): The model to count parameters for. + verbose (bool, optional): Print detailed information if True. + Returns: + Tuple containing total, trainable, and non-trainable parameters, and percent trainable parameters. + """ + train_params, all_params = 0, 0 + for _, param in model.named_parameters(): + num_params = param.numel() + all_params += num_params + if param.requires_grad: + train_params += num_params + nontrain_params = all_params - train_params + pct_train_params = train_params / all_params * 100 + if verbose: + logger = logging.getLogger(__name__) + logger.info(f"Total params: {format_size(all_params)}") + logger.info(f"Trainable params: {format_size(train_params)}") + logger.info(f"Non-trainable params: {format_size(nontrain_params)}") + logger.info(f"Trainable params %: {pct_train_params:.4f}") + return all_params, train_params, nontrain_params, pct_train_params + + +def format_size(size): + """ + Convert bytes to a human-readable string with appropriate units. + Args: + size (int): The number of bytes. + Returns: + String representing the number of bytes with appropriate units. + """ + k, m, b, t = 1024, 1024**2, 10**9, 10**12 + if size > t: + return f"{round(size / t, 4)}T" + elif size > b: + return f"{round(size / b, 4)}B" + elif size > m: + return f"{round(size / m, 4)}M" + elif size > k: + return f"{round(size / k, 4)}K" + else: + return f"{size}" def _highlight(code, filename): try: @@ -200,6 +245,18 @@ def default_setup(cfg, args): _compile_dependencies() +class EmtypOptimizer(): + def __init__(self,cfg): + pass + + def clip_grad(self): + pass + + def step(self): + pass + + def zero_grad(self): + pass class DefaultTrainer(TrainerBase): """ @@ -314,8 +371,9 @@ def __init__(self, cfg): "Building time: {:.3f} seconds".format(time.time() - start_time) ) - self.optimizer = self.build_optimizer(cfg, self.model) - self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer) + # self.optimizer = self.build_optimizer(cfg, self.model) + self.optimizer = EmtypOptimizer(cfg) + # self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer) if cfg.graph.enabled: self.graph_train = self.build_graph( @@ -345,13 +403,14 @@ def __init__(self, cfg): lr_scheduler=self.lr_scheduler, ) else: - self.checkpointer = Checkpointer( - # Assume you want to save checkpoints together with logs/statistics - self.model, - cfg.train.output_dir, - optimizer=self.optimizer, - lr_scheduler=self.lr_scheduler, - ) + pass + # self.checkpointer = Checkpointer( + # # Assume you want to save checkpoints together with logs/statistics + # self.model, + # cfg.train.output_dir, + # optimizer=self.optimizer, + # lr_scheduler=self.lr_scheduler, + # ) # Loading checkpoint before dataloader construction, because # dataloader needs to know the consumed iterations from @@ -415,12 +474,12 @@ def build_hooks(self): ret = [ hooks.IterationTimer(), - hooks.LRScheduler(), # for beauty lr scheduler printer in `nn.Graph` mode - hooks.PeriodicCheckpointer( - self.checkpointer, - self.cfg.train.checkpointer.period, - max_to_keep=self.cfg.train.checkpointer.max_to_keep, - ), + # hooks.LRScheduler(), # for beauty lr scheduler printer in `nn.Graph` mode + # hooks.PeriodicCheckpointer( + # self.checkpointer, + # self.cfg.train.checkpointer.period, + # max_to_keep=self.cfg.train.checkpointer.max_to_keep, + # ), ] if self.cfg.train.evaluation.enabled: @@ -563,6 +622,7 @@ def build_model(cls, cfg): model = build_model(cfg.model) logger = logging.getLogger(__name__) logger.info("Model:\n{}".format(model)) + count_all_parameters(model, verbose=True) model._apply(dist.convert_to_distributed_default_setting) return model diff --git a/libai/layers/attention.py b/libai/layers/attention.py index 0bec6ebc1..45fcb9bc4 100644 --- a/libai/layers/attention.py +++ b/libai/layers/attention.py @@ -19,6 +19,7 @@ import oneflow as flow from oneflow import nn +from oneflow.utils import checkpoint from .linear import Linear @@ -28,6 +29,88 @@ class AttnMaskType(enum.Enum): causal = 2 +class CoreAttention(nn.Module): + def __init__( + self, + hidden_size, + num_attention_heads, + attention_dropout_prob, + scale_mask_softmax_fusion, + attn_mask_type, + apply_query_key_layer_scaling=False, + layer_idx=0, + ) -> None: + super().__init__() + self.hidden_size = hidden_size + self.head_size = hidden_size // num_attention_heads + self.norm_factor = 1.0 / math.sqrt(float(self.head_size)) + self.use_cache = False + self.scale_mask_softmax_fusion = scale_mask_softmax_fusion + self.attn_mask_type = attn_mask_type + self.coeff = None + if apply_query_key_layer_scaling: + self.coeff = layer_idx + 1 + self.norm_factor /= self.coeff + self.attention_dropout_prob = attention_dropout_prob + self.dropout = nn.Dropout(p=attention_dropout_prob) + + def forward(self, query, key, value, attention_mask): + # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] + attention_scores = flow.matmul( + query, key, transpose_b=True, alpha=self.norm_factor + ) + + # [S(0), S(1)] x [S(0), B] = [S(0), S(1)] + if attention_mask is not None: + if self.scale_mask_softmax_fusion: + if self.attn_mask_type == AttnMaskType.padding: + attention_mask = ( + attention_mask.expand_as(attention_scores) + if self.use_cache + else attention_mask + ) + attention_weights = flow._C.fused_scale_mask_softmax_dropout( + attention_scores, + attention_mask, + fill_value=-10000.0, + scale=self.coeff, + p=self.attention_dropout_prob, + )[0] + else: + if self.coeff is not None: + attention_scores *= self.coeff + attention_scores = flow.mul(attention_scores, attention_mask) + attention_scores = attention_scores - 10000.0 * (1 - attention_mask) + # TODO(xingyu.liao): graph will occur `where_scalar` errors + # when using `masked_fill` + # attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0) + attention_weights = flow.softmax(attention_scores, dim=-1) + # [bsz, num_heads, tgt_len, src_len] + attention_weights = self.dropout(attention_weights) + else: + if ( + self.scale_mask_softmax_fusion + and self.attn_mask_type == AttnMaskType.causal + ): + attention_weights = flow._C.fused_scale_tril_softmax_mask_scale( + attention_scores, + p=self.attention_dropout_prob, + diagonal=0, + tril_scale_value=self.coeff, + tril_fill_value=-10000.0, + )[0] + else: + attention_weights = flow.softmax(attention_scores, dim=-1) + # [bsz, num_heads, tgt_len, src_len] + attention_weights = self.dropout(attention_weights) + + # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] + context = flow.matmul(attention_weights, value) + # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] + context = context.transpose(1, 2) + return context.flatten(2) + + class MultiheadAttention(nn.Module): """Multi-head attention layer, support self attention and cross attention. @@ -80,18 +163,9 @@ def __init__( self.num_heads = num_attention_heads self.head_size = hidden_size // num_attention_heads - self.attn_mask_type = attn_mask_type - - self.attention_dropout_prob = attention_dropout_prob - self.dropout = nn.Dropout(p=attention_dropout_prob) - self.norm_factor = 1.0 / math.sqrt(float(self.head_size)) - self.coeff = None - if apply_query_key_layer_scaling: - self.coeff = layer_idx + 1 - self.norm_factor /= self.coeff self.is_cross_attention = is_cross_attention - self.scale_mask_softmax_fusion = scale_mask_softmax_fusion + self.bias_dropout_fusion = bias_dropout_fusion if self.bias_dropout_fusion: @@ -131,6 +205,15 @@ def __init__( skip_bias_add=self.bias_dropout_fusion, layer_idx=layer_idx, ) + self.core_attention = CoreAttention( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + attention_dropout_prob=attention_dropout_prob, + scale_mask_softmax_fusion=scale_mask_softmax_fusion, + attn_mask_type=attn_mask_type, + apply_query_key_layer_scaling=apply_query_key_layer_scaling, + layer_idx=layer_idx, + ) def forward( self, @@ -158,7 +241,7 @@ def forward( use_cache (bool, optional): it will be set to True, when the model is in the inference phase and used for incremental decoding. Defaults to False. """ - + self.core_attention.use_cache = use_cache # hidden_states, encoder_states: [S(0), B] # attention_mask: [S(0), B] @@ -193,7 +276,9 @@ def forward( # hidden_states is the last-added state, # the full key and value could be obtained by concatenating with past_key_value. query_key_value = self.query_key_value(hidden_states) - query_key_value = query_key_value.view(bsz, -1, self.num_heads, 3 * self.head_size) + query_key_value = query_key_value.view( + bsz, -1, self.num_heads, 3 * self.head_size + ) query_key_value = query_key_value.permute( 0, 2, 1, 3 ) # [bsz, num_heads, src_len, 3 * head_size] @@ -207,58 +292,17 @@ def forward( if use_cache: past_key_value = (key, value) - # [bsz, num_heads, tgt_len, src_len] with [S(0), S(1)] - attention_scores = flow.matmul(query, key, transpose_b=True, alpha=self.norm_factor) - - # [S(0), S(1)] x [S(0), B] = [S(0), S(1)] - if attention_mask is not None: - if self.scale_mask_softmax_fusion: - if self.attn_mask_type == AttnMaskType.padding: - attention_mask = ( - attention_mask.expand_as(attention_scores) if use_cache else attention_mask - ) - attention_weights = flow._C.fused_scale_mask_softmax_dropout( - attention_scores, - attention_mask, - fill_value=-10000.0, - scale=self.coeff, - p=self.attention_dropout_prob, - )[0] - else: - if self.coeff is not None: - attention_scores *= self.coeff - attention_scores = flow.mul(attention_scores, attention_mask) - attention_scores = attention_scores - 10000.0 * (1 - attention_mask) - # TODO(xingyu.liao): graph will occur `where_scalar` errors - # when using `masked_fill` - # attention_scores = attention_scores.masked_fill(1 - attention_mask, -10000.0) - attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] - attention_weights = self.dropout(attention_weights) - else: - if self.scale_mask_softmax_fusion and self.attn_mask_type == AttnMaskType.causal: - attention_weights = flow._C.fused_scale_tril_softmax_mask_scale( - attention_scores, - p=self.attention_dropout_prob, - diagonal=0, - tril_scale_value=self.coeff, - tril_fill_value=-10000.0, - )[0] - else: - attention_weights = flow.softmax(attention_scores, dim=-1) - # [bsz, num_heads, tgt_len, src_len] - attention_weights = self.dropout(attention_weights) - - # Context shape: [bsz, num_heads, tgt_len, head_size] with [S(0), S(1)] - context = flow.matmul(attention_weights, value) - # Change shape: [bsz, num_heads, tgt_len, head_size] -> [bsz, tgt_len, num_heads, head_size] - context = context.transpose(1, 2) - # Concat multi-head results from # [bsz, tgt_len, num_heads, head_size] -> [bsz, tgt_len, num_heads * head_size] # SBP sign: [S(0), S(2)] # [S(0), S(2)] x [B, S(0)] = [S(0), P] -> [S(0), B] - output = self.dense(context.flatten(2)) + + # context = self.core_attention(query, key, value, attention_mask) + + context = checkpoint.checkpoint( + self.core_attention, query, key, value, attention_mask + ) + output = self.dense(context) if self.bias_dropout_fusion: output, bias = output diff --git a/libai/remat/__init__.py b/libai/remat/__init__.py new file mode 100644 index 000000000..2fed58a7c --- /dev/null +++ b/libai/remat/__init__.py @@ -0,0 +1,2 @@ +from .config import remat_argument_parser +from .trainer import RematTrainer \ No newline at end of file diff --git a/libai/remat/config.py b/libai/remat/config.py new file mode 100644 index 000000000..7e7f068bb --- /dev/null +++ b/libai/remat/config.py @@ -0,0 +1,7 @@ +from libai.config import default_argument_parser + + +def remat_argument_parser(epilog=None): + parser = default_argument_parser(epilog=epilog) + parser.add_argument("--threshold", type=int) + return parser diff --git a/libai/remat/trainer.py b/libai/remat/trainer.py new file mode 100644 index 000000000..9cdb4c439 --- /dev/null +++ b/libai/remat/trainer.py @@ -0,0 +1,15 @@ +from libai.engine import DefaultTrainer +import oneflow as flow + + +def sync(): + flow.comm.barrier() + + +class RematTrainer(DefaultTrainer): + def __init__(self, cfg): + super().__init__(cfg) + + def after_step(self): + sync() + super().after_step() diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py index e7914a0ad..59cc235b2 100644 --- a/libai/utils/distributed.py +++ b/libai/utils/distributed.py @@ -29,13 +29,14 @@ def _merge_devices(devices): num_gpus_per_node = get_world_size() // get_num_nodes() - node_devices = [node_id * num_gpus_per_node + device_id for node_id, device_id in devices] + node_devices = [ + node_id * num_gpus_per_node + device_id for node_id, device_id in devices + ] return node_devices class _DistributeUtil(object): def __init__(self, cfg): - self._init_distributed_env(cfg) self._init_parallel_size(cfg) self._init_placement_group(cfg) @@ -47,7 +48,10 @@ def _init_distributed_env(self, cfg): num_nodes = get_num_nodes() num_gpus_per_node = get_world_size() // num_nodes - if try_get_key(cfg, "num_gpus_per_node", default=num_gpus_per_node) != num_gpus_per_node: + if ( + try_get_key(cfg, "num_gpus_per_node", default=num_gpus_per_node) + != num_gpus_per_node + ): # This means key(num_gpus_per_node) saved in config is not equal # to environment variable. # Give user a warning about inconsistent reproduce environment. @@ -74,7 +78,6 @@ def _init_distributed_env(self, cfg): self._device_type = try_get_key(cfg, "device_type", default="cuda") def _init_parallel_size(self, cfg): - # tensor parallel size self._tensor_parallel_size = min(cfg.tensor_parallel_size, self.world_size) assert self.world_size % self._tensor_parallel_size == 0, ( @@ -121,7 +124,9 @@ def _init_parallel_size(self, cfg): if try_get_key(cfg, "pipeline_num_layers") is None: cfg.pipeline_num_layers = 10000 - self._model_parallel_size = self._pipeline_parallel_size * self._tensor_parallel_size + self._model_parallel_size = ( + self._pipeline_parallel_size * self._tensor_parallel_size + ) assert self.world_size % self._model_parallel_size == 0, ( f"world size ({self.world_size}) is not divisible by" @@ -152,21 +157,27 @@ def _init_placement_group(self, cfg): and cfg.pipeline_num_layers >= 8 and cfg.pipeline_num_layers % self._pipeline_parallel_size == 0 ): - temp_num_layers_per_stage = cfg.pipeline_num_layers // self._pipeline_parallel_size + temp_num_layers_per_stage = ( + cfg.pipeline_num_layers // self._pipeline_parallel_size + ) actual_pipeline_num_layers = cfg.pipeline_num_layers + min( self._pipeline_parallel_size - 1, temp_num_layers_per_stage ) else: actual_pipeline_num_layers = cfg.pipeline_num_layers - num_layers_per_stage = actual_pipeline_num_layers // self._pipeline_parallel_size + num_layers_per_stage = ( + actual_pipeline_num_layers // self._pipeline_parallel_size + ) stage_offset = actual_pipeline_num_layers % self._pipeline_parallel_size # stage_offset can make the later stages contain more layers when pipeline_num_layers # cannot be divided by pipeline_parallel_size. # This can make pipeline parallel more memory efficient. self._layer_stage_ids = [] - for i in range(0, actual_pipeline_num_layers - stage_offset, num_layers_per_stage): + for i in range( + 0, actual_pipeline_num_layers - stage_offset, num_layers_per_stage + ): stage_id = i // num_layers_per_stage if stage_id >= (self._pipeline_parallel_size - stage_offset): self._layer_stage_ids.append(stage_id) @@ -180,7 +191,9 @@ def _init_placement_group(self, cfg): self._layer_stage_ids = cfg.custom_pipeline_stage_id cfg.actual_pipeline_stage_id = self._layer_stage_ids - self._layer_ranks = [stages_devices[stage_id] for stage_id in self._layer_stage_ids] + self._layer_ranks = [ + stages_devices[stage_id] for stage_id in self._layer_stage_ids + ] def _init_parallel_hierarchy(self): if self.is_data_model_parallel(): @@ -228,7 +241,12 @@ def device_type(self): return self._device_type def set_device_type(self, device_type): - assert device_type in ["cpu", "cuda"], f"not supported for {device_type}" + assert device_type in [ + "cpu", + "cuda", + "cpu+remat", + "cuda+remat", + ], f"not supported for {device_type}" self._device_type = device_type def get_layer_ranks(self, layer_idx): @@ -316,6 +334,7 @@ def get_layer_placement(layer_idx, device_type=None): device_type = dist_util.device_type if device_type is None else device_type if not flow.cuda.is_available() and device_type == "cuda": device_type = "cpu" + return flow.placement.all("cuda+remat") return flow.placement( device_type, dist_util.get_layer_ranks(layer_idx), @@ -353,7 +372,9 @@ def get_hidden_sbp(): def get_data_parallel_rank(): dist_util = get_dist_util() - return (flow.env.get_rank() // dist_util.model_parallel_size) % dist_util.data_parallel_size + return ( + flow.env.get_rank() // dist_util.model_parallel_size + ) % dist_util.data_parallel_size def get_data_parallel_size(): @@ -429,10 +450,14 @@ def convert_to_distributed_default_setting(t): sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=get_layer_placement(0), ) + else: dist_util = get_dist_util() device_type = dist_util.device_type - return t.to_global(placement=flow.placement(device_type, ranks=t.placement.ranks)) + return t.to_global(flow.placement.all("cuda+remat"), flow.sbp.broadcast) + return t.to_global( + placement=flow.placement(device_type, ranks=t.placement.ranks) + ) def ttol(tensor, pure_local=False, ranks=None): @@ -443,7 +468,8 @@ def ttol(tensor, pure_local=False, ranks=None): tensor = tensor.to_global(placement=placement).to_local() else: tensor = tensor.to_global( - sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=placement + sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=placement, ).to_local() return tensor @@ -462,9 +488,12 @@ def tensor_to_rank0(tensor, device="cuda", to_local=False): assert device in ["cpu", "cuda"], f"not supported for device:{device}" if tensor.is_global: # Consider if it's 2d mesh, ranks should be [[0]] instead of [0] - placement = flow.placement(device, ranks=[0] if tensor.placement.ranks.ndim == 1 else [[0]]) + placement = flow.placement( + device, ranks=[0] if tensor.placement.ranks.ndim == 1 else [[0]] + ) tensor = tensor.to_global( - sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=placement + sbp=get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), + placement=placement, ) if to_local: tensor = ttol(tensor) diff --git a/tools/json_to_pic.py b/tools/json_to_pic.py new file mode 100644 index 000000000..abf3adb89 --- /dev/null +++ b/tools/json_to_pic.py @@ -0,0 +1,359 @@ +import json +import csv +import sys +import re +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +import os + + +our_times = [] +nlr_times = [] +no_allo_times = [] +no_lr_times = [] + + +cwd = Path.cwd() + + +def get_theo_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + return float(j["theoretically time"]) + + +def get_real_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + return float(j["real time"]) + + +def avg(x): + return sum(x) / len(x) + + +def get_searching_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + j = list(map(lambda x: x[2], j["overhead"])) + if len(j) == 0: + return 0 + return sum(j) + + +def get_mem_frag_rate_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + mem_frag_rate = j["mem frag rate"] + if mem_frag_rate is None: + mem_frag_rate = np.nan + mem_frag_rate = float(mem_frag_rate) + # return % + return mem_frag_rate * 100 + + +def get_dataset_time_from_json_file(fn): + with open(fn) as f: + j = json.load(f) + if 'real time' in j: + x = j["real time"] + if x != 0: + return x * 10**6 + x = j["dataset time"] + if x is not None: + x = float(x) + return x + + +def get_threshold_from_json_file(fn): + return int(fn[-9:][:4]) + # return int(fn[-10:][:5]) for a800 + if fn[:9] == 'overhead-': + fn = fn[:-5] + return int(fn.split('-')[-1]) + with open(fn) as f: + j = json.load(f) + t = float(j["threshold"][:-2]) + if t == 9900: + t = 10000 + return t + + +model_name = sys.argv[1] + + +def draw_one(ax, data, label, i, total, kind): + assert kind in ["step", "line", "bar"] + colors = { + 'Coop (Ours)': 'tab:blue', + 'Coop': 'tab:blue', + 'DTE': 'tab:orange', + 'DTR': 'tab:green', + 'No op-guided allocation': 'tab:brown', + 'No recomputable in-place': 'tab:purple', + 'No layout-aware eviction': 'tab:red', + } + markers = ["o", "*", "D", "^", "s"] + zorder = 200 - i + length = len(data) + data = list(zip(*data)) + if kind == "step": + x = ax.step( + *data, + where="post", + label=label, + linewidth=4, + marker=markers[i], + markevery=[True] + [False] * (length - 1), + ms=10, + zorder=zorder, + color=colors[label], + ) + elif kind == "line": + ax.plot( + *data, + label=label, + linewidth=3, + marker=markers[i], + markevery=1, + ms=10, + zorder=zorder, + ) + elif kind == "bar": + width = 0.25 + x = np.arange(len(data[0])) + bars = ax.bar( + x + (i - total / 2 + 0.5) * width, + np.where(np.isnan(data[1]), 0.5, data[1]), + width=width * 0.88, + label=label, + zorder=zorder, + ) + for i, bar in enumerate(bars): + if np.isnan(data[1][i]): + bar.set_color('gray') + + +def draw_from_files_and_draw( + *, + xlabel, + ylabel, + get_y, + pic_name, + name_to_legend_and_fn_patterns, + ncols, + nrows, + pic_kind, + data_kind, + imgcat=False, +): + assert len(name_to_legend_and_fn_patterns) == ncols * nrows + if data_kind in [DK_FRAG, DK_ABLA_FRAG]: + fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 6)) + elif data_kind == DK_TIME: + fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(28, 9), sharex=True, sharey='row') + elif data_kind == DK_ABLA: + fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(28, 9), sharex=True, sharey='row') + elif data_kind == DK_OH: + fig, axs = plt.subplots(ncols=ncols, nrows=nrows, figsize=(14, 6), sharex=True, sharey='row') + else: + raise ValueError("data_kind") + axs = axs.flatten() + + for i, ((name, resolution), legend_and_fn_patterns) in enumerate(name_to_legend_and_fn_patterns.items()): + ax = axs[i] + _draw_from_files_and_draw_in_subplot( + ax=ax, + xlabel=xlabel, + ylabel=ylabel, + get_y=get_y, + legend_and_fn_pattern=legend_and_fn_patterns, + pic_kind=pic_kind, + data_kind=data_kind, + name=name, + resolution=resolution, + index=i, + ) + left = 0.075 if data_kind in [DK_FRAG, DK_ABLA_FRAG] else 0.06 + if data_kind == DK_OH: + left = 0 + right = 1 + top = 1 + bottom = {DK_FRAG: 0.15, DK_ABLA: 0.18, DK_TIME: 0.11, DK_OH: 0.15}[data_kind] + subplot_x_center = (left + right) / 2 + subplot_y_center = (top + bottom) / 2 + if data_kind in [DK_TIME, DK_ABLA]: + fig.subplots_adjust(top=top, left=left, right=right, bottom=bottom, wspace=0.1, hspace=0.07) + elif data_kind == [DK_FRAG, DK_OH]: + fig.subplots_adjust(top=top, left=left, right=right, bottom=bottom, wspace=0.1) + handles, labels = axs[0].get_legend_handles_labels() # type: ignore + fig.legend(handles, labels, loc='upper center', ncol=3 if data_kind == DK_FRAG else 4, bbox_to_anchor=(subplot_x_center, 0.01), handlelength=1.5) + + if data_kind == DK_FRAG: + fig.supylabel(ylabel, y=subplot_y_center, fontsize=YLABEL_FONT_SIZE_FRAG, fontweight='bold') + else: + fig.supylabel(ylabel, y=subplot_y_center, fontweight='bold') + fig.supxlabel(xlabel, x=subplot_x_center, fontweight='bold') + + plt.savefig(pic_name, bbox_inches="tight") + if imgcat: + os.system(f"imgcat {pic_name}") + + +def _draw_from_files_and_draw_in_subplot( + *, ax, xlabel, ylabel, get_y, legend_and_fn_pattern, pic_kind, data_kind, name, resolution, index +): + data = {} + threshold_set = set() + for label, (fn_pattern, predicate) in legend_and_fn_pattern.items(): + match = re.compile(fn_pattern).match + fns = list( + filter(match, (str(x.name) for x in cwd.iterdir())) + ) + fns = list(filter(lambda fn: predicate(int(match(fn).group(1))), fns)) + thresholds = list(map(get_threshold_from_json_file, fns)) + threshold_set = threshold_set.union(thresholds) + data[label] = list(zip(thresholds, list(map(get_y, fns)))) + max_threshold = max(threshold_set) + min_threshold = min(threshold_set) + if data_kind in [DK_TIME, DK_ABLA]: + base_time = min(min(x[1] for x in d) for d in data.values()) + # dtr_base_time = min(x[1] for x in data["DTR"]) + for label, d in data.items(): + for i in range(len(d)): + # if label == 'DTR': + # d[i] = (d[i][0], d[i][1] / dtr_base_time) + # else: + d[i] = (d[i][0], d[i][1] / base_time) + + if pic_kind == "bar": + for threshold in threshold_set: + for label, d in data.items(): + if threshold not in list(zip(*d))[0]: + d.append((threshold, np.nan)) + for label in data: + data[label].sort(key=lambda x: x[0]) + # print(data[label]) + data[label] = list(map(lambda x: (x[0] / max_threshold, x[1]), data[label])) + # pop max_threshold because it doesn't have mem frag + # pop min_threshold because most data is None + if pic_kind == "bar" and data_kind in [DK_FRAG, DK_OH]: + data[label].pop() + threshold_set.discard(max_threshold) + del data[label][0] + threshold_set.discard(min_threshold) + + # print(f"max_threshold: {max_threshold}") + + for i, (label, d) in enumerate(data.items()): + draw_one(ax, d, label, i, len(data), pic_kind) + + if name[:5] == 'GPT-2' or name[:4] == 'BERT': + ax.plot(0.77, 1.25, label='SAR', color='tab:brown', markersize=15, marker='*', linestyle='None') + + if pic_kind in ["step", "line"]: + if data_kind in [DK_TIME, DK_ABLA]: + ax.set_xticks(np.arange(0, 1.1, 0.2)) + ax.set_xticks(np.arange(0, 1.1, 0.1), minor=True) + ax.set_xlim(left=0.15, right=1.1) + + flag = True + if index >= 4: + flag = False + if data_kind == DK_ABLA: + flag = True + if flag: + ax.set_ylim(top=1.57) + ax.text(1.07, 1.51, name, fontsize=24, fontweight="bold", ha='right') + ax.text(1.07, 1.49, resolution, fontsize=18, ha='right', va='top') + else: + ax.set_ylim(top=1.39) + ax.text(1.07, 1.34, name, fontsize=24, fontweight="bold", ha='right') + ax.text(1.07, 1.32, resolution, fontsize=18, ha='right', va='top') + + ax.grid(which='both') + if pic_kind in ["bar"]: + x = list(map(lambda x: x / max_threshold, sorted(list(threshold_set)))) + ax.set_xticks(np.arange(len(x)), x) + ax.grid(axis="y") + ax.set_title(name, fontsize=21, fontweight='bold', pad=12) + + +DK_ABLA_FRAG = "abfrag" +DK_FRAG = "frag" +DK_OH = "oh" +DK_TIME = "time" +DK_ABLA = "ablation" +DK_FWS = "fws" +data_kind = sys.argv[3] if len(sys.argv) > 3 else DK_FRAG + +DEFAULT_FONT_SIZE = 22 +YLABEL_FONT_SIZE_FRAG = 20 if data_kind == DK_FRAG else None +plt.rcParams.update({"font.size": DEFAULT_FONT_SIZE}) + + +def divisible_by(n): + return lambda x: x % n == 0 + + +def unet_predicate(fn): + return divisible_by(1000)(fn) + +def gpt2_predicate(fn): + return divisible_by(850)(fn) + # return divisible_by(7168)(fn) + # return divisible_by(6500)(fn) + +def resnet50_predicate(fn): + return divisible_by(1000)(fn) + + +if data_kind == DK_TIME: + d = { + # ("GPT-2 (8)", "Sequence length 512"): ("bert-new2", unet_predicate), + # ("BERT Large (4)", "Sequence length 512"): ("bert-new2", unet_predicate), + # ("BiLSTM (2048)", "Input dimension 100,\nHidden dimension 256,\nSequence length 128"): ("lstm_text-new2", divisible_by(850)), + # # ("ResNet-152 (55)", "224x224"): ("resnet152-new2", unet_predicate), + # ("U-Net (5)", "460x608"): ("unet-new2", unet_predicate), + # ("Swin-T (40)", "224x224"): ("stn2", unet_predicate), + # ("ResNet-50 (115)", "224x224"): ("resnet50-new2", unet_predicate), + # ("Inception V3 (96)", "299x299"): ("inception_v3-new2", unet_predicate), + # ("DenseNet-121 (70)", "224x224"): ("densenet121-new2", unet_predicate), + + ("gpt", "32x32"): ("gpt2", gpt2_predicate), + ("gpt1", "32x32"): ("gpt2", gpt2_predicate), + ("gpt2", "32x32"): ("gpt2", gpt2_predicate), + ("gpt3", "32x32"): ("gpt2", gpt2_predicate), + ("gpt4", "32x32"): ("gpt2", gpt2_predicate), + ("gpt5", "32x32"): ("gpt2", gpt2_predicate), + ("gpt6", "32x32"): ("gpt2", gpt2_predicate), + ("gpt7", "32x32"): ("gpt2", gpt2_predicate), + } + name_to_legend_and_fn_patterns = {} + for k, v in d.items(): + if k == ("Swin-T (40)", "224x224"): + name_to_legend_and_fn_patterns[k] = { + "Coop (Ours)": (rf"{v[0]}-ours-(\d{{4,5}}).json", v[1]), + "DTE": (rf"{v[0]}-no-gp-(\d{{4,5}}).json", v[1]), + "DTR": (rf"{v[0]}-no-fbip-(\d{{4,5}}).json", v[1]), + } + continue + name_to_legend_and_fn_patterns[k] = { + "Coop (Ours)": (rf"{v[0]}-ours-(\d{{4,5}}).json", v[1]), + "DTE": (rf"{v[0]}-dte-our-impl-(\d{{4,5}}).json", v[1]), + "DTR": (rf"{v[0]}-dtr-no-free-(\d{{4,5}}).json", v[1]), + } + + draw_from_files_and_draw( + xlabel="Memory Ratio", + ylabel="Compute Overhead (x)", + get_y=get_dataset_time_from_json_file, + pic_name=f"compute-overhead-main.pdf", + name_to_legend_and_fn_patterns=name_to_legend_and_fn_patterns, + ncols=4, + nrows=2, + imgcat=True, + pic_kind=sys.argv[2], + data_kind=data_kind, + ) \ No newline at end of file diff --git a/tools/run_checkpoint.sh b/tools/run_checkpoint.sh new file mode 100755 index 000000000..7347fecc1 --- /dev/null +++ b/tools/run_checkpoint.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +set -ux + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +MODEL_NAME=gpt2 +RESULT_DIR=./exp_data_${MODEL_NAME} +THRESHOLD=9500 + +config_file=/home/dev/files/repos/libai-normal/libai/config/configs/${MODEL_NAME}_pretrain.py + + +CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-checkpointing ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ +python -m oneflow.distributed.launch \ +--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ +tools/train_remat.py --config-file $config_file --threshold $THRESHOLD --fast-dev-run \ No newline at end of file diff --git a/tools/run_remat_gpt2.sh b/tools/run_remat_gpt2.sh new file mode 100755 index 000000000..6b7b775bb --- /dev/null +++ b/tools/run_remat_gpt2.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +set -ux + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +MODEL_NAME=gpt2 +RESULT_DIR=./exp_data + + +# array=(4000 4500 5000 5500 6000 6500 7000 7500 8000 8500 9000 9500) +# array=(3500 4000 4500 5000 5500 6000 6500 7500 8500 9500) +array=(3400 4250 5100 5950 6800 7650 8500) +# array=(28672 35840 43008 50176 57344 64512 71680) for a800 + +config_file=/share_nfs/sd_dataset/lph/codes/libai_normal/configs/dtr_gpt2_pretrain.py + +METHOD=ours +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dte-our-impl +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dtr-no-free +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done \ No newline at end of file diff --git a/tools/run_remat_gpt3.sh b/tools/run_remat_gpt3.sh new file mode 100755 index 000000000..1d0a16b87 --- /dev/null +++ b/tools/run_remat_gpt3.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +set -ux + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +MODEL_NAME=gpt3 +RESULT_DIR=./exp_data + +array=(32500 39000 45500 52000 58500 65000) +config_file=/share_nfs/sd_dataset/lph/codes/libai_normal/configs/dtr_gpt3_pretrain.py + +METHOD=ours +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=2 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dte-our-impl +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTE=1 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done + +METHOD=dtr-no-free +for threshold in "${array[@]}"; do + CUDA_VISIBLE_DEVICES=0 ONEFLOW_REMAT_SUMMARY_FILE_PREFIX=${RESULT_DIR}/$MODEL_NAME-$METHOD-$threshold ENABLE_PROFILE_FOR_DTR=0 ONEFLOW_VM_MULTI_THREAD=0 ONEFLOW_DTR_GROUP_NUM=1 ONEFLOW_REMAT_HEURISTIC_DTR=1 \ + python -m oneflow.distributed.launch \ + --nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr 127.0.0.1 --master_port 12345 \ + tools/train_remat.py --config-file $config_file --threshold $threshold --fast-dev-run +done \ No newline at end of file diff --git a/tools/train_remat.py b/tools/train_remat.py new file mode 100644 index 000000000..a94f01ae3 --- /dev/null +++ b/tools/train_remat.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2021 The OneFlow Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import os +import random +import sys + +import numpy as np +import oneflow as flow +import argparse +import sys + +sys.path.append( + os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)) +) +from libai.config import LazyConfig +from libai.engine import DefaultTrainer, default_setup +from libai.remat import remat_argument_parser, RematTrainer +from libai.utils import distributed as dist + +logger = logging.getLogger("libai." + __name__) + + +def main(args): + cfg = LazyConfig.load(args.config_file) + cfg = LazyConfig.apply_overrides(cfg, args.opts) + default_setup(cfg, args) + + seed_for_rank = cfg.train.seed + flow.env.get_rank() + flow.manual_seed(seed_for_rank) + flow.cuda.manual_seed(seed_for_rank) + np.random.seed(seed_for_rank) + random.seed(seed_for_rank) + + if args.fast_dev_run: + cfg.train.train_epoch = 0 + cfg.train.train_iter = 10 + cfg.train.evaluation.enabled = False + cfg.train.log_period = 1 + cfg.train.input_placement_device = "cuda+remat" + + trainer = RematTrainer(cfg) + return trainer.train() + + +if __name__ == "__main__": + dist.set_device_type("cuda+remat") + args = remat_argument_parser().parse_args() + flow.remat.set_budget(f"{args.threshold}MB") + flow.remat.set_small_pieces_optimization(False) + main(args)