Skip to content

Commit

Permalink
update save format (#28)
Browse files Browse the repository at this point in the history
* fix

* update settings

* Update dtype to torch.float32 in DiT model

* Remove unnecessary assert statement in DiT class

* Refactor logging and checkpoint saving in train_img.py

* Update save directory to include global step in ckpt_utils.py and log global step in train_img.py
  • Loading branch information
oahzxl authored Feb 23, 2024
1 parent ea1f83d commit 90e8d6c
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 2 deletions.
4 changes: 3 additions & 1 deletion opendit/utils/ckpt_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,13 @@ def save(
lr_scheduler: _LRScheduler,
epoch: int,
step: int,
+ global_step: int,
batch_size: int,
coordinator: DistCoordinator,
save_dir: str,
shape_dict: dict,
):
- save_dir = os.path.join(save_dir, f"epoch{epoch}-step{step}")
+ save_dir = os.path.join(save_dir, f"epoch{epoch}-global_step{global_step}")
os.makedirs(os.path.join(save_dir, "model"), exist_ok=True)

booster.save_model(model, os.path.join(save_dir, "model"), shard=True)
Expand All @@ -79,6 +80,7 @@ def save(
running_states = {
"epoch": epoch,
"step": step,
+ "global_step": global_step,
"sample_start_index": step * batch_size,
}
if coordinator.is_master():
Expand Down
5 changes: 4 additions & 1 deletion train_img.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,12 +244,15 @@ def main(args):
lr_scheduler,
epoch,
step + 1,
+ global_step + 1,
args.batch_size,
coordinator,
experiment_dir,
ema_shape_dict,
)
- logger.info(f"Saved checkpoint at epoch {epoch} step {step + 1} to {experiment_dir}")
+ logger.info(
+ f"Saved checkpoint at epoch {epoch} step {step + 1} global_step {global_step + 1} to {experiment_dir}"
+ )

# the continue epochs are not resumed, so we need to reset the sampler start index and start step
dataloader.sampler.set_start_index(0)
Expand Down

0 comments on commit 90e8d6c

Please sign in to comment.