From 44219fbfc7638870235ef6252c8779523f796293 Mon Sep 17 00:00:00 2001 From: Sami Jaghouar Date: Thu, 14 Nov 2024 05:38:39 +0000 Subject: [PATCH] allow to do remote data load with live reco --- src/zeroband/checkpoint.py | 17 ++++++++++------- src/zeroband/train.py | 4 ++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/zeroband/checkpoint.py b/src/zeroband/checkpoint.py index f5699dff..de25afe5 100644 --- a/src/zeroband/checkpoint.py +++ b/src/zeroband/checkpoint.py @@ -461,20 +461,23 @@ def load( if not skip_dataloader: if self.config.remote_data_load: - remote_data_path = os.path.join(self.config.remote_data_path, f"data_{self.data_rank}", "latest") - id_ = uuid.uuid4() - dest = f"/tmp/zeroband/data_{id_}" - rsync_fsspec(remote_data_path, os.path.join(dest, "data")) - data_path = dest + self.remote_data_load() else: data_path = resume_ckpt_path if data_path is None else data_path - - self._load_data(data_path) + self._load_data(data_path) self._init_state() self._logger.info(f"Loaded checkpoint from {resume_ckpt_path} in {time.perf_counter() - time_start} seconds") + def remote_data_load(self): + remote_data_path = os.path.join(self.config.remote_data_path, f"data_{self.data_rank}", "latest") + id_ = uuid.uuid4() + dest = f"/tmp/zeroband/data_{id_}" + rsync_fsspec(remote_data_path, os.path.join(dest, "data")) + data_path = dest + self._load_data(data_path) + @torch.no_grad() def recv_ckpt_from_peer(self, global_pg: dist.ProcessGroup): assert self.diloco_offloaded_param_list is not None, "recv_ckpt_from_peers is only supported with diloco" diff --git a/src/zeroband/train.py b/src/zeroband/train.py index b7c46b97..af40bc45 100644 --- a/src/zeroband/train.py +++ b/src/zeroband/train.py @@ -332,6 +332,10 @@ def train(config: Config): logger.info(f"inner optimizer hash: {get_optimizer_signature(inner_optimizer)}") need_live_recovery = False + + if config.ckpt.remote_data_load: + ckpt_manager.remote_data_load() + logger.info("live recovery done in %f", time.perf_counter() - time_start_live_recovery) # at the beginning of the inner steps we allow joiner to arrive.