Check swap #124

Closed · wants to merge 10 commits
2 changes: 1 addition & 1 deletion configs/debug/diloco.toml
@@ -15,5 +15,5 @@ total_steps = 4
fake = true

[diloco]
-inner_steps = 2
+inner_steps = 5

File renamed without changes.
19 changes: 19 additions & 0 deletions scripts/mem_use.py
@@ -0,0 +1,19 @@
import sys
import time
import torch


def allocate_memory(size_bytes):
    # Allocate a float32 tensor on CPU (4 bytes per element, hence the // 4)
    return torch.ones(size_bytes // 4, dtype=torch.float32)


if __name__ == "__main__":
    size_gb = float(sys.argv[1])
    size_bytes = int(size_gb * 1024 * 1024 * 1024)

    data = allocate_memory(size_bytes)
    print(f"Allocated {size_gb} GB of RAM using PyTorch")
    while True:
        time.sleep(1)
        print(f"Allocated {size_gb} GB of RAM using PyTorch, data.shape: {data.shape}")
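Not part of the diff, just a companion sketch: while scripts/mem_use.py holds its allocation, RAM and swap pressure can be watched from a second process with psutil (which swap_check.py below already depends on). The monitor below is hypothetical and only illustrates the idea:

import time

import psutil

# Hypothetical monitor: poll available RAM and swap usage once per second.
while True:
    vm = psutil.virtual_memory()
    sm = psutil.swap_memory()
    print(
        f"available RAM: {vm.available / 1024**3:.2f} GB | "
        f"swap used: {sm.used / 1024**3:.2f} GB ({sm.percent:.1f}%)"
    )
    time.sleep(1)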
File renamed without changes.
68 changes: 68 additions & 0 deletions scripts/swap_check.py
@@ -0,0 +1,68 @@
import torch
import psutil
import time


def get_total_ram():
    return psutil.virtual_memory().total


def get_available_ram():
    return psutil.virtual_memory().available


def allocate_memory(size_bytes):
    # Allocate a float32 tensor on CPU (4 bytes per element, hence the // 4)
    return torch.ones(size_bytes // 4, dtype=torch.float32)


def main():
    print("Starting memory allocation test...")

    total_ram = get_total_ram()
    print(f"Total physical RAM: {total_ram / (1024**3):.2f} GB")

    # Start at 50% of total RAM and grow by 10 percentage points per iteration
    initial_percentage = 50
    percentage_increment = 10
    current_percentage = initial_percentage

    tensors = []

    while True:
        try:
            available_ram = get_available_ram()
            allocation_size = int(total_ram * (current_percentage / 100))

            # Allocate memory and keep a reference so it stays resident
            tensor = allocate_memory(allocation_size)
            tensors.append(tensor)

            # Get current memory usage of this process
            process = psutil.Process()
            memory_info = process.memory_info()

            print(
                f"Allocated {allocation_size / (1024**2):.2f}MB ({current_percentage}% of total RAM). "
                f"Process memory used: {memory_info.rss / (1024**3):.2f}GB. "
                f"Available RAM: {available_ram / (1024**3):.2f}GB"
            )

            # Increase percentage for next iteration
            current_percentage += percentage_increment

            # Sleep to allow for monitoring
            time.sleep(1)

        except RuntimeError as e:
            print(f"Memory allocation failed: {e}")
            break
        except KeyboardInterrupt:
            print("Test stopped by user.")
            break

    print("Test completed. Check your system monitor to see if swap was used.")


if __name__ == "__main__":
    main()
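The defaults above grow the footprint quickly: every iteration allocates a fresh tensor sized at 50%, 60%, 70%, ... of total RAM while keeping all earlier tensors alive in tensors, so the cumulative allocation passes physical RAM on the second iteration and the OS is forced to swap. A small sketch of that arithmetic (not part of the diff, assumes the script's defaults):

# Percent of total RAM requested on each iteration: 50, 60, 70, 80, ...
percents = [50 + 10 * i for i in range(4)]
# Cumulative footprint after each iteration, as a percent of total RAM.
cumulative = [sum(percents[: i + 1]) for i in range(len(percents))]
print(cumulative)  # [50, 110, 180, 260] -> beyond physical RAM from the 2nd allocation on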
1 change: 1 addition & 0 deletions src/zeroband/comms.py
@@ -137,6 +137,7 @@ def _init_global_pg(self) -> None:
        )

        self._global_leader = self.world_info.global_rank == 0
+        self._logger.info(f"[{self.world_info.global_unique_id}] Global leader: {self._global_leader}")
        self.global_store = dist.TCPStore(
            host_name=self.world_info.global_addr,
            port=self.world_info.global_port + self.world_info.rank,
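The added log line records whether the local rank is the global leader (global rank 0), which is the rank expected to host the rendezvous store. For context, a stripped-down sketch (not part of the diff) of a leader/follower TCPStore setup of this kind; the address, port, timeout, and key are hypothetical placeholders, and tying is_master to the leader flag is an assumption about the surrounding code:

from datetime import timedelta

import torch.distributed as dist

global_rank = 0                # hypothetical rank; rank 0 plays the global leader
is_leader = global_rank == 0
store = dist.TCPStore(
    host_name="localhost",     # stands in for GLOBAL_ADDR
    port=12355,                # stands in for GLOBAL_PORT
    is_master=is_leader,       # only the leader hosts the store, followers connect to it
    timeout=timedelta(seconds=30),
)
store.set("status", "ready")   # ranks coordinate by reading and writing keys on the shared store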
29 changes: 16 additions & 13 deletions src/zeroband/train.py
@@ -330,19 +330,22 @@ def train(config: Config):
        loss_batch = 0
        z_loss_batch = 0

-        maybe_dest_rank = elastic_device_mesh.live_recovery.should_send_ckpt_to()
-        if maybe_dest_rank is not None:
-            logger.info(f"Start live recovery to rank {maybe_dest_rank}")
-            if config.train.log_model_hash:
-                logger.info(
-                    f"live recovery outer optimizer hash: {get_optimizer_signature(diloco.outer_optimizer)}"
-                )
-                logger.info(f"live recovery outer model hash: {get_tensor_list_signature(diloco.param_list_cpu)}")
-                logger.info(f"inner optimizer hash: {get_optimizer_signature(inner_optimizer)}")
-
-            ckpt_manager.send_ckpt_to_peer(elastic_device_mesh.global_pg, maybe_dest_rank)
-
-            elastic_device_mesh.live_recovery.reset()
+        if config.diloco is not None:
+            maybe_dest_rank = elastic_device_mesh.live_recovery.should_send_ckpt_to()
+            if maybe_dest_rank is not None:
+                logger.info(f"Start live recovery to rank {maybe_dest_rank}")
+                if config.train.log_model_hash:
+                    logger.info(
+                        f"live recovery outer optimizer hash: {get_optimizer_signature(diloco.outer_optimizer)}"
+                    )
+                    logger.info(
+                        f"live recovery outer model hash: {get_tensor_list_signature(diloco.param_list_cpu)}"
+                    )
+                    logger.info(f"inner optimizer hash: {get_optimizer_signature(inner_optimizer)}")
+
+                ckpt_manager.send_ckpt_to_peer(elastic_device_mesh.global_pg, maybe_dest_rank)
+
+                elastic_device_mesh.live_recovery.reset()

        for grad_acc_step in range(gradient_accumulation_steps):
            is_accumulating = grad_acc_step < gradient_accumulation_steps - 1
2 changes: 1 addition & 1 deletion tests/test_dist/test_comms.py
@@ -10,7 +10,7 @@
def test_elastic_device_mesh_no_global(world_size: int, random_available_port: int, mock_env):
    def foo(**kwargs):
        with mock_env(**kwargs):
-            edm = ElasticDeviceMesh()
+            edm = ElasticDeviceMesh(enable=False)

            rank = int(kwargs["RANK"])
            a = torch.arange(3) * (rank + 1)
27 changes: 19 additions & 8 deletions tests/test_torchrun/test_train.py
@@ -30,11 +30,12 @@ def gpus_to_use(num_nodes, num_gpu, rank):
return ",".join(map(str, range(rank * num_gpu, (rank + 1) * num_gpu)))


def _test_multi_gpu(num_gpus, config, extra_args=[]):
def _test_multi_gpu(num_gpus, config, extra_args=[], diloco=False):
num_nodes, num_gpu = num_gpus[0], num_gpus[1]

processes = []
ports = get_random_available_port_list(num_nodes)
new_port = get_random_available_port(1)
for i in range(num_nodes):
cmd = [
"torchrun",
Expand All @@ -47,7 +48,20 @@ def _test_multi_gpu(num_gpus, config, extra_args=[]):
        ]

        env = copy.deepcopy(os.environ)

+        if diloco:
+            new_env = {
+                "GLOBAL_RANK": str(i),
+                "GLOBAL_UNIQUE_ID": str(i),
+                "GLOBAL_ADDR": "localhost",
+                "GLOBAL_WORLD_SIZE": str(num_nodes),
+                "GLOBAL_PORT": str(new_port),
+            }
+            env.update(new_env)

        env["CUDA_VISIBLE_DEVICES"] = gpus_to_use(num_nodes, num_gpu, i)
+        env["ZERO_BAND_LOG_LEVEL"] = "DEBUG"

        process1 = subprocess.Popen(cmd, env=env)
        processes.append(process1)

@@ -62,10 +76,9 @@ def test_multi_gpu(num_gpus):
_test_multi_gpu(num_gpus, "debug/normal.toml")


@pytest.mark.parametrize("num_gpus", [[1, 2], [2, 2]])
@pytest.mark.parametrize("num_gpus", [[2, 1], [2, 2]])
def test_multi_gpu_diloco(num_gpus):
# we don't test 1,1 and 2,1 because 1 solo gpu failed with fsdp
_test_multi_gpu(num_gpus, "debug/diloco.toml")
_test_multi_gpu(num_gpus, "debug/diloco.toml", diloco=True)


def test_act_ckpt():
@@ -78,12 +91,10 @@ def test_act_ckpt_num():
_test_multi_gpu(num_gpus, "debug/normal.toml", extra_args=["--train.ac_ckpt", "2"])


@pytest.mark.parametrize(
"backend", [Compression.NO, Compression.UINT8]
) # not adding CINT8 because the compile is too slow
@pytest.mark.parametrize("backend", [Compression.NO, Compression.UINT8])
def test_all_reduce_diloco(backend: Compression):
num_gpus = [2, 1]
_test_multi_gpu(num_gpus, "debug/diloco.toml", extra_args=["--diloco.compression", backend.value])
_test_multi_gpu(num_gpus, "debug/diloco.toml", extra_args=["--diloco.compression", backend.value], diloco=True)


def test_z_loss():
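To make the new diloco branch in _test_multi_gpu concrete, here is a sketch (not part of the diff) of the global-rendezvous environment it injects for a hypothetical two-node run; the port value is made up, since the test draws a free one at runtime:

num_nodes = 2
new_port = 12356  # hypothetical; get_random_available_port(1) picks a real free port
for i in range(num_nodes):
    print({
        "GLOBAL_RANK": str(i),
        "GLOBAL_UNIQUE_ID": str(i),
        "GLOBAL_ADDR": "localhost",
        "GLOBAL_WORLD_SIZE": str(num_nodes),
        "GLOBAL_PORT": str(new_port),
    })

Both subprocesses share GLOBAL_ADDR and GLOBAL_PORT so they meet on the same global store, while GLOBAL_RANK and GLOBAL_UNIQUE_ID keep them distinct.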