Update on "[BE] consolidate 4-GPU integration tests into 8-GPU tests and reduce frequency"


Previously, CI had two sets of GPU tests: (1) a 4-GPU test that ran hourly, and (2) an 8-GPU test that ran daily.
- The hourly frequency of (1) is unnecessary: apart from PR changes themselves, the only thing that can break CI is a new pytorch nightly release. This PR reduces the frequency of (1) to every 6 hours (4 times a day).
- Since the frequency of (1) is now close to that of (2), we consolidate them into a single yaml workflow (a sketch of the consolidated trigger follows this list). The overall cost may even be lower rather than higher, since the container is launched only once. It is also less confusing to show a single "integration tests" badge on the README.
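For illustration, the consolidated trigger could look like the sketch below. This is an assumption for readability only — the workflow file itself (integration_test_8gpu.yaml) is not among the three files shown in this diff, and the name and branch filter here are hypothetical:

```yaml
# Hypothetical sketch of the consolidated workflow trigger; the actual
# integration_test_8gpu.yaml may differ.
name: 8 GPU Integration Test
on:
  push:
    branches: [main]
  pull_request:
  schedule:
    - cron: "0 */6 * * *"  # every 6 hours, i.e. 4 times a day
```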

Results:
- CI used to take ~16 min for the 4-GPU test and ~10 min for the 8-GPU test.
- It now takes ~18.5 min for the single 8-GPU test; the main saving comes from pulling the Docker image only once (around 4 min).

[ghstack-poisoned]
tianyu-l committed Dec 17, 2024
2 parents 71b96ed + c51f424 commit da432eb
Showing 3 changed files with 20 additions and 6 deletions.
README.md — 1 addition, 1 deletion

```diff
@@ -1,4 +1,4 @@
-[![Integration Tests](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain)
+[![8 GPU Integration Test](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml/badge.svg?branch=main)](https://github.com/pytorch/torchtitan/actions/workflows/integration_test_8gpu.yaml?query=branch%3Amain)
 
 # torchtitan
 
```
scripts/generate/test_generate.py — 10 additions, 3 deletions

```diff
@@ -52,7 +52,7 @@ def apply_tp_minus_sp(model: nn.Module, tp_mesh: DeviceMesh):
         },
     )
 
-    for layer_id, transformer_block in model.layers.items():
+    for _, transformer_block in model.layers.items():
         layer_plan = {
             "attention.wq": ColwiseParallel(),
             "attention.wk": ColwiseParallel(),
@@ -81,6 +81,7 @@ def test_generate(
     batch_size: int = 1,
    top_k: Optional[int] = None,
    seed: Optional[int] = None,
+    deterministic: bool = False,
 ):
     init_logger()
     color = utils.Color
@@ -141,7 +142,7 @@ def test_generate(
     # sequences would require https://github.com/pytorch/torchtitan/pull/686
     apply_tp_minus_sp(model, world_mesh["tp"])
 
-    utils.set_determinism(world_mesh, device, seed, deterministic=(seed is not None))
+    utils.set_determinism(world_mesh, device, seed, deterministic)
 
     # materialize model
     model.to_empty(device=device_type)
@@ -272,8 +273,13 @@ def test_generate(
         "--top_k", type=int, help="Prune to select from top_k probabilities. Optional"
     )
     parser.add_argument("--seed", type=int, help="Random seed for reproducibility")
+    parser.add_argument(
+        "--deterministic",
+        action="store_true",
+        help="Use deterministic algorithms wherever possible, may be slower",
+    )
 
-    parser.add_argument("--prompt", type=str, help="Input prompt")
+    parser.add_argument("--prompt", type=str, default="", help="Input prompt")
 
     parser.add_argument(
         "--out",
@@ -293,6 +299,7 @@ def test_generate(
         batch_size=args.batch_size,
         top_k=args.top_k,
         seed=args.seed,
+        deterministic=args.deterministic,
     )
 
     if torch.distributed.is_initialized():
```
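The net effect in test_generate.py is that determinism is no longer inferred from the presence of a seed but is controlled by the new `--deterministic` flag. A minimal sketch of the decoupled call, assuming `utils` refers to `torchtitan.utils` as in this script and that `world_mesh` and `device` are set up by the surrounding code:

```python
# Sketch only: world_mesh and device come from the surrounding script.
# Before this change, deterministic was hard-coded to (seed is not None).

# Reproducible sampling without the perf cost of deterministic kernels:
utils.set_determinism(world_mesh, device, seed=42, deterministic=False)

# Bit-exact runs, opting into slower deterministic algorithms
# (what passing --deterministic now requests):
utils.set_determinism(world_mesh, device, seed=42, deterministic=True)
```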
torchtitan/utils.py — 9 additions, 2 deletions

```diff
@@ -53,7 +53,7 @@ def _warn_overwrite_env(env, val):
 
 
 def set_determinism(
-    world_mesh: DeviceMesh,
+    world_mesh: Optional[DeviceMesh],
     device: torch.device,
     seed: Optional[int] = None,
     deterministic: bool = False,
@@ -68,14 +68,21 @@
     Set Determinism flags for increased reproducibility with loss of performance.
     """
     if deterministic:
-        logger.info("Deterministic training enabled (expect perf degradation).")
+        logger.info("Deterministic algorithm enabled (expect perf degradation).")
         torch.use_deterministic_algorithms(True)
         torch.backends.cudnn.deterministic = True
         torch.backends.cudnn.benchmark = False
         # env var for deterministic CuBLAS
         # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
         os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+
+    if not world_mesh:
+        if seed is not None:
+            torch.manual_seed(seed)
+            os.environ["PYTHONHASHSEED"] = str(seed % 2**32)
+            logger.debug(f"Single-process job using seed: {seed}")
+        return
 
     # to ensure we can control which ranks have same or different seeds, all ranks agree on a starting seed.
     # if user provides one, we use this. Otherwise rank 0 rolls the dice and everyone else uses that.
     if seed is None:
```
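Since `world_mesh` is now `Optional`, `set_determinism` also works in single-process jobs. A minimal sketch of that new path, assuming torchtitan is installed and torchtitan/utils.py is importable as `torchtitan.utils`:

```python
import torch

from torchtitan import utils  # assumed import path for torchtitan/utils.py

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# With world_mesh=None, the new early-return path seeds only this process
# (torch.manual_seed + PYTHONHASHSEED) instead of coordinating seeds across ranks.
utils.set_determinism(None, device, seed=42)
```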
