From a64671b020d52c522093e872338713d94151c8ea Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Mon, 13 Jan 2025 19:49:02 -0500 Subject: [PATCH] Mount NCCL_TOPO_FILE in NCCL test --- .../test_template/nccl_test/slurm_command_gen_strategy.py | 7 +++---- .../test_nccl_slurm_command_gen_strategy.py | 6 +++--- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py index 3157c5d8..f26f214c 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_command_gen_strategy.py @@ -31,10 +31,9 @@ def _parse_slurm_args( base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr) container_mounts = "" - if "NCCL_TOPO_FILE" in env_vars and "DOCKER_NCCL_TOPO_FILE" in env_vars: - nccl_graph_path = Path(env_vars["NCCL_TOPO_FILE"]).resolve() - nccl_graph_file = env_vars["DOCKER_NCCL_TOPO_FILE"] - container_mounts = f"{nccl_graph_path}:{nccl_graph_file}" + if "NCCL_TOPO_FILE" in env_vars: + nccl_topo_file = Path(env_vars["NCCL_TOPO_FILE"]).resolve() + container_mounts = f"{nccl_topo_file}:{nccl_topo_file}" elif "NCCL_TOPO_FILE" in env_vars: del env_vars["NCCL_TOPO_FILE"] diff --git a/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py index c32ede71..33dd60c7 100644 --- a/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py +++ b/tests/slurm_command_gen_strategy/test_nccl_slurm_command_gen_strategy.py @@ -35,12 +35,12 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGen [ ( "nccl_test", - {"NCCL_TOPO_FILE": "/path/to/topo", "DOCKER_NCCL_TOPO_FILE": "/docker/topo"}, + {"NCCL_TOPO_FILE": "/path/to/topo"}, {"subtest_name": "all_reduce_perf", "docker_image_url": "fake_image_url"}, 2, ["node1", "node2"], { - "container_mounts": "/path/to/topo:/docker/topo", + "container_mounts": "/path/to/topo:/path/to/topo", }, ), ( @@ -50,7 +50,7 @@ def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NcclTestSlurmCommandGen 1, ["node1"], { - "container_mounts": "", + "container_mounts": "/path/to/topo:/path/to/topo", }, ), ],