Skip to content

Commit

Permalink
use gpus_per_node in system file instead of using try catch for gres …
Browse files Browse the repository at this point in the history
…require in command line
  • Loading branch information
Lily Wang committed Jan 15, 2025
1 parent 9e03b9b commit 35537fb
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 11 deletions.
8 changes: 3 additions & 5 deletions src/cloudai/util/docker_image_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ def check_docker_image_exists(self, docker_image_url: str, docker_image_filename
return DockerImageCacheResult(False, Path(), message)

def _import_docker_image(
self, srun_prefix: str, docker_image_url: str, docker_image_path: Path, retry: bool = False
self, srun_prefix: str, docker_image_url: str, docker_image_path: Path
) -> DockerImageCacheResult:
enroot_import_cmd = f"{srun_prefix} enroot import -o {docker_image_path} docker://{docker_image_url}"
logging.debug(f"Importing Docker image: {enroot_import_cmd}")
Expand All @@ -205,10 +205,6 @@ def _import_docker_image(
logging.debug(f"Command used: {enroot_import_cmd}, stdout: {p.stdout}, stderr: {p.stderr}")
return DockerImageCacheResult(True, docker_image_path.absolute(), success_message)
except subprocess.CalledProcessError as e:
# Retry enroot by adding `--gpus=1`` if cluster requires specifing GPU resource
if not retry and e.stderr and "Cannot find GPU specification" in e.stderr:
srun_prefix += " --gpus=1"
return self._import_docker_image(srun_prefix, docker_image_url, docker_image_path, True)
error_message = (
f"Failed to import Docker image {docker_image_url}. Command: {enroot_import_cmd}. Error: {e.stderr}"
)
Expand Down Expand Up @@ -251,6 +247,8 @@ def cache_docker_image(self, docker_image_url: str, docker_image_filename: str)
srun_prefix = f"srun --export=ALL --partition={self.system.default_partition}"
if self.system.account:
srun_prefix += f" --account={self.system.account}"
if self.system.gpus_per_node:
srun_prefix += " --gres=gpu:1"

return self._import_docker_image(srun_prefix, docker_image_url, docker_image_path)

Expand Down
22 changes: 16 additions & 6 deletions tests/test_docker_image_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from pathlib import Path
from unittest.mock import patch

import pytest

from cloudai.systems.slurm.slurm_system import SlurmSystem
from cloudai.util.docker_image_cache_manager import (
DockerImageCacheManager,
Expand Down Expand Up @@ -198,8 +200,12 @@ def test_ensure_docker_image_no_local_cache(slurm_system: SlurmSystem):
assert result.message == ""


def test_system_with_account(slurm_system: SlurmSystem):
slurm_system.account = "test_account"
@pytest.mark.parametrize(
"account, gpus_per_node", [(None, None), ("test_account", None), (None, 8), ("test_account", 8)]
)
def test_system_with_account(slurm_system: SlurmSystem, account, gpus_per_node):
slurm_system.account = account
slurm_system.gpus_per_node = gpus_per_node
slurm_system.install_path.mkdir(parents=True, exist_ok=True)

manager = DockerImageCacheManager(slurm_system)
Expand All @@ -209,11 +215,15 @@ def test_system_with_account(slurm_system: SlurmSystem):
res = manager.cache_docker_image("docker.io/hello-world", "docker_image.sqsh")
assert res.success

command = f"srun --export=ALL --partition={slurm_system.default_partition}"
if slurm_system.account:
command += f" --account={slurm_system.account}"
if slurm_system.gpus_per_node:
command += " --gres=gpu:1"
command += f" enroot import -o {slurm_system.install_path}/docker_image.sqsh docker://docker.io/hello-world"

mock_run.assert_called_once_with(
(
f"srun --export=ALL --partition={slurm_system.default_partition} --account={slurm_system.account} "
f"enroot import -o {slurm_system.install_path}/docker_image.sqsh docker://docker.io/hello-world"
),
command,
shell=True,
check=True,
capture_output=True,
Expand Down

0 comments on commit 35537fb

Please sign in to comment.