Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not require enroot on head node (backport #323) #326

Closed
wants to merge 9 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Specify account while caching images
  • Loading branch information
amaslenn committed Jan 8, 2025
commit 2e560f0cf16c97f4ff283b540028dd742985ff48
Original file line number Diff line number Diff line change
@@ -33,9 +33,9 @@ def is_installed(self) -> InstallStatusResult:
if docker_image_result.success:
return InstallStatusResult(success=True)
else:
if self.docker_image_cache_manager.cache_docker_images_locally:
if self.docker_image_cache_manager.system.cache_docker_images_locally:
expected_docker_image_path = os.path.join(
self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
self.docker_image_cache_manager.system.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
)
return InstallStatusResult(
success=False,
Original file line number Diff line number Diff line change
@@ -39,9 +39,9 @@ def is_installed(self) -> InstallStatusResult:
if docker_image_result.success:
return InstallStatusResult(success=True)
else:
if self.docker_image_cache_manager.cache_docker_images_locally:
if self.docker_image_cache_manager.system.cache_docker_images_locally:
expected_docker_image_path = os.path.join(
self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
self.docker_image_cache_manager.system.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
)
return InstallStatusResult(
success=False,
Original file line number Diff line number Diff line change
@@ -39,9 +39,9 @@ def is_installed(self) -> InstallStatusResult:
if docker_image_result.success:
return InstallStatusResult(success=True)
else:
if self.docker_image_cache_manager.cache_docker_images_locally:
if self.docker_image_cache_manager.system.cache_docker_images_locally:
expected_docker_image_path = os.path.join(
self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
self.docker_image_cache_manager.system.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME
)
return InstallStatusResult(
success=False,
8 changes: 4 additions & 4 deletions src/cloudai/util/docker_image_cache_manager.py
Original file line number Diff line number Diff line change
@@ -236,10 +236,10 @@ def cache_docker_image(
logging.error(error_message)
return DockerImageCacheResult(False, "", error_message)

enroot_import_cmd = (
f"srun --export=ALL --partition={self.system.default_partition} "
f"enroot import -o {docker_image_path} docker://{docker_image_url}"
)
srun_prefix = f"srun --export=ALL --partition={self.system.default_partition}"
if self.system.account:
srun_prefix += f" --account={self.system.account}"
enroot_import_cmd = f"{srun_prefix} enroot import -o {docker_image_path} docker://{docker_image_url}"
logging.debug(f"Importing Docker image: {enroot_import_cmd}")

try:
29 changes: 27 additions & 2 deletions tests/test_docker_image_cache_manager.py
Original file line number Diff line number Diff line change
@@ -291,5 +291,30 @@ def test_check_docker_image_exists_with_both_valid_files(mock_exists, mock_isfil

result = manager.check_docker_image_exists("/tmp/existing_file.sqsh", "subdir", "docker_image.sqsh")
assert result.success
assert result.docker_image_path == "/tmp/existing_file.sqsh"
assert result.message == "Docker image file path is valid: /tmp/existing_file.sqsh."
assert result.docker_image_path is not None
assert str(result.docker_image_path) == "docker.io/hello-world"
assert not Path(result.docker_image_path).is_absolute()
assert result.message == ""


def test_system_with_account(slurm_system: SlurmSystem):
slurm_system.account = "test_account"
Path(slurm_system.install_path).mkdir(parents=True, exist_ok=True)

manager = DockerImageCacheManager(slurm_system)
manager._check_prerequisites = lambda docker_image_url: PrerequisiteCheckResult(True, "All prerequisites are met.")

with patch("subprocess.run") as mock_run:
res = manager.cache_docker_image("docker.io/hello-world", "subdir", "docker_image.sqsh")
assert res.success

mock_run.assert_called_once_with(
(
f"srun --export=ALL --partition={slurm_system.default_partition} --account={slurm_system.account} "
f"enroot import -o {slurm_system.install_path}/docker_image.sqsh docker://docker.io/hello-world"
),
shell=True,
check=True,
capture_output=True,
text=True,
)
5 changes: 3 additions & 2 deletions tests/test_slurm_install_strategy.py
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@
from unittest.mock import MagicMock, patch

import pytest

from cloudai import InstallStatusResult
from cloudai.schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy
from cloudai.schema.test_template.nemo_launcher.slurm_install_strategy import (
@@ -94,7 +95,7 @@ def test_is_installed_locally(self, strategy: NcclTestSlurmInstallStrategy):
)

def test_is_installed_remote(self, strategy: NcclTestSlurmInstallStrategy):
strategy.docker_image_cache_manager.cache_docker_images_locally = False
strategy.docker_image_cache_manager.system.cache_docker_images_locally = False

result = strategy.is_installed()

@@ -222,7 +223,7 @@ def test_is_installed_locally(self, strategy: UCCTestSlurmInstallStrategy):
)

def test_is_installed_remote(self, strategy: UCCTestSlurmInstallStrategy):
strategy.docker_image_cache_manager.cache_docker_images_locally = False
strategy.docker_image_cache_manager.system.cache_docker_images_locally = False

result = strategy.is_installed()