From 2414b54712d5d241692c03c6a70e736e04a5459f Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Mon, 6 Jan 2025 06:10:27 -0800 Subject: [PATCH 1/9] Do not require enroot binary on head node --- src/cloudai/util/docker_image_cache_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudai/util/docker_image_cache_manager.py b/src/cloudai/util/docker_image_cache_manager.py index 6a302b652..28dd88680 100644 --- a/src/cloudai/util/docker_image_cache_manager.py +++ b/src/cloudai/util/docker_image_cache_manager.py @@ -238,7 +238,7 @@ def cache_docker_image( return DockerImageCacheResult(False, "", error_message) enroot_import_cmd = ( - f"srun --export=ALL --partition={self.partition_name} " + f"srun --export=ALL --partition={self.partition_name} --account=coreai_dlalgo_ci " f"enroot import -o {docker_image_path} docker://{docker_image_url}" ) logging.debug(f"Importing Docker image: {enroot_import_cmd}") @@ -286,7 +286,7 @@ def _check_prerequisites(self, docker_image_url: str) -> PrerequisiteCheckResult Returns: PrerequisiteCheckResult: Result of the prerequisite check. """ - required_binaries = ["enroot", "srun"] + required_binaries = ["srun"] missing_binaries = [binary for binary in required_binaries if not shutil.which(binary)] if missing_binaries: From 10a847c2853692f768d5bb0e7b3dc4e28ceb54c7 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 6 Jan 2025 15:46:52 +0100 Subject: [PATCH 2/9] Pass SlurmSystem into DockerImageCacheManager --- .../strategy/slurm_command_gen_strategy.py | 6 +- .../slurm/strategy/slurm_install_strategy.py | 6 +- .../util/docker_image_cache_manager.py | 43 +++--- tests/test_docker_image_cache_manager.py | 139 ++++++------------ 4 files changed, 72 insertions(+), 122 deletions(-) diff --git a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py index 71b5854fd..f9b11db4c 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_command_gen_strategy.py @@ -46,11 +46,7 @@ def __init__(self, system: SlurmSystem, env_vars: Dict[str, Any], cmd_args: Dict self.install_path = self.slurm_system.install_path self.default_env_vars.update(self.slurm_system.global_env_vars) - self.docker_image_cache_manager = DockerImageCacheManager( - self.slurm_system.install_path, - self.slurm_system.cache_docker_images_locally, - self.slurm_system.default_partition, - ) + self.docker_image_cache_manager = DockerImageCacheManager(self.slurm_system) docker_image_url_info = self.cmd_args.get("docker_image_url") if docker_image_url_info is not None: self.docker_image_url = docker_image_url_info.get("default") diff --git a/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py index 6507ef0fb..04a50ec0f 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py @@ -41,11 +41,7 @@ def __init__( super().__init__(system, env_vars, cmd_args) self.slurm_system = cast(SlurmSystem, self.system) self.install_path = self.slurm_system.install_path - self.docker_image_cache_manager = DockerImageCacheManager( - self.slurm_system.install_path, - self.slurm_system.cache_docker_images_locally, - self.slurm_system.default_partition, - ) + self.docker_image_cache_manager = DockerImageCacheManager(self.slurm_system) docker_image_url_info = self.cmd_args.get("docker_image_url") if docker_image_url_info is not None: self.docker_image_url = docker_image_url_info.get("default") diff --git a/src/cloudai/util/docker_image_cache_manager.py b/src/cloudai/util/docker_image_cache_manager.py index 28dd88680..65da30135 100644 --- a/src/cloudai/util/docker_image_cache_manager.py +++ b/src/cloudai/util/docker_image_cache_manager.py @@ -20,6 +20,8 @@ import subprocess import tempfile +from cloudai.systems import SlurmSystem + class PrerequisiteCheckResult: """ @@ -107,15 +109,11 @@ class DockerImageCacheManager: Manages the caching of Docker images for installation strategies. Attributes - install_path (str): The base installation path. - cache_docker_images_locally (bool): Whether to cache Docker image files locally. - partition_name (str): The partition name to use in the srun command. + system (SlurmSystem): The Slurm system configuration. """ - def __init__(self, install_path: str, cache_docker_images_locally: bool, partition_name: str) -> None: - self.install_path = install_path - self.cache_docker_images_locally = cache_docker_images_locally - self.partition_name = partition_name + def __init__(self, system: SlurmSystem) -> None: + self.system = system def ensure_docker_image( self, docker_image_url: str, subdir_name: str, docker_image_filename: str @@ -135,7 +133,7 @@ def ensure_docker_image( if image_check_result.success: return image_check_result - if self.cache_docker_images_locally: + if self.system.cache_docker_images_locally: return self.cache_docker_image(docker_image_url, subdir_name, docker_image_filename) return image_check_result @@ -155,13 +153,14 @@ def check_docker_image_exists( DockerImageCacheResult: Result of the Docker image existence check. """ logging.debug( - f"Checking if Docker image exists: docker_image_url={docker_image_url}, subdir_name={subdir_name}, " + f"Checking if Docker image exists: docker_image_url={docker_image_url}, " + f"subdir_name={subdir_name}, " f"docker_image_filename={docker_image_filename}, " - f"cache_docker_images_locally={self.cache_docker_images_locally}" + f"cache_docker_images_locally={self.system.cache_docker_images_locally}" ) # If not caching locally, return True. Defer checking URL accessibility to srun. - if not self.cache_docker_images_locally: + if not self.system.cache_docker_images_locally: return DockerImageCacheResult(True, docker_image_url, "") # Check if docker_image_url is a file path and exists @@ -171,12 +170,12 @@ def check_docker_image_exists( ) # Check if the cache file exists - if not os.path.exists(self.install_path): - message = f"Install path {self.install_path} does not exist." + if not os.path.exists(self.system.install_path): + message = f"Install path {self.system.install_path} does not exist." logging.debug(message) return DockerImageCacheResult(False, "", message) - subdir_path = os.path.join(self.install_path, subdir_name) + subdir_path = os.path.join(self.system.install_path, subdir_name) if not os.path.exists(subdir_path): message = f"Subdirectory path {subdir_path} does not exist." logging.debug(message) @@ -206,7 +205,7 @@ def cache_docker_image( Returns: DockerImageCacheResult: Result of the Docker image caching operation. """ - subdir_path = os.path.join(self.install_path, subdir_name) + subdir_path = os.path.join(self.system.install_path, subdir_name) docker_image_path = os.path.join(subdir_path, docker_image_filename) if os.path.isfile(docker_image_path): @@ -219,13 +218,13 @@ def cache_docker_image( logging.error(f"Prerequisite check failed: {prerequisite_check.message}") return DockerImageCacheResult(False, "", prerequisite_check.message) - if not os.path.exists(self.install_path): - error_message = f"Install path {self.install_path} does not exist." + if not os.path.exists(self.system.install_path): + error_message = f"Install path {self.system.install_path} does not exist." logging.error(error_message) return DockerImageCacheResult(False, "", error_message) - if not os.access(self.install_path, os.W_OK): - error_message = f"No permission to write in install path {self.install_path}." + if not os.access(self.system.install_path, os.W_OK): + error_message = f"No permission to write in install path {self.system.install_path}." logging.error(error_message) return DockerImageCacheResult(False, "", error_message) @@ -238,7 +237,7 @@ def cache_docker_image( return DockerImageCacheResult(False, "", error_message) enroot_import_cmd = ( - f"srun --export=ALL --partition={self.partition_name} --account=coreai_dlalgo_ci " + f"srun --export=ALL --partition={self.system.default_partition} " f"enroot import -o {docker_image_path} docker://{docker_image_url}" ) logging.debug(f"Importing Docker image: {enroot_import_cmd}") @@ -387,7 +386,7 @@ def uninstall_cached_image(self, subdir_name: str, docker_image_filename: str) - if not result.success: return result - subdir_path = os.path.join(self.install_path, subdir_name) + subdir_path = os.path.join(self.system.install_path, subdir_name) if os.path.isdir(subdir_path): try: if not os.listdir(subdir_path): @@ -414,7 +413,7 @@ def remove_cached_image(self, subdir_name: str, docker_image_filename: str) -> D Returns: DockerImageCacheResult: Result of the removal operation. """ - docker_image_path = os.path.join(self.install_path, subdir_name, docker_image_filename) + docker_image_path = os.path.join(self.system.install_path, subdir_name, docker_image_filename) if os.path.isfile(docker_image_path): try: os.remove(docker_image_path) diff --git a/tests/test_docker_image_cache_manager.py b/tests/test_docker_image_cache_manager.py index 1e9a29f5c..543b87b9d 100644 --- a/tests/test_docker_image_cache_manager.py +++ b/tests/test_docker_image_cache_manager.py @@ -15,8 +15,12 @@ # limitations under the License. import subprocess -from unittest.mock import MagicMock, patch +from pathlib import Path +from unittest.mock import patch +import pytest + +from cloudai.systems.slurm.slurm_system import SlurmSystem from cloudai.util.docker_image_cache_manager import ( DockerImageCacheManager, DockerImageCacheResult, @@ -24,12 +28,26 @@ ) +@pytest.fixture +def slurm_system(tmp_path: Path) -> SlurmSystem: + system = SlurmSystem( + name="test_system", + install_path=str(tmp_path / "install"), + output_path=str(tmp_path / "output"), + cache_docker_images_locally=True, + default_partition="main", + partitions={}, + ) + (tmp_path / "install").mkdir() + return system + + @patch("os.path.isfile") @patch("os.path.exists") @patch("os.access") -def test_ensure_docker_image_file_exists(mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - mock_isfile.return_value = True +def test_ensure_docker_image_file_exists(mock_access, mock_exists, mock_is_file, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) + mock_is_file.return_value = True mock_exists.return_value = True result = manager.ensure_docker_image("/tmp/existing_file.sqsh", "subdir", "docker_image.sqsh") assert result.success @@ -40,9 +58,9 @@ def test_ensure_docker_image_file_exists(mock_access, mock_exists, mock_isfile): @patch("os.path.isfile") @patch("os.path.exists") @patch("os.access") -def test_ensure_docker_image_url_cache_enabled(mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - mock_isfile.return_value = False +def test_ensure_docker_image_url_cache_enabled(mock_access, mock_exists, mock_is_file, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) + mock_is_file.return_value = False mock_exists.return_value = True mock_access.return_value = True with patch.object( @@ -61,23 +79,26 @@ def test_ensure_docker_image_url_cache_enabled(mock_access, mock_exists, mock_is @patch("os.path.isfile") @patch("os.path.exists") @patch("os.access") -@patch("os.makedirs") @patch("subprocess.run") @patch("cloudai.util.docker_image_cache_manager.DockerImageCacheManager._check_prerequisites") -def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_cache_docker_image( + mock_check_prerequisites, mock_run, mock_access, mock_exists, mock_is_file, slurm_system: SlurmSystem +): + manager = DockerImageCacheManager(slurm_system) # Test when cached file already exists - mock_isfile.return_value = True + mock_is_file.return_value = True result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") assert result.success - assert result.message == "Cached Docker image already exists at /fake/install/path/subdir/image.tar.gz." + assert result.docker_image_path == f"{slurm_system.install_path}/subdir/image.tar.gz" + assert result.message == f"Cached Docker image already exists at {slurm_system.install_path}/subdir/image.tar.gz." # Test creating subdirectory when it doesn't exist - mock_isfile.return_value = False - mock_exists.side_effect = [True, False, False] # install_path exists, subdir_path does not + mock_is_file.return_value = False + mock_exists.side_effect = [True, False] # install_path exists, subdir_path does not with patch("os.makedirs") as mock_makedirs: result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") + print(f"!! {result.message=}") mock_makedirs.assert_called_once_with("/fake/install/path/subdir") # Ensure prerequisites are always met for the following tests @@ -88,7 +109,7 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, m mock_exists.side_effect = None # Test caching success with subprocess command (removal of default partition keyword) - mock_isfile.return_value = False + mock_is_file.return_value = False mock_exists.side_effect = [True, True, True, True, True] # Ensure all path checks return True mock_run.return_value = subprocess.CompletedProcess(args=["cmd"], returncode=0, stderr="") result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") @@ -100,16 +121,16 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, m text=True, ) assert result.success - assert result.message == "Docker image cached successfully at /fake/install/path/subdir/image.tar.gz." + assert result.message == f"Docker image cached successfully at {slurm_system.install_path}/image.tar.gz." # Test caching failure due to subprocess error - mock_isfile.return_value = False + mock_is_file.return_value = False mock_run.side_effect = subprocess.CalledProcessError(1, "cmd") result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") assert not result.success # Test caching failure due to disk-related errors - mock_isfile.return_value = False + mock_is_file.return_value = False mock_run.side_effect = None mock_run.return_value = subprocess.CompletedProcess(args=["cmd"], returncode=1, stderr="Disk quota exceeded\n") mock_exists.side_effect = [True, True, True, True, True] @@ -127,8 +148,8 @@ def test_cache_docker_image(mock_check_prerequisites, mock_run, mock_makedirs, m @patch("os.path.exists") @patch("os.access") @patch("os.remove") -def test_remove_cached_image(mock_remove, mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_remove_cached_image(mock_remove, mock_access, mock_exists, mock_isfile, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) # Test successful removal mock_isfile.return_value = True @@ -154,8 +175,8 @@ def test_remove_cached_image(mock_remove, mock_access, mock_exists, mock_isfile) @patch("os.path.exists") @patch("os.access") @patch("os.remove") -def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfile, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) # Test successful uninstallation and subdirectory removal mock_isfile.return_value = True @@ -196,8 +217,8 @@ def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfi @patch("shutil.which") @patch("cloudai.util.docker_image_cache_manager.DockerImageCacheManager._check_docker_image_accessibility") -def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) # Ensure enroot and srun are installed mock_which.side_effect = lambda x: x in ["enroot", "srun"] @@ -234,72 +255,10 @@ def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which): assert "Docker image URL not accessible." in result.message -@patch("subprocess.Popen") -@patch("shutil.which") -@patch("tempfile.TemporaryDirectory") -def test_check_docker_image_accessibility_with_enroot(mock_tempdir, mock_which, mock_popen): - manager = DockerImageCacheManager("/fake/install/path", True, "default") - - # Ensure docker binary is not available - mock_which.return_value = None - - # Mocking the TemporaryDirectory context manager - mock_temp_dir = MagicMock() - mock_tempdir.return_value.__enter__.return_value = mock_temp_dir - temp_dir_path = "/fake/temp/dir" - mock_temp_dir.__enter__.return_value = temp_dir_path - - # Mock Popen for enroot command with success scenario - process_mock = MagicMock() - process_mock.stderr.readline.side_effect = [ - b"Found all layers in cache\n", # Simulate successful output - b"", - b"", - ] - process_mock.poll.side_effect = [None, None, 0] # Simulate process running then finishing - - mock_popen.return_value = process_mock - - result = manager._check_docker_image_accessibility("docker.io/hello-world") - assert result.success - assert result.message == "Docker image URL, docker.io/hello-world, is accessible." - - # Verify the command contains the required keywords - command = mock_popen.call_args[0][0] - assert "enroot import -o" in command - assert "docker://docker.io/hello-world" in command - - # Mock Popen for enroot command with failure scenario - process_mock.reset_mock() - process_mock.stderr.readline.side_effect = [ - b"[ERROR] URL https://nvcr.io/proxy_auth returned error code: 401 Unauthorized\n", # Simulate 401 error output - b"", - b"", - ] - process_mock.poll.side_effect = [None, None, 0] # Simulate process running then finishing - - mock_popen.return_value = process_mock - - result = manager._check_docker_image_accessibility("docker.io/hello-world") - assert not result.success - assert ( - "Failed to access Docker image URL: docker.io/hello-world. " - "Error: [ERROR] URL https://nvcr.io/proxy_auth returned error code: 401 Unauthorized\n" - "This error indicates that access to the Docker image URL is unauthorized. " - "Please ensure you have the necessary permissions and have followed the instructions in the README " - "for setting up your credentials correctly." in result.message.replace("\n\n", "\n") - ) - - # Verify the command contains the required keywords - command = mock_popen.call_args[0][0] - assert "enroot import -o" in command - assert "docker://docker.io/hello-world" in command - - @patch("os.path.isfile") @patch("os.path.exists") -def test_check_docker_image_exists_with_only_cached_file(mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_check_docker_image_exists_with_only_cached_file(mock_exists, mock_isfile, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) # Simulate only the cached file being a valid file path mock_isfile.side_effect = lambda path: path == "/fake/install/path/subdir/docker_image.sqsh" @@ -317,8 +276,8 @@ def test_check_docker_image_exists_with_only_cached_file(mock_exists, mock_isfil @patch("os.path.isfile") @patch("os.path.exists") -def test_check_docker_image_exists_with_both_valid_files(mock_exists, mock_isfile): - manager = DockerImageCacheManager("/fake/install/path", True, "default") +def test_check_docker_image_exists_with_both_valid_files(mock_exists, mock_isfile, slurm_system: SlurmSystem): + manager = DockerImageCacheManager(slurm_system) # Simulate both cached file and docker image URL being valid file paths mock_isfile.side_effect = lambda path: path in [ From 2e560f0cf16c97f4ff283b540028dd742985ff48 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 6 Jan 2025 15:57:08 +0100 Subject: [PATCH 3/9] Specify account while caching images --- .../jax_toolbox/slurm_install_strategy.py | 4 +-- .../nccl_test/slurm_install_strategy.py | 4 +-- .../ucc_test/slurm_install_strategy.py | 4 +-- .../util/docker_image_cache_manager.py | 8 ++--- tests/test_docker_image_cache_manager.py | 29 +++++++++++++++++-- tests/test_slurm_install_strategy.py | 5 ++-- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py b/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py index 29fd81c1d..257e8c2c6 100644 --- a/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/jax_toolbox/slurm_install_strategy.py @@ -33,9 +33,9 @@ def is_installed(self) -> InstallStatusResult: if docker_image_result.success: return InstallStatusResult(success=True) else: - if self.docker_image_cache_manager.cache_docker_images_locally: + if self.docker_image_cache_manager.system.cache_docker_images_locally: expected_docker_image_path = os.path.join( - self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME + self.docker_image_cache_manager.system.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME ) return InstallStatusResult( success=False, diff --git a/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py b/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py index e4706b488..f359048b6 100644 --- a/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/nccl_test/slurm_install_strategy.py @@ -39,9 +39,9 @@ def is_installed(self) -> InstallStatusResult: if docker_image_result.success: return InstallStatusResult(success=True) else: - if self.docker_image_cache_manager.cache_docker_images_locally: + if self.docker_image_cache_manager.system.cache_docker_images_locally: expected_docker_image_path = os.path.join( - self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME + self.docker_image_cache_manager.system.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME ) return InstallStatusResult( success=False, diff --git a/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py b/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py index a633848ab..81209b2db 100644 --- a/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/ucc_test/slurm_install_strategy.py @@ -39,9 +39,9 @@ def is_installed(self) -> InstallStatusResult: if docker_image_result.success: return InstallStatusResult(success=True) else: - if self.docker_image_cache_manager.cache_docker_images_locally: + if self.docker_image_cache_manager.system.cache_docker_images_locally: expected_docker_image_path = os.path.join( - self.docker_image_cache_manager.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME + self.docker_image_cache_manager.system.install_path, self.SUBDIR_PATH, self.DOCKER_IMAGE_FILENAME ) return InstallStatusResult( success=False, diff --git a/src/cloudai/util/docker_image_cache_manager.py b/src/cloudai/util/docker_image_cache_manager.py index 65da30135..a3ab2d77a 100644 --- a/src/cloudai/util/docker_image_cache_manager.py +++ b/src/cloudai/util/docker_image_cache_manager.py @@ -236,10 +236,10 @@ def cache_docker_image( logging.error(error_message) return DockerImageCacheResult(False, "", error_message) - enroot_import_cmd = ( - f"srun --export=ALL --partition={self.system.default_partition} " - f"enroot import -o {docker_image_path} docker://{docker_image_url}" - ) + srun_prefix = f"srun --export=ALL --partition={self.system.default_partition}" + if self.system.account: + srun_prefix += f" --account={self.system.account}" + enroot_import_cmd = f"{srun_prefix} enroot import -o {docker_image_path} docker://{docker_image_url}" logging.debug(f"Importing Docker image: {enroot_import_cmd}") try: diff --git a/tests/test_docker_image_cache_manager.py b/tests/test_docker_image_cache_manager.py index 543b87b9d..923d8b893 100644 --- a/tests/test_docker_image_cache_manager.py +++ b/tests/test_docker_image_cache_manager.py @@ -291,5 +291,30 @@ def test_check_docker_image_exists_with_both_valid_files(mock_exists, mock_isfil result = manager.check_docker_image_exists("/tmp/existing_file.sqsh", "subdir", "docker_image.sqsh") assert result.success - assert result.docker_image_path == "/tmp/existing_file.sqsh" - assert result.message == "Docker image file path is valid: /tmp/existing_file.sqsh." + assert result.docker_image_path is not None + assert str(result.docker_image_path) == "docker.io/hello-world" + assert not Path(result.docker_image_path).is_absolute() + assert result.message == "" + + +def test_system_with_account(slurm_system: SlurmSystem): + slurm_system.account = "test_account" + Path(slurm_system.install_path).mkdir(parents=True, exist_ok=True) + + manager = DockerImageCacheManager(slurm_system) + manager._check_prerequisites = lambda docker_image_url: PrerequisiteCheckResult(True, "All prerequisites are met.") + + with patch("subprocess.run") as mock_run: + res = manager.cache_docker_image("docker.io/hello-world", "subdir", "docker_image.sqsh") + assert res.success + + mock_run.assert_called_once_with( + ( + f"srun --export=ALL --partition={slurm_system.default_partition} --account={slurm_system.account} " + f"enroot import -o {slurm_system.install_path}/docker_image.sqsh docker://docker.io/hello-world" + ), + shell=True, + check=True, + capture_output=True, + text=True, + ) diff --git a/tests/test_slurm_install_strategy.py b/tests/test_slurm_install_strategy.py index 35d26637d..b980e3093 100644 --- a/tests/test_slurm_install_strategy.py +++ b/tests/test_slurm_install_strategy.py @@ -18,6 +18,7 @@ from unittest.mock import MagicMock, patch import pytest + from cloudai import InstallStatusResult from cloudai.schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy from cloudai.schema.test_template.nemo_launcher.slurm_install_strategy import ( @@ -94,7 +95,7 @@ def test_is_installed_locally(self, strategy: NcclTestSlurmInstallStrategy): ) def test_is_installed_remote(self, strategy: NcclTestSlurmInstallStrategy): - strategy.docker_image_cache_manager.cache_docker_images_locally = False + strategy.docker_image_cache_manager.system.cache_docker_images_locally = False result = strategy.is_installed() @@ -222,7 +223,7 @@ def test_is_installed_locally(self, strategy: UCCTestSlurmInstallStrategy): ) def test_is_installed_remote(self, strategy: UCCTestSlurmInstallStrategy): - strategy.docker_image_cache_manager.cache_docker_images_locally = False + strategy.docker_image_cache_manager.system.cache_docker_images_locally = False result = strategy.is_installed() From 914e9c80e01c8304b67fd4f8562bb5565a9cc788 Mon Sep 17 00:00:00 2001 From: Taekyung Heo Date: Mon, 6 Jan 2025 10:33:28 -0500 Subject: [PATCH 4/9] Add test hooks to USER_GUIDE.md (#322) --- USER_GUIDE.md | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/USER_GUIDE.md b/USER_GUIDE.md index 573166a86..8faf79637 100644 --- a/USER_GUIDE.md +++ b/USER_GUIDE.md @@ -363,6 +363,39 @@ You can update the fields to adjust the behavior. For example, you can update th 2. Follow the instructions under 'Usage Tips' on how to download the tokenizer. 3. Replace "training.model.tokenizer.model=TOKENIZER_MODEL" with "training.model.tokenizer.model=YOUR_TOKENIZER_PATH" (the tokenizer should be a .model file) in conf/common/test/llama.toml. + +## Using Test Hooks in CloudAI + +A test hook in CloudAI is a specialized test that runs either before or after each main test in a scenario, providing flexibility to prepare the environment or clean up resources. Hooks are defined as pre-test or post-test and referenced in the test scenario’s TOML file using `pre_test` and `post_test` fields. +``` +name = "nccl-test" + +pre_test = "nccl_test_pre" +post_test = "nccl_test_post" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_reduce" +num_nodes = "2" +time_limit = "00:20:00" +``` + +CloudAI organizes hooks in a dedicated directory structure: + +- Hook directory: All hook configurations reside in `conf/hook/`. +- Hook test scenarios: Place pre-test hook and post-test hook scenario files in `conf/hook/`. +- Hook tests: Place individual tests referenced in hooks within `conf/hook/test/`. + +In the execution flow, pre-test hooks run before the main test, which only executes if the pre-test completes successfully. Post-test hooks follow the main test, provided the prior steps succeed. If a pre-test hook fails, the main test and its post-test hook are skipped. +``` +name = "nccl_test_pre" + +[[Tests]] +id = "Tests.1" +test_name = "nccl_test_all_reduce" +time_limit = "00:20:00" +``` + ## Troubleshooting In this section, we will guide you through identifying the root cause of issues, determining whether they stem from system infrastructure or a bug in CloudAI. Users should closely follow the USER_GUIDE.md and README.md for installation, adding test templates, tests, and test scenarios. From 0358fb87ab7e64dfb1ac04174e60bd35e336592e Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Mon, 6 Jan 2025 07:40:54 -0800 Subject: [PATCH 5/9] Reduce noise in CLI output --- src/cloudai/util/docker_image_cache_manager.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/cloudai/util/docker_image_cache_manager.py b/src/cloudai/util/docker_image_cache_manager.py index a3ab2d77a..0e84013ff 100644 --- a/src/cloudai/util/docker_image_cache_manager.py +++ b/src/cloudai/util/docker_image_cache_manager.py @@ -263,17 +263,8 @@ def cache_docker_image( error_message = ( f"Failed to import Docker image from {docker_image_url}. Command: {enroot_import_cmd}. Error: {e}" ) - logging.error(error_message) - return DockerImageCacheResult( - False, - "", - ( - f"Failed to import Docker image from {docker_image_url}. " - f"Command: {enroot_import_cmd}. " - f"Error: {e}. Please check the Docker image URL and ensure that it is accessible and set up with " - f"valid credentials." - ), - ) + logging.debug(error_message) + return DockerImageCacheResult(False, message=error_message) def _check_prerequisites(self, docker_image_url: str) -> PrerequisiteCheckResult: """ From 8ce31c3d5bd9774dcc34024789140adb6f527a6d Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Mon, 6 Jan 2025 16:53:15 +0100 Subject: [PATCH 6/9] Make ruff happy --- src/cloudai/_core/base_installer.py | 10 ++++++++-- tests/test_docker_image_cache_manager.py | 1 - tests/test_slurm_install_strategy.py | 1 - 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/cloudai/_core/base_installer.py b/src/cloudai/_core/base_installer.py index c00906a59..e2d88877b 100644 --- a/src/cloudai/_core/base_installer.py +++ b/src/cloudai/_core/base_installer.py @@ -122,10 +122,16 @@ def install(self, test_templates: Iterable[TestTemplate]) -> InstallStatusResult else: install_results[test_template.name] = result.message done += 1 - logging.info( - f"{done}/{total} Installation for {test_template.name} finished with status: " + msg = ( + f"{done}/{total} Installation of {test_template.name}: " f"{result.message if result.message else 'OK'}" ) + if result.success: + install_results[test_template.name] = "Success" + logging.info(msg) + else: + install_results[test_template.name] = result.message + logging.error(msg) except Exception as e: done += 1 logging.error(f"{done}/{total} Installation failed for {test_template.name}: {e}") diff --git a/tests/test_docker_image_cache_manager.py b/tests/test_docker_image_cache_manager.py index 923d8b893..1a2519d81 100644 --- a/tests/test_docker_image_cache_manager.py +++ b/tests/test_docker_image_cache_manager.py @@ -19,7 +19,6 @@ from unittest.mock import patch import pytest - from cloudai.systems.slurm.slurm_system import SlurmSystem from cloudai.util.docker_image_cache_manager import ( DockerImageCacheManager, diff --git a/tests/test_slurm_install_strategy.py b/tests/test_slurm_install_strategy.py index b980e3093..af8258ccb 100644 --- a/tests/test_slurm_install_strategy.py +++ b/tests/test_slurm_install_strategy.py @@ -18,7 +18,6 @@ from unittest.mock import MagicMock, patch import pytest - from cloudai import InstallStatusResult from cloudai.schema.test_template.nccl_test.slurm_install_strategy import NcclTestSlurmInstallStrategy from cloudai.schema.test_template.nemo_launcher.slurm_install_strategy import ( From 15719081b744606da5c15369bc3d2c9da3267955 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 8 Jan 2025 12:28:43 +0100 Subject: [PATCH 7/9] Fix tests --- tests/test_docker_image_cache_manager.py | 66 +++++++++++++----------- tests/test_slurm_install_strategy.py | 4 +- 2 files changed, 37 insertions(+), 33 deletions(-) diff --git a/tests/test_docker_image_cache_manager.py b/tests/test_docker_image_cache_manager.py index 1a2519d81..c5419f8a7 100644 --- a/tests/test_docker_image_cache_manager.py +++ b/tests/test_docker_image_cache_manager.py @@ -97,8 +97,7 @@ def test_cache_docker_image( mock_exists.side_effect = [True, False] # install_path exists, subdir_path does not with patch("os.makedirs") as mock_makedirs: result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") - print(f"!! {result.message=}") - mock_makedirs.assert_called_once_with("/fake/install/path/subdir") + mock_makedirs.assert_called_once_with(f"{manager.system.install_path}/subdir") # Ensure prerequisites are always met for the following tests mock_check_prerequisites.return_value = PrerequisiteCheckResult(True, "All prerequisites are met.") @@ -113,14 +112,17 @@ def test_cache_docker_image( mock_run.return_value = subprocess.CompletedProcess(args=["cmd"], returncode=0, stderr="") result = manager.cache_docker_image("docker.io/hello-world", "subdir", "image.tar.gz") mock_run.assert_called_once_with( - "srun --export=ALL --partition=default enroot import -o /fake/install/path/subdir/image.tar.gz docker://docker.io/hello-world", + ( + f"srun --export=ALL --partition={manager.system.default_partition} enroot " + f"import -o {manager.system.install_path}/subdir/image.tar.gz docker://docker.io/hello-world" + ), shell=True, check=True, capture_output=True, text=True, ) assert result.success - assert result.message == f"Docker image cached successfully at {slurm_system.install_path}/image.tar.gz." + assert result.message == f"Docker image cached successfully at {slurm_system.install_path}/subdir/image.tar.gz." # Test caching failure due to subprocess error mock_is_file.return_value = False @@ -154,8 +156,11 @@ def test_remove_cached_image(mock_remove, mock_access, mock_exists, mock_isfile, mock_isfile.return_value = True result = manager.remove_cached_image("subdir", "image.tar.gz") assert result.success - assert result.message == "Cached Docker image removed successfully from /fake/install/path/subdir/image.tar.gz." - mock_remove.assert_called_once_with("/fake/install/path/subdir/image.tar.gz") + assert ( + result.message + == f"Cached Docker image removed successfully from {manager.system.install_path}/subdir/image.tar.gz." + ) + mock_remove.assert_called_once_with(f"{manager.system.install_path}/subdir/image.tar.gz") # Test failed removal due to OSError mock_remove.side_effect = OSError("Mocked OSError") @@ -167,7 +172,10 @@ def test_remove_cached_image(mock_remove, mock_access, mock_exists, mock_isfile, mock_isfile.return_value = False result = manager.remove_cached_image("subdir", "image.tar.gz") assert result.success - assert result.message == "No cached Docker image found to remove at /fake/install/path/subdir/image.tar.gz." + assert ( + result.message + == f"No cached Docker image found to remove at {manager.system.install_path}/subdir/image.tar.gz." + ) @patch("os.path.isfile") @@ -185,17 +193,17 @@ def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfi result = manager.uninstall_cached_image("subdir", "image.tar.gz") assert result.success assert result.message in [ - "Cached Docker image uninstalled successfully from /fake/install/path/subdir.", - "Subdirectory removed successfully: /fake/install/path/subdir.", + f"Cached Docker image uninstalled successfully from {manager.system.install_path}/subdir.", + f"Subdirectory removed successfully: {manager.system.install_path}/subdir.", ] - mock_remove.assert_called_once_with("/fake/install/path/subdir/image.tar.gz") + mock_remove.assert_called_once_with(f"{manager.system.install_path}/subdir/image.tar.gz") # Test successful uninstallation but subdirectory not empty mock_listdir.return_value = ["otherfile"] result = manager.uninstall_cached_image("subdir", "image.tar.gz") assert result.success - assert result.message == "Cached Docker image uninstalled successfully from /fake/install/path/subdir." - mock_remove.assert_called_with("/fake/install/path/subdir/image.tar.gz") + assert result.message == f"Cached Docker image uninstalled successfully from {manager.system.install_path}/subdir." + mock_remove.assert_called_with(f"{manager.system.install_path}/subdir/image.tar.gz") # Test failed removal due to OSError mock_remove.side_effect = OSError("Mocked OSError") @@ -208,7 +216,7 @@ def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfi mock_exists.return_value = False result = manager.uninstall_cached_image("subdir", "image.tar.gz") assert result.success - assert result.message == "Cached Docker image uninstalled successfully from /fake/install/path/subdir." + assert result.message == f"Cached Docker image uninstalled successfully from {manager.system.install_path}/subdir." # Cleanup patch.stopall() @@ -230,12 +238,6 @@ def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which, assert result.success assert result.message == "All prerequisites are met." - # Test enroot not installed - mock_which.side_effect = lambda x: x != "enroot" - result = manager._check_prerequisites("docker.io/hello-world") - assert not result.success - assert result.message == "enroot are required for caching Docker images but are not installed." - # Test srun not installed mock_which.side_effect = lambda x: x != "srun" result = manager._check_prerequisites("docker.io/hello-world") @@ -260,17 +262,20 @@ def test_check_docker_image_exists_with_only_cached_file(mock_exists, mock_isfil manager = DockerImageCacheManager(slurm_system) # Simulate only the cached file being a valid file path - mock_isfile.side_effect = lambda path: path == "/fake/install/path/subdir/docker_image.sqsh" + mock_isfile.side_effect = lambda path: path == f"{manager.system.install_path}/subdir/docker_image.sqsh" mock_exists.side_effect = lambda path: path in [ - "/fake/install/path", - "/fake/install/path/subdir", - "/fake/install/path/subdir/docker_image.sqsh", + f"{manager.system.install_path}", + f"{manager.system.install_path}/subdir", + f"{manager.system.install_path}/subdir/docker_image.sqsh", ] result = manager.check_docker_image_exists("/tmp/non_existing_file.sqsh", "subdir", "docker_image.sqsh") assert result.success - assert result.docker_image_path == "/fake/install/path/subdir/docker_image.sqsh" - assert result.message == "Cached Docker image already exists at /fake/install/path/subdir/docker_image.sqsh." + assert result.docker_image_path == f"{manager.system.install_path}/subdir/docker_image.sqsh" + assert ( + result.message + == f"Cached Docker image already exists at {manager.system.install_path}/subdir/docker_image.sqsh." + ) @patch("os.path.isfile") @@ -281,19 +286,18 @@ def test_check_docker_image_exists_with_both_valid_files(mock_exists, mock_isfil # Simulate both cached file and docker image URL being valid file paths mock_isfile.side_effect = lambda path: path in [ "/tmp/existing_file.sqsh", - "/fake/install/path/subdir/docker_image.sqsh", + f"{manager.system.install_path}/subdir/docker_image.sqsh", ] mock_exists.side_effect = lambda path: path in [ "/tmp/existing_file.sqsh", - "/fake/install/path/subdir/docker_image.sqsh", + f"{manager.system.install_path}/subdir/docker_image.sqsh", ] result = manager.check_docker_image_exists("/tmp/existing_file.sqsh", "subdir", "docker_image.sqsh") assert result.success assert result.docker_image_path is not None - assert str(result.docker_image_path) == "docker.io/hello-world" - assert not Path(result.docker_image_path).is_absolute() - assert result.message == "" + assert str(result.docker_image_path) == "/tmp/existing_file.sqsh" + assert result.message == "Docker image file path is valid: /tmp/existing_file.sqsh." def test_system_with_account(slurm_system: SlurmSystem): @@ -310,7 +314,7 @@ def test_system_with_account(slurm_system: SlurmSystem): mock_run.assert_called_once_with( ( f"srun --export=ALL --partition={slurm_system.default_partition} --account={slurm_system.account} " - f"enroot import -o {slurm_system.install_path}/docker_image.sqsh docker://docker.io/hello-world" + f"enroot import -o {slurm_system.install_path}/subdir/docker_image.sqsh docker://docker.io/hello-world" ), shell=True, check=True, diff --git a/tests/test_slurm_install_strategy.py b/tests/test_slurm_install_strategy.py index af8258ccb..241888d1e 100644 --- a/tests/test_slurm_install_strategy.py +++ b/tests/test_slurm_install_strategy.py @@ -37,6 +37,7 @@ def slurm_system(tmp_path: Path) -> SlurmSystem: install_path=str(tmp_path / "install"), output_path=str(tmp_path / "output"), default_partition="main", + cache_docker_images_locally=True, partitions={ "main": [ SlurmNode(name="node1", partition="main", state=SlurmNodeState.IDLE), @@ -66,8 +67,7 @@ def test_install_path_attribute(slurm_install_strategy: SlurmInstallStrategy, sl @pytest.fixture def mock_docker_image_cache_manager(slurm_system: SlurmSystem): mock = MagicMock() - mock.cache_docker_images_locally = True - mock.install_path = slurm_system.install_path + mock.system = slurm_system mock.check_docker_image_exists.return_value = InstallStatusResult(success=False, message="Docker image not found") mock.ensure_docker_image.return_value = InstallStatusResult(success=True) mock.uninstall_cached_image.return_value = InstallStatusResult(success=True) From ca970723563b6ca6eaaf8f1b10317c3abb877c69 Mon Sep 17 00:00:00 2001 From: Andrey Maslennikov Date: Wed, 8 Jan 2025 12:31:17 +0100 Subject: [PATCH 8/9] Revert changes in README --- USER_GUIDE.md | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/USER_GUIDE.md b/USER_GUIDE.md index 8faf79637..573166a86 100644 --- a/USER_GUIDE.md +++ b/USER_GUIDE.md @@ -363,39 +363,6 @@ You can update the fields to adjust the behavior. For example, you can update th 2. Follow the instructions under 'Usage Tips' on how to download the tokenizer. 3. Replace "training.model.tokenizer.model=TOKENIZER_MODEL" with "training.model.tokenizer.model=YOUR_TOKENIZER_PATH" (the tokenizer should be a .model file) in conf/common/test/llama.toml. - -## Using Test Hooks in CloudAI - -A test hook in CloudAI is a specialized test that runs either before or after each main test in a scenario, providing flexibility to prepare the environment or clean up resources. Hooks are defined as pre-test or post-test and referenced in the test scenario’s TOML file using `pre_test` and `post_test` fields. -``` -name = "nccl-test" - -pre_test = "nccl_test_pre" -post_test = "nccl_test_post" - -[[Tests]] -id = "Tests.1" -test_name = "nccl_test_all_reduce" -num_nodes = "2" -time_limit = "00:20:00" -``` - -CloudAI organizes hooks in a dedicated directory structure: - -- Hook directory: All hook configurations reside in `conf/hook/`. -- Hook test scenarios: Place pre-test hook and post-test hook scenario files in `conf/hook/`. -- Hook tests: Place individual tests referenced in hooks within `conf/hook/test/`. - -In the execution flow, pre-test hooks run before the main test, which only executes if the pre-test completes successfully. Post-test hooks follow the main test, provided the prior steps succeed. If a pre-test hook fails, the main test and its post-test hook are skipped. -``` -name = "nccl_test_pre" - -[[Tests]] -id = "Tests.1" -test_name = "nccl_test_all_reduce" -time_limit = "00:20:00" -``` - ## Troubleshooting In this section, we will guide you through identifying the root cause of issues, determining whether they stem from system infrastructure or a bug in CloudAI. Users should closely follow the USER_GUIDE.md and README.md for installation, adding test templates, tests, and test scenarios. From 492276c70ce77348518a9865dd74e6618e77f8a7 Mon Sep 17 00:00:00 2001 From: Andrei Maslennikov Date: Wed, 8 Jan 2025 06:46:55 -0800 Subject: [PATCH 9/9] Actually remove accessibility check --- .../util/docker_image_cache_manager.py | 84 +------------------ tests/test_docker_image_cache_manager.py | 27 ++---- 2 files changed, 9 insertions(+), 102 deletions(-) diff --git a/src/cloudai/util/docker_image_cache_manager.py b/src/cloudai/util/docker_image_cache_manager.py index 0e84013ff..2ec50a140 100644 --- a/src/cloudai/util/docker_image_cache_manager.py +++ b/src/cloudai/util/docker_image_cache_manager.py @@ -18,7 +18,6 @@ import os import shutil import subprocess -import tempfile from cloudai.systems import SlurmSystem @@ -213,7 +212,7 @@ def cache_docker_image( logging.info(success_message) return DockerImageCacheResult(True, docker_image_path, success_message) - prerequisite_check = self._check_prerequisites(docker_image_url) + prerequisite_check = self._check_prerequisites() if not prerequisite_check: logging.error(f"Prerequisite check failed: {prerequisite_check.message}") return DockerImageCacheResult(False, "", prerequisite_check.message) @@ -266,14 +265,11 @@ def cache_docker_image( logging.debug(error_message) return DockerImageCacheResult(False, message=error_message) - def _check_prerequisites(self, docker_image_url: str) -> PrerequisiteCheckResult: + def _check_prerequisites(self) -> PrerequisiteCheckResult: """ Check prerequisites for caching Docker image. - Args: - docker_image_url (str): URL of the Docker image. - - Returns: + Returns PrerequisiteCheckResult: Result of the prerequisite check. """ required_binaries = ["srun"] @@ -286,82 +282,8 @@ def _check_prerequisites(self, docker_image_url: str) -> PrerequisiteCheckResult False, f"{missing_binaries_str} are required for caching Docker images but are not installed." ) - docker_accessible = self._check_docker_image_accessibility(docker_image_url) - if not docker_accessible.success: - logging.error(f"Docker image URL {docker_image_url} is not accessible. Error: {docker_accessible.message}") - return docker_accessible - return PrerequisiteCheckResult(True, "All prerequisites are met.") - def _check_docker_image_accessibility(self, docker_image_url: str) -> PrerequisiteCheckResult: - """ - Check if the Docker image URL is accessible. - - Args: - docker_image_url (str): URL of the Docker image. - - Returns: - PrerequisiteCheckResult: Result of the Docker image accessibility check. - """ - with tempfile.TemporaryDirectory() as temp_dir: - docker_image_path = os.path.join(temp_dir, "docker_image.sqsh") - enroot_import_cmd = f"enroot import -o {docker_image_path} docker://{docker_image_url}" - - logging.debug(f"Checking Docker image accessibility: {enroot_import_cmd}") - - process = subprocess.Popen(enroot_import_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - try: - while True: - error_output = process.stderr.readline() if process.stderr else None - error_output = error_output.decode() if error_output else "" - - if error_output: - if ( - "Downloading" in error_output - or "Found all layers in cache" in error_output - or "Fetching image manifest list" in error_output - ): - logging.debug( - f"Docker image URL, {docker_image_url}, is accessible. " - f"Command used: {enroot_import_cmd}. Found keyword: {error_output.strip()}" - ) - process.terminate() - return PrerequisiteCheckResult( - True, f"Docker image URL, {docker_image_url}, is accessible." - ) - if "[ERROR]" in error_output: - logging.error( - f"Failed to access Docker image URL, {docker_image_url}. " - f"Command used: {enroot_import_cmd}. Error: {error_output}" - ) - process.terminate() - if "401 Unauthorized" in error_output: - detailed_message = ( - f"Failed to access Docker image URL: {docker_image_url}. Error: {error_output}\n" - "This error indicates that access to the Docker image URL is unauthorized. " - "Please ensure you have the necessary permissions and have followed the " - "instructions in the README for setting up your credentials correctly." - ) - return PrerequisiteCheckResult(False, detailed_message) - return PrerequisiteCheckResult( - False, f"Failed to access Docker image URL: {docker_image_url}. Error: {error_output}" - ) - if process.poll() is not None: - break - - logging.debug(f"Failed to access Docker image URL: {docker_image_url}. Unknown error.") - return PrerequisiteCheckResult( - False, f"Failed to access Docker image URL: {docker_image_url}. Unknown error." - ) - finally: - # Ensure the temporary docker image file is removed - if os.path.exists(docker_image_path): - try: - os.remove(docker_image_path) - logging.debug(f"Temporary Docker image file removed: {docker_image_path}") - except OSError as e: - logging.error(f"Failed to remove temporary Docker image file {docker_image_path}. Error: {e}") - def uninstall_cached_image(self, subdir_name: str, docker_image_filename: str) -> DockerImageCacheResult: """ Uninstall the cached Docker image and remove the subdirectory if empty. diff --git a/tests/test_docker_image_cache_manager.py b/tests/test_docker_image_cache_manager.py index c5419f8a7..377872983 100644 --- a/tests/test_docker_image_cache_manager.py +++ b/tests/test_docker_image_cache_manager.py @@ -223,38 +223,23 @@ def test_uninstall_cached_image(mock_remove, mock_access, mock_exists, mock_isfi @patch("shutil.which") -@patch("cloudai.util.docker_image_cache_manager.DockerImageCacheManager._check_docker_image_accessibility") -def test_check_prerequisites(mock_check_docker_image_accessibility, mock_which, slurm_system: SlurmSystem): +def test_check_prerequisites(mock_which, slurm_system: SlurmSystem): manager = DockerImageCacheManager(slurm_system) - # Ensure enroot and srun are installed - mock_which.side_effect = lambda x: x in ["enroot", "srun"] + # Ensure srun is installed + mock_which.side_effect = lambda x: x in ["srun"] # Test all prerequisites met - mock_check_docker_image_accessibility.return_value = PrerequisiteCheckResult( - True, "Docker image URL is accessible." - ) - result = manager._check_prerequisites("docker.io/hello-world") + result = manager._check_prerequisites() assert result.success assert result.message == "All prerequisites are met." # Test srun not installed mock_which.side_effect = lambda x: x != "srun" - result = manager._check_prerequisites("docker.io/hello-world") + result = manager._check_prerequisites() assert not result.success assert result.message == "srun are required for caching Docker images but are not installed." - # Ensure enroot and srun are installed again - mock_which.side_effect = lambda x: x in ["enroot", "srun"] - - # Test Docker image URL not accessible - mock_check_docker_image_accessibility.return_value = PrerequisiteCheckResult( - False, "Docker image URL not accessible." - ) - result = manager._check_prerequisites("docker.io/hello-world") - assert not result.success - assert "Docker image URL not accessible." in result.message - @patch("os.path.isfile") @patch("os.path.exists") @@ -305,7 +290,7 @@ def test_system_with_account(slurm_system: SlurmSystem): Path(slurm_system.install_path).mkdir(parents=True, exist_ok=True) manager = DockerImageCacheManager(slurm_system) - manager._check_prerequisites = lambda docker_image_url: PrerequisiteCheckResult(True, "All prerequisites are met.") + manager._check_prerequisites = lambda: PrerequisiteCheckResult(True, "All prerequisites are met.") with patch("subprocess.run") as mock_run: res = manager.cache_docker_image("docker.io/hello-world", "subdir", "docker_image.sqsh")