Skip to content

Commit

Permalink
use gpus_per_node in system file instead of using try catch for gres …
Browse files Browse the repository at this point in the history
…require in command line
  • Loading branch information
Lily Wang committed Jan 15, 2025
1 parent 9e03b9b commit be75b0d
Showing 1 changed file with 2 additions and 4 deletions.
6 changes: 2 additions & 4 deletions src/cloudai/util/docker_image_cache_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,6 @@ def _import_docker_image(
logging.debug(f"Command used: {enroot_import_cmd}, stdout: {p.stdout}, stderr: {p.stderr}")
return DockerImageCacheResult(True, docker_image_path.absolute(), success_message)
except subprocess.CalledProcessError as e:
# Retry enroot by adding `--gpus=1` if the cluster requires specifying a GPU resource
if not retry and e.stderr and "Cannot find GPU specification" in e.stderr:
srun_prefix += " --gpus=1"
return self._import_docker_image(srun_prefix, docker_image_url, docker_image_path, True)
error_message = (
f"Failed to import Docker image {docker_image_url}. Command: {enroot_import_cmd}. Error: {e.stderr}"
)
Expand Down Expand Up @@ -251,6 +247,8 @@ def cache_docker_image(self, docker_image_url: str, docker_image_filename: str)
srun_prefix = f"srun --export=ALL --partition={self.system.default_partition}"
if self.system.account:
srun_prefix += f" --account={self.system.account}"
if self.system.gpus_per_node:
srun_prefix += f" --gres=gpu:1"

return self._import_docker_image(srun_prefix, docker_image_url, docker_image_path)

Expand Down

0 comments on commit be75b0d

Please sign in to comment.