diff --git a/src/cloudai/parser/system_parser/slurm_system_parser.py b/src/cloudai/parser/system_parser/slurm_system_parser.py index f2f577bd7..1afd86e01 100644 --- a/src/cloudai/parser/system_parser/slurm_system_parser.py +++ b/src/cloudai/parser/system_parser/slurm_system_parser.py @@ -33,9 +33,8 @@ def parse(self, data: Dict[str, Any]) -> SlurmSystem: # noqa: C901 SlurmSystem: The parsed Slurm system object. Raises: - ValueError: If 'name' or 'partitions' are missing from - the data or if there are node list parsing issues or group - membership conflicts. + ValueError: If 'name' or 'partitions' are missing from the data or if there are node list parsing issues + or group membership conflicts. """ def safe_int(value): @@ -69,7 +68,7 @@ def safe_int(value): # Check if default_partition exists in partitions partition_names = [partition_data.get("name") for partition_data in partitions.values()] if default_partition not in partition_names: - raise ValueError(f"Default partition '{default_partition}' is not listed " f"in partitions.") + raise ValueError(f"Default partition '{default_partition}' is not listed in partitions.") global_env_vars = data.get("global_env_vars", {}) @@ -129,8 +128,8 @@ def safe_int(value): group_nodes.append(nodes_dict[group_node_name]) else: raise ValueError( - f"Node '{group_node_name}' in group '{group_name}' " - f"not found in partition '{partition_name}' nodes." + f"Node '{group_node_name}' in group '{group_name}' not found in partition " + "'{partition_name}' nodes." ) updated_groups[partition_name][group_name] = group_nodes diff --git a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py index 37c58d978..6cb1dab17 100644 --- a/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py +++ b/src/cloudai/schema/test_template/nemo_launcher/slurm_install_strategy.py @@ -28,22 +28,21 @@ class NeMoLauncherSlurmInstallStrategy(SlurmInstallStrategy): """ - Install strategy for NeMo-Megatron-Launcher on Slurm systems. + Install strategy for NeMo-Launcher on Slurm systems. Attributes - SUBDIR_PATH (str): Subdirectory within the system's install path where - the NeMo-Megatron-Launcher and its Docker image will be stored. - REPOSITORY_NAME (str): Name of the NeMo-Megatron-Launcher repository. + SUBDIR_PATH (str): Subdirectory within the system's install path where the NeMo-Launcher and its Docker image + will be stored. + REPOSITORY_NAME (str): Name of the NeMo-Launcher repository. DOCKER_IMAGE_FILENAME (str): Filename of the Docker image to be downloaded. - repository_url (str): URL to the NeMo-Megatron-Launcher Git repository. - repository_commit_hash (str): Specific commit hash to checkout after cloning - the repository. + repository_url (str): URL to the NeMo-Launcher Git repository. + repository_commit_hash (str): Specific commit hash to checkout after cloning the repository. docker_image_url (str): URL to the Docker image in a remote container registry. """ - SUBDIR_PATH = "NeMo-Megatron-Launcher" - REPOSITORY_NAME = "NeMo-Megatron-Launcher" - DOCKER_IMAGE_FILENAME = "nemo_megatron_launcher.sqsh" + SUBDIR_PATH = "NeMo-Launcher" + REPOSITORY_NAME = "NeMo-Launcher" + DOCKER_IMAGE_FILENAME = "nemo_launcher.sqsh" def __init__( self, @@ -82,12 +81,10 @@ def _configure_logging(self): """ Configure logging to output error messages to stdout. - This method sets the logger's level to INFO, capturing messages at this - level and above. It also configures a StreamHandler for error messages - (ERROR level and above), directing them to stdout for immediate visibility. + This method sets the logger's level to INFO, capturing messages at this level and above. It also configures a + StreamHandler for error messages (ERROR level and above), directing them to stdout for immediate visibility. - StreamHandler formatting includes the logger's name, the log level, and - the message. + StreamHandler formatting includes the logger's name, the log level, and the message. """ self.logger.setLevel(logging.INFO) @@ -114,11 +111,9 @@ def is_installed(self) -> bool: datasets_ready = self._check_datasets_on_nodes(data_dir_path) if not datasets_ready: self.logger.error( - "NeMo datasets are not installed on some nodes. Please ensure " - "that the NeMo datasets are manually installed on each node " - f"in the specified data directory: {data_dir_path}. This " - "directory should contain all necessary datasets for NeMo " - "Megatron Launcher to function properly." + "NeMo datasets are not installed on some nodes. Please ensure that the NeMo datasets are manually " + "installed on each node in the specified data directory: {data_dir_path}. This directory should " + "contain all necessary datasets for NeMo Launcher to function properly." ) return repo_installed and docker_image_installed and datasets_ready @@ -135,10 +130,8 @@ def install(self) -> None: data_dir_path = self.default_cmd_args["data_dir"] if not self._check_datasets_on_nodes(data_dir_path): self.logger.error( - "Some nodes do not have the NeMoLauncher datasets installed. " - "Please note that CloudAI does not cover dataset installation. " - "Users are responsible for ensuring that datasets are installed " - "on all nodes." + "Some nodes do not have the NeMoLauncher datasets installed. Please note that CloudAI does not cover " + "dataset installation. Users are responsible for ensuring that datasets are installed on all nodes." ) self._clone_repository(subdir_path) @@ -150,8 +143,8 @@ def _check_install_path_access(self): Check if the install path exists and if there is permission to create a directory or file in the path. Raises - PermissionError: If the install path does not exist or if there is - no permission to create directories/files. + PermissionError: If the install path does not exist or if there is no permission to create + directories/files. """ if not os.path.exists(self.install_path): raise PermissionError(f"Install path {self.install_path} does not exist.") @@ -164,12 +157,11 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> bool: Default partition is used. - This method uses parallel execution to check datasets on multiple nodes - simultaneously, improving efficiency for systems with multiple nodes. + This method uses parallel execution to check datasets on multiple nodes simultaneously, improving efficiency + for systems with multiple nodes. Args: - data_dir_path (str): Path where dataset files and directories are - stored. + data_dir_path (str): Path where dataset files and directories are stored. Returns: bool: True if all specified dataset files and directories are present @@ -212,11 +204,8 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> bool: ", ".join(nodes_without_datasets), ) self.logger.error( - "Please ensure that the NeMo datasets are " - "installed on each node in the specified data " - "directory: %s. This directory should contain all " - "necessary datasets for NeMo Megatron Launcher to " - "function properly.", + "Please ensure that the NeMo datasets are installed on each node in the specified data directory: %s. " + "This directory should contain all necessary datasets for NeMo Launcher to function properly.", data_dir_path, ) return False @@ -230,12 +219,10 @@ def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: L Args: node (str): The name of the compute node. data_dir_path (str): Path to the data directory. - dataset_items (List[str]): List of dataset file and directory names - to check. + dataset_items (List[str]): List of dataset file and directory names to check. Returns: - bool: True if all dataset files and directories exist on the node, - False otherwise. + bool: True if all dataset files and directories exist on the node, False otherwise. """ python_check_script = ( f"import os;print(all(os.path.isfile(os.path.join('{data_dir_path}', " @@ -252,7 +239,7 @@ def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: L def _clone_repository(self, subdir_path: str) -> None: """ - Clones NeMo-Megatron-Launcher repository into specified path. + Clones NeMo-Launcher repository into specified path. Args: subdir_path (str): Subdirectory path for installation. diff --git a/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py b/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py index 64c3c2cf4..eb903c713 100644 --- a/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py +++ b/src/cloudai/systems/slurm/strategy/slurm_install_strategy.py @@ -23,9 +23,8 @@ class SlurmInstallStrategy(InstallStrategy): Abstract base class for defining installation strategies specific to Slurm environments. Attributes - slurm_system (SlurmSystem): A casted version of the `system` attribute, - which provides Slurm-specific properties - and methods. + slurm_system (SlurmSystem): A casted version of the `system` attribute, which provides Slurm-specific + properties and methods. """ def __init__( diff --git a/tests/test_slurm_command_gen_strategy.py b/tests/test_slurm_command_gen_strategy.py index bf5cda854..04225cbb2 100644 --- a/tests/test_slurm_command_gen_strategy.py +++ b/tests/test_slurm_command_gen_strategy.py @@ -131,8 +131,7 @@ def test_docker_image_url_is_not_file(self, nemo_cmd_gen: NeMoLauncherSlurmComma nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image" nemo_cmd_gen.set_container_arg() assert ( - nemo_cmd_gen.final_cmd_args["container"] - == f"{nemo_cmd_gen.install_path}/NeMo-Megatron-Launcher/nemo_megatron_launcher.sqsh" + nemo_cmd_gen.final_cmd_args["container"] == f"{nemo_cmd_gen.install_path}/NeMo-Launcher/nemo_launcher.sqsh" ) def test_docker_image_url_is_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):