Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update docstring to comply with coding style and max column length #62

Merged
merged 1 commit into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions src/cloudai/parser/system_parser/slurm_system_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,8 @@ def parse(self, data: Dict[str, Any]) -> SlurmSystem: # noqa: C901
SlurmSystem: The parsed Slurm system object.

Raises:
ValueError: If 'name' or 'partitions' are missing from
the data or if there are node list parsing issues or group
membership conflicts.
ValueError: If 'name' or 'partitions' are missing from the data or if there are node list parsing issues
or group membership conflicts.
"""

def safe_int(value):
Expand Down Expand Up @@ -69,7 +68,7 @@ def safe_int(value):
# Check if default_partition exists in partitions
partition_names = [partition_data.get("name") for partition_data in partitions.values()]
if default_partition not in partition_names:
raise ValueError(f"Default partition '{default_partition}' is not listed " f"in partitions.")
raise ValueError(f"Default partition '{default_partition}' is not listed in partitions.")

global_env_vars = data.get("global_env_vars", {})

Expand Down Expand Up @@ -129,8 +128,8 @@ def safe_int(value):
group_nodes.append(nodes_dict[group_node_name])
else:
raise ValueError(
f"Node '{group_node_name}' in group '{group_name}' "
f"not found in partition '{partition_name}' nodes."
f"Node '{group_node_name}' in group '{group_name}' not found in partition "
"'{partition_name}' nodes."
)

updated_groups[partition_name][group_name] = group_nodes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,21 @@

class NeMoLauncherSlurmInstallStrategy(SlurmInstallStrategy):
"""
Install strategy for NeMo-Megatron-Launcher on Slurm systems.
Install strategy for NeMo-Launcher on Slurm systems.

Attributes
SUBDIR_PATH (str): Subdirectory within the system's install path where
the NeMo-Megatron-Launcher and its Docker image will be stored.
REPOSITORY_NAME (str): Name of the NeMo-Megatron-Launcher repository.
SUBDIR_PATH (str): Subdirectory within the system's install path where the NeMo-Launcher and its Docker image
will be stored.
REPOSITORY_NAME (str): Name of the NeMo-Launcher repository.
DOCKER_IMAGE_FILENAME (str): Filename of the Docker image to be downloaded.
repository_url (str): URL to the NeMo-Megatron-Launcher Git repository.
repository_commit_hash (str): Specific commit hash to checkout after cloning
the repository.
repository_url (str): URL to the NeMo-Launcher Git repository.
repository_commit_hash (str): Specific commit hash to checkout after cloning the repository.
docker_image_url (str): URL to the Docker image in a remote container registry.
"""

SUBDIR_PATH = "NeMo-Megatron-Launcher"
REPOSITORY_NAME = "NeMo-Megatron-Launcher"
DOCKER_IMAGE_FILENAME = "nemo_megatron_launcher.sqsh"
SUBDIR_PATH = "NeMo-Launcher"
REPOSITORY_NAME = "NeMo-Launcher"
DOCKER_IMAGE_FILENAME = "nemo_launcher.sqsh"

def __init__(
self,
Expand Down Expand Up @@ -82,12 +81,10 @@ def _configure_logging(self):
"""
Configure logging to output error messages to stdout.

This method sets the logger's level to INFO, capturing messages at this
level and above. It also configures a StreamHandler for error messages
(ERROR level and above), directing them to stdout for immediate visibility.
This method sets the logger's level to INFO, capturing messages at this level and above. It also configures a
StreamHandler for error messages (ERROR level and above), directing them to stdout for immediate visibility.

StreamHandler formatting includes the logger's name, the log level, and
the message.
StreamHandler formatting includes the logger's name, the log level, and the message.
"""
self.logger.setLevel(logging.INFO)

Expand All @@ -114,11 +111,9 @@ def is_installed(self) -> bool:
datasets_ready = self._check_datasets_on_nodes(data_dir_path)
if not datasets_ready:
self.logger.error(
"NeMo datasets are not installed on some nodes. Please ensure "
"that the NeMo datasets are manually installed on each node "
f"in the specified data directory: {data_dir_path}. This "
"directory should contain all necessary datasets for NeMo "
"Megatron Launcher to function properly."
"NeMo datasets are not installed on some nodes. Please ensure that the NeMo datasets are manually "
"installed on each node in the specified data directory: {data_dir_path}. This directory should "
"contain all necessary datasets for NeMo Launcher to function properly."
)

return repo_installed and docker_image_installed and datasets_ready
Expand All @@ -135,10 +130,8 @@ def install(self) -> None:
data_dir_path = self.default_cmd_args["data_dir"]
if not self._check_datasets_on_nodes(data_dir_path):
self.logger.error(
"Some nodes do not have the NeMoLauncher datasets installed. "
"Please note that CloudAI does not cover dataset installation. "
"Users are responsible for ensuring that datasets are installed "
"on all nodes."
"Some nodes do not have the NeMoLauncher datasets installed. Please note that CloudAI does not cover "
"dataset installation. Users are responsible for ensuring that datasets are installed on all nodes."
)

self._clone_repository(subdir_path)
Expand All @@ -150,8 +143,8 @@ def _check_install_path_access(self):
Check if the install path exists and if there is permission to create a directory or file in the path.

Raises
PermissionError: If the install path does not exist or if there is
no permission to create directories/files.
PermissionError: If the install path does not exist or if there is no permission to create
directories/files.
"""
if not os.path.exists(self.install_path):
raise PermissionError(f"Install path {self.install_path} does not exist.")
Expand All @@ -164,12 +157,11 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> bool:

Default partition is used.

This method uses parallel execution to check datasets on multiple nodes
simultaneously, improving efficiency for systems with multiple nodes.
This method uses parallel execution to check datasets on multiple nodes simultaneously, improving efficiency
for systems with multiple nodes.

Args:
data_dir_path (str): Path where dataset files and directories are
stored.
data_dir_path (str): Path where dataset files and directories are stored.

Returns:
bool: True if all specified dataset files and directories are present
Expand Down Expand Up @@ -212,11 +204,8 @@ def _check_datasets_on_nodes(self, data_dir_path: str) -> bool:
", ".join(nodes_without_datasets),
)
self.logger.error(
"Please ensure that the NeMo datasets are "
"installed on each node in the specified data "
"directory: %s. This directory should contain all "
"necessary datasets for NeMo Megatron Launcher to "
"function properly.",
"Please ensure that the NeMo datasets are installed on each node in the specified data directory: %s. "
"This directory should contain all necessary datasets for NeMo Launcher to function properly.",
data_dir_path,
)
return False
Expand All @@ -230,12 +219,10 @@ def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: L
Args:
node (str): The name of the compute node.
data_dir_path (str): Path to the data directory.
dataset_items (List[str]): List of dataset file and directory names
to check.
dataset_items (List[str]): List of dataset file and directory names to check.

Returns:
bool: True if all dataset files and directories exist on the node,
False otherwise.
bool: True if all dataset files and directories exist on the node, False otherwise.
"""
python_check_script = (
f"import os;print(all(os.path.isfile(os.path.join('{data_dir_path}', "
Expand All @@ -252,7 +239,7 @@ def _check_dataset_on_node(self, node: str, data_dir_path: str, dataset_items: L

def _clone_repository(self, subdir_path: str) -> None:
"""
Clones NeMo-Megatron-Launcher repository into specified path.
Clones NeMo-Launcher repository into specified path.

Args:
subdir_path (str): Subdirectory path for installation.
Expand Down
5 changes: 2 additions & 3 deletions src/cloudai/systems/slurm/strategy/slurm_install_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,8 @@ class SlurmInstallStrategy(InstallStrategy):
Abstract base class for defining installation strategies specific to Slurm environments.

Attributes
slurm_system (SlurmSystem): A casted version of the `system` attribute,
which provides Slurm-specific properties
and methods.
slurm_system (SlurmSystem): A casted version of the `system` attribute, which provides Slurm-specific
properties and methods.
"""

def __init__(
Expand Down
3 changes: 1 addition & 2 deletions tests/test_slurm_command_gen_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,7 @@ def test_docker_image_url_is_not_file(self, nemo_cmd_gen: NeMoLauncherSlurmComma
nemo_cmd_gen.final_cmd_args["docker_image_url"] = f"{nemo_cmd_gen.install_path}/docker_image"
nemo_cmd_gen.set_container_arg()
assert (
nemo_cmd_gen.final_cmd_args["container"]
== f"{nemo_cmd_gen.install_path}/NeMo-Megatron-Launcher/nemo_megatron_launcher.sqsh"
nemo_cmd_gen.final_cmd_args["container"] == f"{nemo_cmd_gen.install_path}/NeMo-Launcher/nemo_launcher.sqsh"
)

def test_docker_image_url_is_file(self, nemo_cmd_gen: NeMoLauncherSlurmCommandGenStrategy):
Expand Down