Skip to content

Commit

Permalink
Add NeMo 2.0 (NeMo-run)
Browse files Browse the repository at this point in the history
  • Loading branch information
TaekyungHeo committed Oct 30, 2024
1 parent f2a82a5 commit 9349a3f
Show file tree
Hide file tree
Showing 8 changed files with 262 additions and 0 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ dependencies = [
"toml==0.10.2",
"kubernetes==30.1.0",
"pydantic==2.8.2",
"git+https://github.com/NVIDIA/NeMo-Run/@nightly",
]
[project.scripts]
cloudai = "cloudai.__main__:main"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ tbparse==0.0.8
toml==0.10.2
kubernetes==30.1.0
pydantic==2.8.2
git+https://github.com/NVIDIA/NeMo-Run/@nightly
25 changes: 25 additions & 0 deletions src/cloudai/schema/test_template/nemo_run/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .report_generation_strategy import NeMoRunReportGenerationStrategy
from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
from .template import NeMoRun

# Public names of this package: the three classes imported above.
__all__ = [
"NeMoRun",
"NeMoRunReportGenerationStrategy",
"NeMoRunSlurmCommandGenStrategy",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

from cloudai import ReportGenerationStrategy


class NeMoRunReportGenerationStrategy(ReportGenerationStrategy):
    """Report generation strategy for NeMo-run output directories."""

    def can_handle_directory(self, directory_path: Path) -> bool:
        """
        Report whether this strategy can process ``directory_path``.

        The path is never inspected: the answer is always ``False``, so this
        strategy does not claim any directory.
        """
        return False
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Dict, List

from cloudai import TestRun
from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy


class NeMoRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
    """Build the NeMo 2.0 (NeMo-run) launch command for Slurm systems."""

    def generate_test_command(self, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun) -> List[str]:
        """
        Assemble the ``fdl_runner`` invocation from the given command arguments.

        Args:
            env_vars: Not consulted when building the command.
            cmd_args: Optional keys ``n`` and ``p`` are forwarded as the ``-n`` and ``-p``
                options, in that order; optional key ``script`` is appended last as a
                positional argument. Absent keys contribute nothing.
            tr: Not consulted when building the command.

        Returns:
            The command as a list of tokens.
        """
        tokens = ["python", "-m", "nemo_run.core.runners.fdl_runner"]

        # Options are emitted in this fixed order, regardless of dict ordering.
        for key, flag in (("n", "-n"), ("p", "-p")):
            if key in cmd_args:
                tokens += [flag, cmd_args[key]]

        # The script path, when present, always comes after the options.
        if "script" in cmd_args:
            tokens.append(cmd_args["script"])

        return tokens
21 changes: 21 additions & 0 deletions src/cloudai/schema/test_template/nemo_run/template.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from cloudai import TestTemplate


class NeMoRun(TestTemplate):
    """Test template for NeMo-run; declares no members of its own beyond those of TestTemplate."""
43 changes: 43 additions & 0 deletions src/cloudai/test_definitions/nemo_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

from cloudai import CmdArgs, TestDefinition
from cloudai.installer.installables import DockerImage, Installable


class NeMoRunCmdArgs(CmdArgs):
    """Command arguments for the NeMoRun test."""

    # URL of the Docker image to use; defaults to the NeMo 24.07 image on nvcr.io.
    docker_image_url: str = "nvcr.io/nvidia/nemo:24.07"


class NeMoRunTestDefinition(TestDefinition):
    """Test definition for NeMoRun."""

    # Cache for the ``docker_image`` property; stays None until first access.
    _docker_image: Optional[DockerImage] = None

    @property
    def docker_image(self) -> DockerImage:
        """Return the Docker image for ``cmd_args.docker_image_url``, creating it on first access."""
        # Identity check: only "not created yet" should trigger construction,
        # independent of how DockerImage defines truthiness.
        if self._docker_image is None:
            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
        return self._docker_image

    @property
    def installables(self) -> list[Installable]:
        """Get list of installable objects."""
        return [self.docker_image]
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from unittest.mock import Mock

import pytest

from cloudai._core.test import Test
from cloudai._core.test_scenario import TestRun
from cloudai.schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
from cloudai.systems import SlurmSystem
from cloudai.test_definitions.nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition


class TestNeMoRunSlurmCommandGenStrategy:
    """Checks the command built by NeMoRunSlurmCommandGenStrategy.generate_test_command."""

    # Prefix shared by every expected command in the parametrized cases below.
    _RUNNER = ["python", "-m", "nemo_run.core.runners.fdl_runner"]

    @pytest.fixture
    def test_run(self, tmp_path: Path) -> TestRun:
        """Provide a two-node TestRun wrapping a NeMoRun test definition with default command arguments."""
        definition = NeMoRunTestDefinition(
            name="t1",
            description="desc1",
            test_template_name="tt",
            cmd_args=NeMoRunCmdArgs(),
            extra_env_vars={"TEST_VAR_1": "value1"},
            extra_cmd_args={"extra_args": ""},
        )
        return TestRun(
            test=Test(test_definition=definition, test_template=Mock()),
            num_nodes=2,
            nodes=[],
            output_path=tmp_path / "output",
            name="test-job",
        )

    @pytest.fixture
    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NeMoRunSlurmCommandGenStrategy:
        """Provide the strategy under test, built on the ``slurm_system`` fixture with an empty dict."""
        return NeMoRunSlurmCommandGenStrategy(slurm_system, {})

    @pytest.mark.parametrize(
        "cmd_args, expected_cmd",
        [
            # No optional args
            ({}, [*_RUNNER]),
            # Only `n` provided
            ({"n": "test_n"}, [*_RUNNER, "-n", "test_n"]),
            # Only `p` provided
            ({"p": "/path/to/config"}, [*_RUNNER, "-p", "/path/to/config"]),
            # Only `script` provided
            ({"script": "/path/to/script"}, [*_RUNNER, "/path/to/script"]),
            # `n` and `p` provided
            (
                {"n": "test_n", "p": "/path/to/config"},
                [*_RUNNER, "-n", "test_n", "-p", "/path/to/config"],
            ),
            # `n` and `script` provided
            (
                {"n": "test_n", "script": "/path/to/script"},
                [*_RUNNER, "-n", "test_n", "/path/to/script"],
            ),
            # `p` and `script` provided
            (
                {"p": "/path/to/config", "script": "/path/to/script"},
                [*_RUNNER, "-p", "/path/to/config", "/path/to/script"],
            ),
            # All `n`, `p`, and `script` provided
            (
                {"n": "test_n", "p": "/path/to/config", "script": "/path/to/script"},
                [*_RUNNER, "-n", "test_n", "-p", "/path/to/config", "/path/to/script"],
            ),
        ],
    )
    def test_generate_test_command(
        self,
        cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy,
        test_run: TestRun,
        cmd_args: dict,
        expected_cmd: list,
    ) -> None:
        """Each combination of `n`, `p` and `script` must yield exactly the expected token list."""
        env_vars = test_run.test.test_definition.extra_env_vars
        cmd = cmd_gen_strategy.generate_test_command(env_vars, cmd_args, test_run)
        assert cmd == expected_cmd, f"Expected command {expected_cmd}, but got {cmd}"

0 comments on commit 9349a3f

Please sign in to comment.