diff --git a/pyproject.toml b/pyproject.toml
index a6442964c..fa58daf06 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "toml==0.10.2",
     "kubernetes==30.1.0",
     "pydantic==2.8.2",
+    "nemo_run @ git+https://github.com/NVIDIA/NeMo-Run.git@nightly",
 ]
 [project.scripts]
 cloudai = "cloudai.__main__:main"
diff --git a/requirements.txt b/requirements.txt
index ddaf06e25..a77dca6d3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ tbparse==0.0.8
 toml==0.10.2
 kubernetes==30.1.0
 pydantic==2.8.2
+nemo_run @ git+https://github.com/NVIDIA/NeMo-Run.git@nightly
diff --git a/src/cloudai/schema/test_template/nemo_run/__init__.py b/src/cloudai/schema/test_template/nemo_run/__init__.py
new file mode 100644
index 000000000..feb3c130c
--- /dev/null
+++ b/src/cloudai/schema/test_template/nemo_run/__init__.py
@@ -0,0 +1,25 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from .report_generation_strategy import NeMoRunReportGenerationStrategy +from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy +from .template import NeMoRun + +__all__ = [ + "NeMoRun", + "NeMoRunReportGenerationStrategy", + "NeMoRunSlurmCommandGenStrategy", +] diff --git a/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py b/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py new file mode 100644 index 000000000..a265562ef --- /dev/null +++ b/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from pathlib import Path
+
+from cloudai import ReportGenerationStrategy
+
+
+class NeMoRunReportGenerationStrategy(ReportGenerationStrategy):
+    """Strategy for generating reports from NeMo run directories."""
+
+    def can_handle_directory(self, directory_path: Path) -> bool:
+        return False  # unconditional: directory_path is never inspected, every directory is declined
diff --git a/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py
new file mode 100644
index 000000000..64d082dfe
--- /dev/null
+++ b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Dict, List
+
+from cloudai import TestRun
+from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy
+
+
+class NeMoRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
+    """Command generation strategy for NeMo 2.0 on Slurm systems."""
+
+    def generate_test_command(self, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun) -> List[str]:
+        """Build the fdl_runner argv from the optional ``n``, ``p`` and ``script`` entries of ``cmd_args``.
+
+        ``env_vars`` and ``tr`` are not read by this implementation.
+        """
+        argv = ["python", "-m", "nemo_run.core.runners.fdl_runner"]
+        # Flags are emitted in a fixed order (-n, then -p); the script, if any, is a trailing positional.
+        for key in ("n", "p"):
+            if key in cmd_args:
+                argv += [f"-{key}", cmd_args[key]]
+        if "script" in cmd_args:
+            argv.append(cmd_args["script"])
+        return argv
diff --git a/src/cloudai/schema/test_template/nemo_run/template.py b/src/cloudai/schema/test_template/nemo_run/template.py
new file mode 100644
index 000000000..0373b60cc
--- /dev/null
+++ b/src/cloudai/schema/test_template/nemo_run/template.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cloudai import TestTemplate
+
+
+class NeMoRun(TestTemplate):
+    """Test template for NeMo-Run; defines nothing beyond what TestTemplate provides."""
diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py
new file mode 100644
index 000000000..99303762d
--- /dev/null
+++ b/src/cloudai/test_definitions/nemo_run.py
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from cloudai import CmdArgs, TestDefinition
+from cloudai.installer.installables import DockerImage, Installable
+
+
+class NeMoRunCmdArgs(CmdArgs):
+    """NeMoRun test command arguments."""
+
+    docker_image_url: str = "nvcr.io/nvidia/nemo:24.07"  # default image used when the test supplies none
+
+
+class NeMoRunTestDefinition(TestDefinition):
+    """Test definition for NeMoRun; its only installable is the Docker image named in ``cmd_args``."""
+
+    _docker_image: Optional[DockerImage] = None  # cache for the `docker_image` property
+
+    @property
+    def docker_image(self) -> DockerImage:
+        if self._docker_image is None:  # built once from cmd_args.docker_image_url, then reused
+            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
+        return self._docker_image
+
+    @property
+    def installables(self) -> list[Installable]:
+        """Get list of installable objects."""
+        return [self.docker_image]
diff --git a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py
new file mode 100644
index 000000000..e463556de
--- /dev/null
+++ b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+from unittest.mock import Mock
+
+import pytest
+
+from cloudai._core.test import Test
+from cloudai._core.test_scenario import TestRun
+from cloudai.schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
+from cloudai.systems import SlurmSystem
+from cloudai.test_definitions.nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition
+
+
+class TestNeMoRunSlurmCommandGenStrategy:  # covers generate_test_command only
+    @pytest.fixture  # minimal two-node TestRun, handed to generate_test_command as `tr`
+    def test_run(self, tmp_path: Path) -> TestRun:
+        tdef = NeMoRunTestDefinition(
+            name="t1",
+            description="desc1",
+            test_template_name="tt",
+            cmd_args=NeMoRunCmdArgs(),
+            extra_env_vars={"TEST_VAR_1": "value1"},
+            extra_cmd_args={"extra_args": ""},
+        )
+
+        test = Test(test_definition=tdef, test_template=Mock())  # template is a stand-in Mock
+        tr = TestRun(
+            test=test,
+            num_nodes=2,
+            nodes=[],
+            output_path=tmp_path / "output",
+            name="test-job",
+        )
+
+        return tr
+
+    @pytest.fixture  # needs `slurm_system`, a fixture not defined in this file (presumably conftest)
+    def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NeMoRunSlurmCommandGenStrategy:
+        return NeMoRunSlurmCommandGenStrategy(slurm_system, {})
+
+    @pytest.mark.parametrize(
+        "cmd_args, expected_cmd",  # input mapping -> exact argv expected back
+        [
+            # No optional args: only the base runner invocation
+            ({}, ["python", "-m", "nemo_run.core.runners.fdl_runner"]),
+            # Only `n` provided: emitted as `-n <value>`
+            ({"n": "test_n"}, ["python", "-m", "nemo_run.core.runners.fdl_runner", "-n", "test_n"]),
+            # Only `p` provided: emitted as `-p <value>`
+            ({"p": "/path/to/config"}, ["python", "-m", "nemo_run.core.runners.fdl_runner", "-p", "/path/to/config"]),
+            # Only `script` provided: appended as a bare positional, no flag
+            ({"script": "/path/to/script"}, ["python", "-m", "nemo_run.core.runners.fdl_runner", "/path/to/script"]),
+            # `n` and `p` provided: `-n` precedes `-p`
+            (
+                {"n": "test_n", "p": "/path/to/config"},
+                ["python", "-m", "nemo_run.core.runners.fdl_runner", "-n", "test_n", "-p", "/path/to/config"],
+            ),
+            # `n` and `script` provided: flag first, script last
+            (
+                {"n": "test_n", "script": "/path/to/script"},
+                ["python", "-m", "nemo_run.core.runners.fdl_runner", "-n", "test_n", "/path/to/script"],
+            ),
+            # `p` and `script` provided: flag first, script last
+            (
+                {"p": "/path/to/config", "script": "/path/to/script"},
+                ["python", "-m", "nemo_run.core.runners.fdl_runner", "-p", "/path/to/config", "/path/to/script"],
+            ),
+            # All `n`, `p`, and `script` provided: `-n`, then `-p`, then the script
+            (
+                {"n": "test_n", "p": "/path/to/config", "script": "/path/to/script"},
+                [
+                    "python",
+                    "-m",
+                    "nemo_run.core.runners.fdl_runner",
+                    "-n",
+                    "test_n",
+                    "-p",
+                    "/path/to/config",
+                    "/path/to/script",
+                ],
+            ),
+        ],
+    )
+    def test_generate_test_command(
+        self,
+        cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy,
+        test_run: TestRun,
+        cmd_args: dict,
+        expected_cmd: list,
+    ) -> None:
+        cmd = cmd_gen_strategy.generate_test_command(test_run.test.test_definition.extra_env_vars, cmd_args, test_run)
+        assert cmd == expected_cmd, f"Expected command {expected_cmd}, but got {cmd}"