From e3ca13bc6bb5f94d9f2724fa84842ee433ce061e Mon Sep 17 00:00:00 2001 From: Taekyung Heo <7621438+TaekyungHeo@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:32:09 -0400 Subject: [PATCH] Add NeMo 2.0 (NeMo-run) --- conf/common/test/nemo_run_llama3_8b.toml | 29 +++++++ .../test_scenario/nemo_run_llama3_8b.toml | 23 ++++++ src/cloudai/__init__.py | 14 +++- .../schema/test_template/nemo_run/__init__.py | 27 +++++++ .../nemo_run/grading_strategy.py | 26 +++++++ .../nemo_run/report_generation_strategy.py | 26 +++++++ .../nemo_run/slurm_command_gen_strategy.py | 57 ++++++++++++++ .../schema/test_template/nemo_run/template.py | 21 +++++ src/cloudai/test_definitions/__init__.py | 3 + src/cloudai/test_definitions/nemo_run.py | 46 +++++++++++ ...est_nemo_run_slurm_command_gen_strategy.py | 78 +++++++++++++++++++ tests/test_init.py | 15 +++- 12 files changed, 361 insertions(+), 4 deletions(-) create mode 100644 conf/common/test/nemo_run_llama3_8b.toml create mode 100644 conf/common/test_scenario/nemo_run_llama3_8b.toml create mode 100644 src/cloudai/schema/test_template/nemo_run/__init__.py create mode 100644 src/cloudai/schema/test_template/nemo_run/grading_strategy.py create mode 100644 src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py create mode 100644 src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py create mode 100644 src/cloudai/schema/test_template/nemo_run/template.py create mode 100644 src/cloudai/test_definitions/nemo_run.py create mode 100644 tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py diff --git a/conf/common/test/nemo_run_llama3_8b.toml b/conf/common/test/nemo_run_llama3_8b.toml new file mode 100644 index 000000000..f629a6c4e --- /dev/null +++ b/conf/common/test/nemo_run_llama3_8b.toml @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name = "nemo_run_llama3_8b" +description = "nemo_run_llama3_8b" +test_template_name = "NeMoRun" + +[cmd_args] +"docker_image_url" = "nvcr.io/nvidia/nemo:24.09" +"task" = "pretrain" +"recipe_name" = "llama3_8b" + +[extra_cmd_args] +"trainer.max_steps" = "5" +"log.ckpt.save_on_train_epoch_end" = "False" +"log.ckpt.save_last" = "False" diff --git a/conf/common/test_scenario/nemo_run_llama3_8b.toml b/conf/common/test_scenario/nemo_run_llama3_8b.toml new file mode 100644 index 000000000..55c8ab836 --- /dev/null +++ b/conf/common/test_scenario/nemo_run_llama3_8b.toml @@ -0,0 +1,23 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +name = "nemo_run_llama3_8b" + +[[Tests]] +id = "nemo_run_llama3_8b" +test_name = "nemo_run_llama3_8b" +num_nodes = "1" +time_limit = "00:30:00" diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py index fd394f24b..2a4767666 100644 --- a/src/cloudai/__init__.py +++ b/src/cloudai/__init__.py @@ -75,6 +75,10 @@ NeMoLauncherSlurmJobIdRetrievalStrategy, ) from .schema.test_template.nemo_launcher.template import NeMoLauncher +from .schema.test_template.nemo_run.grading_strategy import NeMoRunGradingStrategy +from .schema.test_template.nemo_run.report_generation_strategy import NeMoRunReportGenerationStrategy +from .schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy +from .schema.test_template.nemo_run.template import NeMoRun from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy @@ -94,6 +98,7 @@ GrokTestDefinition, NCCLTestDefinition, NeMoLauncherTestDefinition, + NeMoRunTestDefinition, NemotronTestDefinition, SleepTestDefinition, UCCTestDefinition, @@ -115,20 +120,23 @@ ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy ) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherReportGenerationStrategy) +Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoRun], NeMoRunReportGenerationStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmCommandGenStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [Sleep], SleepGradingStrategy) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy) Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], 
NeMoLauncherSlurmJobIdRetrievalStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy) +Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoRun], NeMoRunSlurmCommandGenStrategy) Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy) +Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoRun], NeMoRunGradingStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy) Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy) Registry().add_strategy( JobIdRetrievalStrategy, [SlurmSystem], - [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep], + [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, NeMoRun], SlurmJobIdRetrievalStrategy, ) Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy) @@ -141,7 +149,7 @@ Registry().add_strategy( JobStatusRetrievalStrategy, [SlurmSystem], - [ChakraReplay, UCCTest, NeMoLauncher, Sleep], + [ChakraReplay, UCCTest, NeMoLauncher, NeMoRun, Sleep], DefaultJobStatusRetrievalStrategy, ) Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy) @@ -162,6 +170,7 @@ Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition) Registry().add_test_definition("Sleep", SleepTestDefinition) Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition) +Registry().add_test_definition("NeMoRun", NeMoRunTestDefinition) Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition) Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition) Registry().add_test_definition("JaxToolboxNemotron", 
NemotronTestDefinition) @@ -169,6 +178,7 @@ Registry().add_test_template("ChakraReplay", ChakraReplay) Registry().add_test_template("NcclTest", NcclTest) Registry().add_test_template("NeMoLauncher", NeMoLauncher) +Registry().add_test_template("NeMoRun", NeMoRun) Registry().add_test_template("Sleep", Sleep) Registry().add_test_template("UCCTest", UCCTest) Registry().add_test_template("JaxToolboxGPT", JaxToolbox) diff --git a/src/cloudai/schema/test_template/nemo_run/__init__.py b/src/cloudai/schema/test_template/nemo_run/__init__.py new file mode 100644 index 000000000..e907d0d4c --- /dev/null +++ b/src/cloudai/schema/test_template/nemo_run/__init__.py @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .grading_strategy import NeMoRunGradingStrategy +from .report_generation_strategy import NeMoRunReportGenerationStrategy +from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy +from .template import NeMoRun + +__all__ = [ + "NeMoRun", + "NeMoRunGradingStrategy", + "NeMoRunReportGenerationStrategy", + "NeMoRunSlurmCommandGenStrategy", +] diff --git a/src/cloudai/schema/test_template/nemo_run/grading_strategy.py b/src/cloudai/schema/test_template/nemo_run/grading_strategy.py new file mode 100644 index 000000000..39c0a5843 --- /dev/null +++ b/src/cloudai/schema/test_template/nemo_run/grading_strategy.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path + +from cloudai import GradingStrategy + + +class NeMoRunGradingStrategy(GradingStrategy): + """Performance grading strategy for NeMoRun test templates on Slurm systems.""" + + def grade(self, directory_path: Path, ideal_perf: float) -> float: + return 0.0 diff --git a/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py b/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py new file mode 100644 index 000000000..a265562ef --- /dev/null +++ b/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pathlib import Path + +from cloudai import ReportGenerationStrategy + + +class NeMoRunReportGenerationStrategy(ReportGenerationStrategy): + """Strategy for generating reports from NeMo run directories.""" + + def can_handle_directory(self, directory_path: Path) -> bool: + return False diff --git a/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py new file mode 100644 index 000000000..935b0e748 --- /dev/null +++ b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Any, Dict, List, cast + +from cloudai import TestRun +from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy +from cloudai.test_definitions.nemo_run import NeMoRunTestDefinition + + +class NeMoRunSlurmCommandGenStrategy(SlurmCommandGenStrategy): + """Command generation strategy for NeMo 2.0 on Slurm systems.""" + + def _parse_slurm_args( + self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun + ) -> Dict[str, Any]: + base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr) + + tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition) + base_args.update({"image_path": tdef.docker_image.installed_path}) + + return base_args + + def generate_test_command(self, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun) -> List[str]: + command = ["nemo", "llm"] + + task = cmd_args.get("task") or "pretrain"  # fall back when the key is missing or explicitly None + command.append(task) + + recipe_name = cmd_args.get("recipe_name") or "llama3_8b"  # same None-safe fallback as task + command.extend(["--factory", recipe_name]) + + command.append("-y") + + if tr.nodes: + command.append(f"trainer.num_nodes={len(tr.nodes)}") + elif tr.num_nodes > 0: + command.append(f"trainer.num_nodes={tr.num_nodes}") + + if tr.test.extra_cmd_args: + command.append(tr.test.extra_cmd_args) + + return command diff --git a/src/cloudai/schema/test_template/nemo_run/template.py b/src/cloudai/schema/test_template/nemo_run/template.py new file mode 100644 index 000000000..0373b60cc --- /dev/null +++ b/src/cloudai/schema/test_template/nemo_run/template.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from cloudai import TestTemplate + + +class NeMoRun(TestTemplate): + """Test template for NeMo 2.0 (NeMo-Run).""" diff --git a/src/cloudai/test_definitions/__init__.py b/src/cloudai/test_definitions/__init__.py index 3bb348fad..308f3600d 100644 --- a/src/cloudai/test_definitions/__init__.py +++ b/src/cloudai/test_definitions/__init__.py @@ -20,6 +20,7 @@ from .jax_toolbox import JaxToolboxCmdArgs, JaxToolboxTestDefinition from .nccl import NCCLCmdArgs, NCCLTestDefinition from .nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition +from .nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition from .nemotron import NemotronCmdArgs, NemotronTestDefinition from .sleep import SleepCmdArgs, SleepTestDefinition from .ucc import UCCCmdArgs, UCCTestDefinition @@ -31,6 +32,8 @@ "NCCLTestDefinition", "NeMoLauncherCmdArgs", "NeMoLauncherTestDefinition", + "NeMoRunCmdArgs", + "NeMoRunTestDefinition", "SleepCmdArgs", "SleepTestDefinition", "UCCCmdArgs", diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py new file mode 100644 index 000000000..b1d4cba37 --- /dev/null +++ b/src/cloudai/test_definitions/nemo_run.py @@ -0,0 +1,46 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional + +from cloudai import CmdArgs, TestDefinition +from cloudai.installer.installables import DockerImage, Installable + + +class NeMoRunCmdArgs(CmdArgs): + """NeMoRun test command arguments.""" + + docker_image_url: str = "nvcr.io/nvidia/nemo:24.09" + task: Optional[str] = None + recipe_name: Optional[str] = None + + +class NeMoRunTestDefinition(TestDefinition): + """Test object for NeMoRun.""" + + cmd_args: NeMoRunCmdArgs + _docker_image: Optional[DockerImage] = None + + @property + def docker_image(self) -> DockerImage: + if not self._docker_image: + self._docker_image = DockerImage(url=self.cmd_args.docker_image_url) + return self._docker_image + + @property + def installables(self) -> list[Installable]: + """Get list of installable objects.""" + return [self.docker_image] diff --git a/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py new file mode 100644 index 000000000..8fd52d05e --- /dev/null +++ b/tests/slurm_command_gen_strategy/test_nemo_run_slurm_command_gen_strategy.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES +# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from pathlib import Path +from unittest.mock import Mock + +import pytest + +from cloudai._core.test import Test +from cloudai._core.test_scenario import TestRun +from cloudai.schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy +from cloudai.systems import SlurmSystem +from cloudai.test_definitions.nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition + + +class TestNeMoRunSlurmCommandGenStrategy: + @pytest.fixture + def test_run(self, tmp_path: Path) -> TestRun: + tdef = NeMoRunTestDefinition( + name="t1", + description="desc1", + test_template_name="tt", + cmd_args=NeMoRunCmdArgs(), + extra_env_vars={"TEST_VAR_1": "value1"}, + extra_cmd_args={"extra_args": ""}, + ) + + test = Test(test_definition=tdef, test_template=Mock()) + tr = TestRun( + test=test, + num_nodes=2, + nodes=[], + output_path=tmp_path / "output", + name="test-job", + ) + + return tr + + @pytest.fixture + def cmd_gen_strategy(self, slurm_system: SlurmSystem) -> NeMoRunSlurmCommandGenStrategy: + return NeMoRunSlurmCommandGenStrategy(slurm_system, {}) + + @pytest.mark.parametrize( + "cmd_args, expected_cmd", + [ + ({}, ["nemo", "llm", "pretrain", "--factory", "llama3_8b", "-y", "trainer.num_nodes=2", "extra_args"]), + ( + {"recipe_name": "llama7_13b"}, + ["nemo", "llm", "pretrain", "--factory", "llama7_13b", "-y", "trainer.num_nodes=2", "extra_args"], + ), + ( + {"task": "fine_tune", "recipe_name": "llama7_13b"}, + ["nemo", "llm", "fine_tune", "--factory", "llama7_13b", "-y", "trainer.num_nodes=2", "extra_args"], + ), + ], + ) + def 
test_generate_test_command( + self, + cmd_gen_strategy: NeMoRunSlurmCommandGenStrategy, + test_run: TestRun, + cmd_args: dict, + expected_cmd: list, + ) -> None: + cmd = cmd_gen_strategy.generate_test_command(test_run.test.test_definition.extra_env_vars, cmd_args, test_run) + assert cmd == expected_cmd, f"Expected command {expected_cmd}, but got {cmd}" diff --git a/tests/test_init.py b/tests/test_init.py index 410e154bb..f408ccfeb 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -48,6 +48,10 @@ NeMoLauncherSlurmJobIdRetrievalStrategy, ) from cloudai.schema.test_template.nemo_launcher.template import NeMoLauncher +from cloudai.schema.test_template.nemo_run.grading_strategy import NeMoRunGradingStrategy +from cloudai.schema.test_template.nemo_run.report_generation_strategy import NeMoRunReportGenerationStrategy +from cloudai.schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy +from cloudai.schema.test_template.nemo_run.template import NeMoRun from cloudai.schema.test_template.sleep.grading_strategy import SleepGradingStrategy from cloudai.schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy from cloudai.schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy @@ -65,6 +69,7 @@ ChakraReplayTestDefinition, NCCLTestDefinition, NeMoLauncherTestDefinition, + NeMoRunTestDefinition, SleepTestDefinition, UCCTestDefinition, ) @@ -93,6 +98,7 @@ def test_runners(): ((CommandGenStrategy, SlurmSystem, JaxToolbox), JaxToolboxSlurmCommandGenStrategy), ((CommandGenStrategy, SlurmSystem, NcclTest), NcclTestSlurmCommandGenStrategy), ((CommandGenStrategy, SlurmSystem, NeMoLauncher), NeMoLauncherSlurmCommandGenStrategy), + ((CommandGenStrategy, SlurmSystem, NeMoRun), NeMoRunSlurmCommandGenStrategy), ((CommandGenStrategy, SlurmSystem, Sleep), SleepSlurmCommandGenStrategy), ((CommandGenStrategy, SlurmSystem, UCCTest), UCCTestSlurmCommandGenStrategy), 
((CommandGenStrategy, StandaloneSystem, Sleep), SleepStandaloneCommandGenStrategy), @@ -100,12 +106,14 @@ def test_runners(): ((GradingStrategy, SlurmSystem, JaxToolbox), JaxToolboxGradingStrategy), ((GradingStrategy, SlurmSystem, NcclTest), NcclTestGradingStrategy), ((GradingStrategy, SlurmSystem, NeMoLauncher), NeMoLauncherGradingStrategy), + ((GradingStrategy, SlurmSystem, NeMoRun), NeMoRunGradingStrategy), ((GradingStrategy, SlurmSystem, Sleep), SleepGradingStrategy), ((GradingStrategy, SlurmSystem, UCCTest), UCCTestGradingStrategy), ((JobIdRetrievalStrategy, SlurmSystem, ChakraReplay), SlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, SlurmSystem, JaxToolbox), SlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, SlurmSystem, NcclTest), SlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, SlurmSystem, NeMoLauncher), NeMoLauncherSlurmJobIdRetrievalStrategy), + ((JobIdRetrievalStrategy, SlurmSystem, NeMoRun), SlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, SlurmSystem, UCCTest), SlurmJobIdRetrievalStrategy), ((JobIdRetrievalStrategy, StandaloneSystem, Sleep), StandaloneJobIdRetrievalStrategy), ((JsonGenStrategy, KubernetesSystem, NcclTest), NcclTestKubernetesJsonGenStrategy), @@ -115,6 +123,7 @@ def test_runners(): ((ReportGenerationStrategy, SlurmSystem, NcclTest), NcclTestReportGenerationStrategy), ((ReportGenerationStrategy, KubernetesSystem, NcclTest), NcclTestReportGenerationStrategy), ((ReportGenerationStrategy, SlurmSystem, NeMoLauncher), NeMoLauncherReportGenerationStrategy), + ((ReportGenerationStrategy, SlurmSystem, NeMoRun), NeMoRunReportGenerationStrategy), ((ReportGenerationStrategy, SlurmSystem, Sleep), SleepReportGenerationStrategy), ((ReportGenerationStrategy, SlurmSystem, UCCTest), UCCTestReportGenerationStrategy), ((ReportGenerationStrategy, StandaloneSystem, Sleep), SleepReportGenerationStrategy), @@ -127,10 +136,11 @@ def test_strategies(key: tuple, value: type): def test_test_templates(): test_templates = 
Registry().test_templates_map - assert len(test_templates) == 8 + assert len(test_templates) == 9 assert test_templates["ChakraReplay"] == ChakraReplay assert test_templates["NcclTest"] == NcclTest assert test_templates["NeMoLauncher"] == NeMoLauncher + assert test_templates["NeMoRun"] == NeMoRun assert test_templates["Sleep"] == Sleep assert test_templates["UCCTest"] == UCCTest @@ -144,12 +154,13 @@ def test_installers(): def test_definitions(): test_defs = Registry().test_definitions_map - assert len(test_defs) == 8 + assert len(test_defs) == 9 assert test_defs["UCCTest"] == UCCTestDefinition assert test_defs["NcclTest"] == NCCLTestDefinition assert test_defs["ChakraReplay"] == ChakraReplayTestDefinition assert test_defs["Sleep"] == SleepTestDefinition assert test_defs["NeMoLauncher"] == NeMoLauncherTestDefinition + assert test_defs["NeMoRun"] == NeMoRunTestDefinition def test_definitions_matches_templates():