NVIDIA · TaekyungHeo · Nov 20, 2024 · Oct 29, 2024 · Nov 18, 2024 · Nov 18, 2024
diff --git a/conf/common/test/nemo_run_llama3_8b.toml b/conf/common/test/nemo_run_llama3_8b.toml
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nemo_run_llama3_8b"
+description = "nemo_run_llama3_8b"
+test_template_name = "NeMoRun"  # name under which the NeMoRun template and test definition are registered
+# Keys below mirror the NeMoRunCmdArgs fields (docker_image_url, task, recipe_name).
+[cmd_args]
+"docker_image_url" = "nvcr.io/nvidia/nemo:24.09"  # same value as the NeMoRunCmdArgs default
+"task" = "pretrain"  # same value the Slurm command generator falls back to
+"recipe_name" = "llama3_8b"  # same value the Slurm command generator falls back to
+# NOTE(review): presumably forwarded as key=value overrides after the nemo command - confirm.
+[extra_cmd_args]
+"trainer.max_steps" = "5"
+"trainer.val_check_interval" = "1000"
+"log.ckpt.save_on_train_epoch_end" = "False"
+"log.ckpt.save_last" = "False"
diff --git a/conf/common/test_scenario/nemo_run_llama3_8b.toml b/conf/common/test_scenario/nemo_run_llama3_8b.toml
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name = "nemo_run_llama3_8b"
+
+[[Tests]]
+id = "nemo_run_llama3_8b_1"
+test_name = "nemo_run_llama3_8b"  # matches the `name` of the nemo_run_llama3_8b test config
+num_nodes = "1"
+time_limit = "00:30:00"
+# Second entry: the same test on two nodes, declared with a dependency on the single-node entry.
+[[Tests]]
+id = "nemo_run_llama3_8b_2"
+test_name = "nemo_run_llama3_8b"
+num_nodes = "2"
+time_limit = "00:30:00"
+  [[Tests.dependencies]]
+  type = "start_post_comp"  # NOTE(review): presumably "start after the referenced test completes" - confirm
+  id = "nemo_run_llama3_8b_1"
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -75,6 +75,10 @@
     NeMoLauncherSlurmJobIdRetrievalStrategy,
 )
 from .schema.test_template.nemo_launcher.template import NeMoLauncher
+from .schema.test_template.nemo_run.grading_strategy import NeMoRunGradingStrategy
+from .schema.test_template.nemo_run.report_generation_strategy import NeMoRunReportGenerationStrategy
+from .schema.test_template.nemo_run.slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
+from .schema.test_template.nemo_run.template import NeMoRun
 from .schema.test_template.sleep.grading_strategy import SleepGradingStrategy
 from .schema.test_template.sleep.kubernetes_json_gen_strategy import SleepKubernetesJsonGenStrategy
 from .schema.test_template.sleep.report_generation_strategy import SleepReportGenerationStrategy
@@ -94,6 +98,7 @@
     GrokTestDefinition,
     NCCLTestDefinition,
     NeMoLauncherTestDefinition,
+    NeMoRunTestDefinition,
     NemotronTestDefinition,
     SleepTestDefinition,
     UCCTestDefinition,
@@ -115,20 +120,23 @@
     ReportGenerationStrategy, [StandaloneSystem, SlurmSystem], [Sleep], SleepReportGenerationStrategy
 )
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherReportGenerationStrategy)
+Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [NeMoRun], NeMoRunReportGenerationStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NcclTest], NcclTestSlurmCommandGenStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [Sleep], SleepGradingStrategy)
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxReportGenerationStrategy)
 Registry().add_strategy(JobIdRetrievalStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmJobIdRetrievalStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherSlurmCommandGenStrategy)
+Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [NeMoRun], NeMoRunSlurmCommandGenStrategy)
 Registry().add_strategy(ReportGenerationStrategy, [SlurmSystem], [UCCTest], UCCTestReportGenerationStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoLauncher], NeMoLauncherGradingStrategy)
+Registry().add_strategy(GradingStrategy, [SlurmSystem], [NeMoRun], NeMoRunGradingStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxGradingStrategy)
 Registry().add_strategy(GradingStrategy, [SlurmSystem], [UCCTest], UCCTestGradingStrategy)
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [JaxToolbox], JaxToolboxSlurmCommandGenStrategy)
 Registry().add_strategy(
     JobIdRetrievalStrategy,
     [SlurmSystem],
-    [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep],
+    [ChakraReplay, JaxToolbox, NcclTest, UCCTest, Sleep, NeMoRun],
     SlurmJobIdRetrievalStrategy,
 )
 Registry().add_strategy(JobIdRetrievalStrategy, [StandaloneSystem], [Sleep], StandaloneJobIdRetrievalStrategy)
@@ -141,7 +149,7 @@
 Registry().add_strategy(
     JobStatusRetrievalStrategy,
     [SlurmSystem],
-    [ChakraReplay, UCCTest, NeMoLauncher, Sleep],
+    [ChakraReplay, UCCTest, NeMoLauncher, NeMoRun, Sleep],
     DefaultJobStatusRetrievalStrategy,
 )
 Registry().add_strategy(CommandGenStrategy, [SlurmSystem], [UCCTest], UCCTestSlurmCommandGenStrategy)
@@ -162,13 +170,15 @@
 Registry().add_test_definition("ChakraReplay", ChakraReplayTestDefinition)
 Registry().add_test_definition("Sleep", SleepTestDefinition)
 Registry().add_test_definition("NeMoLauncher", NeMoLauncherTestDefinition)
+Registry().add_test_definition("NeMoRun", NeMoRunTestDefinition)
 Registry().add_test_definition("JaxToolboxGPT", GPTTestDefinition)
 Registry().add_test_definition("JaxToolboxGrok", GrokTestDefinition)
 Registry().add_test_definition("JaxToolboxNemotron", NemotronTestDefinition)
 
 Registry().add_test_template("ChakraReplay", ChakraReplay)
 Registry().add_test_template("NcclTest", NcclTest)
 Registry().add_test_template("NeMoLauncher", NeMoLauncher)
+Registry().add_test_template("NeMoRun", NeMoRun)
 Registry().add_test_template("Sleep", Sleep)
 Registry().add_test_template("UCCTest", UCCTest)
 Registry().add_test_template("JaxToolboxGPT", JaxToolbox)

diff --git a/src/cloudai/schema/test_template/nemo_run/__init__.py b/src/cloudai/schema/test_template/nemo_run/__init__.py
@@ -0,0 +1,27 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .grading_strategy import NeMoRunGradingStrategy
+from .report_generation_strategy import NeMoRunReportGenerationStrategy
+from .slurm_command_gen_strategy import NeMoRunSlurmCommandGenStrategy
+from .template import NeMoRun
+
+__all__ = [
+    "NeMoRun",
+    "NeMoRunGradingStrategy",
+    "NeMoRunReportGenerationStrategy",
+    "NeMoRunSlurmCommandGenStrategy",
+]
diff --git a/src/cloudai/schema/test_template/nemo_run/grading_strategy.py b/src/cloudai/schema/test_template/nemo_run/grading_strategy.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from cloudai import GradingStrategy
+
+
+class NeMoRunGradingStrategy(GradingStrategy):
+    """Performance grading strategy for NeMoRun test templates on Slurm systems."""
+
+    def grade(self, directory_path: Path, ideal_perf: float) -> float:
+        return 0.0  # constant grade: neither directory_path nor ideal_perf is consulted
diff --git a/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py b/src/cloudai/schema/test_template/nemo_run/report_generation_strategy.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from cloudai import ReportGenerationStrategy
+
+
+class NeMoRunReportGenerationStrategy(ReportGenerationStrategy):
+    """Report generation strategy for NeMoRun test directories; it currently declines every directory."""
+
+    def can_handle_directory(self, directory_path: Path) -> bool:
+        return False  # unconditional: directory_path is never inspected
diff --git a/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py b/src/cloudai/schema/test_template/nemo_run/slurm_command_gen_strategy.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Any, Dict, List, cast
+
+from cloudai import TestRun
+from cloudai.systems.slurm.strategy import SlurmCommandGenStrategy
+from cloudai.test_definitions.nemo_run import NeMoRunTestDefinition
+
+
+class NeMoRunSlurmCommandGenStrategy(SlurmCommandGenStrategy):
+    """Command generation strategy for NeMoRun (NeMo 2.0, ``nemo llm`` CLI) tests on Slurm systems."""
+
+    def _parse_slurm_args(
+        self, job_name_prefix: str, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun
+    ) -> Dict[str, Any]:
+        base_args = super()._parse_slurm_args(job_name_prefix, env_vars, cmd_args, tr)
+        # Extend the base Slurm arguments with the installed path of the test's Docker image.
+        tdef: NeMoRunTestDefinition = cast(NeMoRunTestDefinition, tr.test.test_definition)
+        base_args.update({"image_path": tdef.docker_image.installed_path})
+
+        return base_args
+
+    def generate_test_command(self, env_vars: Dict[str, str], cmd_args: Dict[str, str], tr: TestRun) -> List[str]:
+        command = ["nemo", "llm"]
+        # Sub-command: the "task" entry of cmd_args, or "pretrain" when the key is missing.
+        task = cmd_args.get("task", "pretrain")
+        command.append(task)
+        # Recipe handed to --factory: "recipe_name" from cmd_args, or "llama3_8b" when the key is missing.
+        recipe_name = cmd_args.get("recipe_name", "llama3_8b")
+        command.extend(["--factory", recipe_name])
+        # NOTE(review): "-y" presumably skips the CLI's confirmation prompt - confirm against the nemo docs.
+        command.append("-y")
+        # A non-empty tr.nodes list wins over tr.num_nodes; with neither set, no trainer.num_nodes is emitted.
+        if tr.nodes:
+            command.append(f"trainer.num_nodes={len(tr.nodes)}")
+        elif tr.num_nodes > 0:
+            command.append(f"trainer.num_nodes={tr.num_nodes}")
+        # Extra arguments go in as a single list element, and only when the value is truthy.
+        if tr.test.extra_cmd_args:
+            command.append(tr.test.extra_cmd_args)
+
+        return command
diff --git a/src/cloudai/schema/test_template/nemo_run/template.py b/src/cloudai/schema/test_template/nemo_run/template.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from cloudai import TestTemplate
+
+
+class NeMoRun(TestTemplate):
+    """Test template for NeMoRun tests; it adds nothing on top of the base TestTemplate."""
diff --git a/src/cloudai/test_definitions/__init__.py b/src/cloudai/test_definitions/__init__.py
@@ -20,6 +20,7 @@
 from .jax_toolbox import JaxToolboxCmdArgs, JaxToolboxTestDefinition
 from .nccl import NCCLCmdArgs, NCCLTestDefinition
 from .nemo_launcher import NeMoLauncherCmdArgs, NeMoLauncherTestDefinition
+from .nemo_run import NeMoRunCmdArgs, NeMoRunTestDefinition
 from .nemotron import NemotronCmdArgs, NemotronTestDefinition
 from .sleep import SleepCmdArgs, SleepTestDefinition
 from .ucc import UCCCmdArgs, UCCTestDefinition
@@ -31,6 +32,8 @@
     "NCCLTestDefinition",
     "NeMoLauncherCmdArgs",
     "NeMoLauncherTestDefinition",
+    "NeMoRunCmdArgs",
+    "NeMoRunTestDefinition",
     "SleepCmdArgs",
     "SleepTestDefinition",
     "UCCCmdArgs",

diff --git a/src/cloudai/test_definitions/nemo_run.py b/src/cloudai/test_definitions/nemo_run.py
@@ -0,0 +1,46 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from cloudai import CmdArgs, TestDefinition
+from cloudai.installer.installables import DockerImage, Installable
+
+
+class NeMoRunCmdArgs(CmdArgs):
+    """Command arguments accepted by a NeMoRun test."""
+
+    docker_image_url: str = "nvcr.io/nvidia/nemo:24.09"  # wrapped in a DockerImage by NeMoRunTestDefinition
+    task: Optional[str] = None  # presumably the "task" key read by the Slurm command generator - confirm
+    recipe_name: Optional[str] = None  # presumably the "recipe_name" key read by the generator - confirm
+
+
+class NeMoRunTestDefinition(TestDefinition):
+    """Test definition for NeMoRun."""
+
+    cmd_args: NeMoRunCmdArgs
+    _docker_image: Optional[DockerImage] = None  # cache filled on the first docker_image access
+
+    @property
+    def docker_image(self) -> DockerImage:
+        if not self._docker_image:  # first access: build the image object once, then reuse it
+            self._docker_image = DockerImage(url=self.cmd_args.docker_image_url)
+        return self._docker_image
+
+    @property
+    def installables(self) -> list[Installable]:
+        """Return the installables of this test: a one-element list holding the Docker image."""
+        return [self.docker_image]